|
import warnings |
|
from collections import OrderedDict |
|
from functools import partial |
|
from typing import Any, Callable, Dict, List, Optional, Union |
|
|
|
import torch |
|
from torch import nn, Tensor |
|
|
|
from ...ops.misc import Conv2dNormActivation |
|
from ...transforms._presets import ObjectDetection |
|
from ...utils import _log_api_usage_once |
|
from .. import mobilenet |
|
from .._api import register_model, Weights, WeightsEnum |
|
from .._meta import _COCO_CATEGORIES |
|
from .._utils import _ovewrite_value_param, handle_legacy_interface |
|
from ..mobilenetv3 import mobilenet_v3_large, MobileNet_V3_Large_Weights |
|
from . import _utils as det_utils |
|
from .anchor_utils import DefaultBoxGenerator |
|
from .backbone_utils import _validate_trainable_layers |
|
from .ssd import SSD, SSDScoringHead |
|
|
|
|
|
__all__ = [ |
|
"SSDLite320_MobileNet_V3_Large_Weights", |
|
"ssdlite320_mobilenet_v3_large", |
|
] |
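# SSDLite, introduced in the MobileNetV2 paper, replaces the regular 3x3 convolutions
# of the SSD heads and extra blocks with depthwise separable convolutions. The helpers
# below build those depthwise separable blocks.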
|
|
|
|
|
|
|
def _prediction_block( |
|
in_channels: int, out_channels: int, kernel_size: int, norm_layer: Callable[..., nn.Module] |
|
) -> nn.Sequential: |
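    # A depthwise convolution (groups=in_channels) followed by a 1x1 pointwise
    # projection: the depthwise separable prediction block used by the SSDLite heads.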
|
return nn.Sequential( |
|
|
|
Conv2dNormActivation( |
|
in_channels, |
|
in_channels, |
|
kernel_size=kernel_size, |
|
groups=in_channels, |
|
norm_layer=norm_layer, |
|
activation_layer=nn.ReLU6, |
|
), |
|
|
|
nn.Conv2d(in_channels, out_channels, 1), |
|
) |
|
|
|
|
|
def _extra_block(in_channels: int, out_channels: int, norm_layer: Callable[..., nn.Module]) -> nn.Sequential: |
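    # A 1x1 projection to half the output channels, a stride-2 depthwise 3x3
    # convolution for downsampling, and a 1x1 expansion back to out_channels.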
|
activation = nn.ReLU6 |
|
intermediate_channels = out_channels // 2 |
|
return nn.Sequential( |
|
|
|
Conv2dNormActivation( |
|
in_channels, intermediate_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=activation |
|
), |
|
|
|
Conv2dNormActivation( |
|
intermediate_channels, |
|
intermediate_channels, |
|
kernel_size=3, |
|
stride=2, |
|
groups=intermediate_channels, |
|
norm_layer=norm_layer, |
|
activation_layer=activation, |
|
), |
|
|
|
Conv2dNormActivation( |
|
intermediate_channels, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=activation |
|
), |
|
) |
|
|
|
|
|
def _normal_init(conv: nn.Module): |
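    # Initialize every conv weight from N(0.0, 0.03) and zero its bias.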
|
for layer in conv.modules(): |
|
if isinstance(layer, nn.Conv2d): |
|
torch.nn.init.normal_(layer.weight, mean=0.0, std=0.03) |
|
if layer.bias is not None: |
|
torch.nn.init.constant_(layer.bias, 0.0) |
|
|
|
|
|
class SSDLiteHead(nn.Module): |
|
def __init__( |
|
self, in_channels: List[int], num_anchors: List[int], num_classes: int, norm_layer: Callable[..., nn.Module] |
|
): |
|
super().__init__() |
|
self.classification_head = SSDLiteClassificationHead(in_channels, num_anchors, num_classes, norm_layer) |
|
self.regression_head = SSDLiteRegressionHead(in_channels, num_anchors, norm_layer) |
|
|
|
def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: |
|
return { |
|
"bbox_regression": self.regression_head(x), |
|
"cls_logits": self.classification_head(x), |
|
} |
|
|
|
|
|
class SSDLiteClassificationHead(SSDScoringHead): |
|
def __init__( |
|
self, in_channels: List[int], num_anchors: List[int], num_classes: int, norm_layer: Callable[..., nn.Module] |
|
): |
|
cls_logits = nn.ModuleList() |
|
for channels, anchors in zip(in_channels, num_anchors): |
|
cls_logits.append(_prediction_block(channels, num_classes * anchors, 3, norm_layer)) |
|
_normal_init(cls_logits) |
|
super().__init__(cls_logits, num_classes) |
|
|
|
|
|
class SSDLiteRegressionHead(SSDScoringHead): |
|
def __init__(self, in_channels: List[int], num_anchors: List[int], norm_layer: Callable[..., nn.Module]): |
|
bbox_reg = nn.ModuleList() |
|
for channels, anchors in zip(in_channels, num_anchors): |
|
bbox_reg.append(_prediction_block(channels, 4 * anchors, 3, norm_layer)) |
|
_normal_init(bbox_reg) |
|
super().__init__(bbox_reg, 4) |
|
|
|
|
|
class SSDLiteFeatureExtractorMobileNet(nn.Module): |
|
def __init__( |
|
self, |
|
backbone: nn.Module, |
|
c4_pos: int, |
|
norm_layer: Callable[..., nn.Module], |
|
width_mult: float = 1.0, |
|
min_depth: int = 16, |
|
): |
|
super().__init__() |
|
_log_api_usage_once(self) |
|
|
|
if backbone[c4_pos].use_res_connect: |
|
raise ValueError("backbone[c4_pos].use_res_connect should be False") |
|
|
|
        self.features = nn.Sequential(
            # As described in section 6.3 of the MobileNetV3 paper
            nn.Sequential(*backbone[:c4_pos], backbone[c4_pos].block[0]),  # from start until C4 expansion layer
            nn.Sequential(backbone[c4_pos].block[1:], *backbone[c4_pos + 1 :]),  # from C4 depthwise until end
        )
|
|
|
        def get_depth(d: int) -> int:
            return max(min_depth, int(d * width_mult))
|
extra = nn.ModuleList( |
|
[ |
|
_extra_block(backbone[-1].out_channels, get_depth(512), norm_layer), |
|
_extra_block(get_depth(512), get_depth(256), norm_layer), |
|
_extra_block(get_depth(256), get_depth(256), norm_layer), |
|
_extra_block(get_depth(256), get_depth(128), norm_layer), |
|
] |
|
) |
|
_normal_init(extra) |
|
|
|
self.extra = extra |
|
|
|
def forward(self, x: Tensor) -> Dict[str, Tensor]: |
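        # Collect one feature map per backbone stage and per extra block; the six maps
        # (2 from the backbone split + 4 extras) feed the SSD head.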
|
|
|
output = [] |
|
for block in self.features: |
|
x = block(x) |
|
output.append(x) |
|
|
|
for block in self.extra: |
|
x = block(x) |
|
output.append(x) |
|
|
|
return OrderedDict([(str(i), v) for i, v in enumerate(output)]) |
|
|
|
|
|
def _mobilenet_extractor( |
|
backbone: Union[mobilenet.MobileNetV2, mobilenet.MobileNetV3], |
|
trainable_layers: int, |
|
norm_layer: Callable[..., nn.Module], |
|
): |
|
backbone = backbone.features |
|
|
|
|
|
    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
|
num_stages = len(stage_indices) |
|
|
|
|
|
if not 0 <= trainable_layers <= num_stages: |
|
raise ValueError("trainable_layers should be in the range [0, {num_stages}], instead got {trainable_layers}") |
|
freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers] |
|
|
|
    # Freeze the layers of all stages before the trainable ones.
    for b in backbone[:freeze_before]:
|
for parameter in b.parameters(): |
|
parameter.requires_grad_(False) |
|
|
|
return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer) |
|
|
|
|
|
class SSDLite320_MobileNet_V3_Large_Weights(WeightsEnum): |
|
COCO_V1 = Weights( |
|
url="https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth", |
|
transforms=ObjectDetection, |
|
meta={ |
|
"num_params": 3440060, |
|
"categories": _COCO_CATEGORIES, |
|
"min_size": (1, 1), |
|
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssdlite320-mobilenetv3-large", |
|
"_metrics": { |
|
"COCO-val2017": { |
|
"box_map": 21.3, |
|
} |
|
}, |
|
"_ops": 0.583, |
|
"_file_size": 13.418, |
|
"_docs": """These weights were produced by following a similar training recipe as on the paper.""", |
|
}, |
|
) |
|
DEFAULT = COCO_V1 |
|
|
|
|
|
@register_model() |
|
@handle_legacy_interface( |
|
weights=("pretrained", SSDLite320_MobileNet_V3_Large_Weights.COCO_V1), |
|
weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1), |
|
) |
|
def ssdlite320_mobilenet_v3_large( |
|
*, |
|
weights: Optional[SSDLite320_MobileNet_V3_Large_Weights] = None, |
|
progress: bool = True, |
|
num_classes: Optional[int] = None, |
|
weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1, |
|
trainable_backbone_layers: Optional[int] = None, |
|
norm_layer: Optional[Callable[..., nn.Module]] = None, |
|
**kwargs: Any, |
|
) -> SSD: |
|
"""SSDlite model architecture with input size 320x320 and a MobileNetV3 Large backbone, as |
|
described at `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__ and |
|
`MobileNetV2: Inverted Residuals and Linear Bottlenecks <https://arxiv.org/abs/1801.04381>`__. |
|
|
|
.. betastatus:: detection module |
|
|
|
See :func:`~torchvision.models.detection.ssd300_vgg16` for more details. |
|
|
|
    Example:

        >>> import torch
        >>> import torchvision
        >>> from torchvision.models.detection import SSDLite320_MobileNet_V3_Large_Weights
        >>> model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(weights=SSDLite320_MobileNet_V3_Large_Weights.DEFAULT)
        >>> model.eval()
        >>> x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
|
|
|
Args: |
|
weights (:class:`~torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights`, optional): The |
|
pretrained weights to use. See |
|
:class:`~torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights` below for |
|
more details, and possible values. By default, no pre-trained |
|
weights are used. |
|
progress (bool, optional): If True, displays a progress bar of the |
|
download to stderr. Default is True. |
|
num_classes (int, optional): number of output classes of the model |
|
(including the background). |
|
weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The pretrained |
|
weights for the backbone. |
|
trainable_backbone_layers (int, optional): number of trainable (not frozen) layers |
|
starting from final block. Valid values are between 0 and 6, with 6 meaning all |
|
backbone layers are trainable. If ``None`` is passed (the default) this value is |
|
set to 6. |
|
norm_layer (callable, optional): Module specifying the normalization layer to use. |
|
**kwargs: parameters passed to the ``torchvision.models.detection.ssd.SSD`` |
|
base class. Please refer to the `source code |
|
<https://github.com/pytorch/vision/blob/main/torchvision/models/detection/ssdlite.py>`_ |
|
for more details about this class. |
|
|
|
.. autoclass:: torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights |
|
:members: |
|
""" |
|
|
|
weights = SSDLite320_MobileNet_V3_Large_Weights.verify(weights) |
|
weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone) |
|
|
|
if "size" in kwargs: |
|
warnings.warn("The size of the model is already fixed; ignoring the parameter.") |
|
|
|
if weights is not None: |
|
weights_backbone = None |
|
num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"])) |
|
elif num_classes is None: |
|
num_classes = 91 |
|
|
|
trainable_backbone_layers = _validate_trainable_layers( |
|
weights is not None or weights_backbone is not None, trainable_backbone_layers, 6, 6 |
|
) |
|
|
|
|
|
    # Enable reduced tail if no pretrained backbone is selected. See Table 6 of the MobileNetV3 paper.
    reduce_tail = weights_backbone is None
|
|
|
if norm_layer is None: |
|
norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03) |
|
|
|
backbone = mobilenet_v3_large( |
|
weights=weights_backbone, progress=progress, norm_layer=norm_layer, reduced_tail=reduce_tail, **kwargs |
|
) |
|
if weights_backbone is None: |
|
|
|
        # Change the default initialization scheme if not pretrained
        _normal_init(backbone)
|
backbone = _mobilenet_extractor( |
|
backbone, |
|
trainable_backbone_layers, |
|
norm_layer, |
|
) |
|
|
|
size = (320, 320) |
|
    # Six feature maps with extra aspect ratios {2, 3} each; default box scales are
    # spread between min_ratio and max_ratio as in the SSD paper.
    anchor_generator = DefaultBoxGenerator([[2, 3] for _ in range(6)], min_ratio=0.2, max_ratio=0.95)
|
out_channels = det_utils.retrieve_out_channels(backbone, size) |
|
num_anchors = anchor_generator.num_anchors_per_location() |
|
    if len(out_channels) != len(anchor_generator.aspect_ratios):
        raise ValueError(
            f"The length of the output channels from the backbone ({len(out_channels)}) does not match "
            f"the length of the anchor generator aspect ratios ({len(anchor_generator.aspect_ratios)})"
        )
|
|
|
defaults = { |
|
"score_thresh": 0.001, |
|
"nms_thresh": 0.55, |
|
"detections_per_img": 300, |
|
"topk_candidates": 300, |
|
|
|
|
|
"image_mean": [0.5, 0.5, 0.5], |
|
"image_std": [0.5, 0.5, 0.5], |
|
} |
|
kwargs: Any = {**defaults, **kwargs} |
|
model = SSD( |
|
backbone, |
|
anchor_generator, |
|
size, |
|
num_classes, |
|
head=SSDLiteHead(out_channels, num_anchors, num_classes, norm_layer), |
|
**kwargs, |
|
) |
|
|
|
if weights is not None: |
|
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True)) |
|
|
|
return model |
|
|