|
import warnings |
|
from collections import OrderedDict |
|
from typing import Any, Dict, List, Optional, Tuple |
|
|
|
import torch |
|
import torch.nn.functional as F |
|
from torch import nn, Tensor |
|
|
|
from ...ops import boxes as box_ops |
|
from ...transforms._presets import ObjectDetection |
|
from ...utils import _log_api_usage_once |
|
from .._api import register_model, Weights, WeightsEnum |
|
from .._meta import _COCO_CATEGORIES |
|
from .._utils import _ovewrite_value_param, handle_legacy_interface |
|
from ..vgg import VGG, vgg16, VGG16_Weights |
|
from . import _utils as det_utils |
|
from .anchor_utils import DefaultBoxGenerator |
|
from .backbone_utils import _validate_trainable_layers |
|
from .transform import GeneralizedRCNNTransform |
|
|
|
|
|
__all__ = [ |
|
"SSD300_VGG16_Weights", |
|
"ssd300_vgg16", |
|
] |
|
|
|
|
|
class SSD300_VGG16_Weights(WeightsEnum): |
|
COCO_V1 = Weights( |
|
url="https://download.pytorch.org/models/ssd300_vgg16_coco-b556d3b4.pth", |
|
transforms=ObjectDetection, |
|
meta={ |
|
"num_params": 35641826, |
|
"categories": _COCO_CATEGORIES, |
|
"min_size": (1, 1), |
|
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssd300-vgg16", |
|
"_metrics": { |
|
"COCO-val2017": { |
|
"box_map": 25.1, |
|
} |
|
}, |
|
"_ops": 34.858, |
|
"_file_size": 135.988, |
|
"_docs": """These weights were produced by following a similar training recipe as on the paper.""", |
|
}, |
|
) |
|
DEFAULT = COCO_V1 |
|
|
|
|
|
def _xavier_init(conv: nn.Module): |
|
for layer in conv.modules(): |
|
if isinstance(layer, nn.Conv2d): |
|
torch.nn.init.xavier_uniform_(layer.weight) |
|
if layer.bias is not None: |
|
torch.nn.init.constant_(layer.bias, 0.0) |
|
|
|
|
|
class SSDHead(nn.Module): |
|
def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): |
|
super().__init__() |
|
self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes) |
|
self.regression_head = SSDRegressionHead(in_channels, num_anchors) |
|
|
|
def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: |
|
return { |
|
"bbox_regression": self.regression_head(x), |
|
"cls_logits": self.classification_head(x), |
|
} |
|
|
|
|
|
class SSDScoringHead(nn.Module): |
|
def __init__(self, module_list: nn.ModuleList, num_columns: int): |
|
super().__init__() |
|
self.module_list = module_list |
|
self.num_columns = num_columns |
|
|
|
def _get_result_from_module_list(self, x: Tensor, idx: int) -> Tensor: |
|
""" |
|
This is equivalent to self.module_list[idx](x), |
|
but torchscript doesn't support this yet |
|
""" |
|
num_blocks = len(self.module_list) |
|
if idx < 0: |
|
idx += num_blocks |
|
out = x |
|
for i, module in enumerate(self.module_list): |
|
if i == idx: |
|
out = module(x) |
|
return out |
|
|
|
def forward(self, x: List[Tensor]) -> Tensor: |
|
all_results = [] |
|
|
|
for i, features in enumerate(x): |
|
results = self._get_result_from_module_list(features, i) |
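
            # Permute from (N, A * K, H, W) to (N, HWA, K), where A is the number
            # of anchors per location and K is self.num_columns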
|
|
|
|
|
N, _, H, W = results.shape |
|
results = results.view(N, -1, self.num_columns, H, W) |
|
results = results.permute(0, 3, 4, 1, 2) |
|
results = results.reshape(N, -1, self.num_columns) |
|
|
|
all_results.append(results) |
|
|
|
return torch.cat(all_results, dim=1) |
|
|
|
|
|
class SSDClassificationHead(SSDScoringHead): |
|
def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): |
|
cls_logits = nn.ModuleList() |
|
for channels, anchors in zip(in_channels, num_anchors): |
|
cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) |
|
_xavier_init(cls_logits) |
|
super().__init__(cls_logits, num_classes) |
|
|
|
|
|
class SSDRegressionHead(SSDScoringHead): |
|
def __init__(self, in_channels: List[int], num_anchors: List[int]): |
|
bbox_reg = nn.ModuleList() |
|
for channels, anchors in zip(in_channels, num_anchors): |
|
bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) |
|
_xavier_init(bbox_reg) |
|
super().__init__(bbox_reg, 4) |
|
|
|
|
|
class SSD(nn.Module): |
|
""" |
|
Implements SSD architecture from `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_. |
|
|
|
    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
    image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
    to a fixed size before being passed to the backbone.
|
|
|
    The behavior of the model changes depending on whether it is in training or evaluation mode.
|
|
|
    During training, the model expects both the input tensors and a targets argument (a list of
    dictionaries), containing:
|
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with |
|
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. |
|
        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
|
|
|
    The model returns a ``Dict[str, Tensor]`` during training, containing the classification and regression
    losses.
|
|
|
    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a ``List[Dict[str, Tensor]]``, one for each input image. The fields of the ``Dict`` are as
    follows, where ``N`` is the number of detections:
|
|
|
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with |
|
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. |
|
        - labels (``Int64Tensor[N]``): the predicted labels for each detection
        - scores (``Tensor[N]``): the scores for each detection
|
|
|
Args: |
|
backbone (nn.Module): the network used to compute the features for the model. |
|
It should contain an out_channels attribute with the list of the output channels of |
|
each feature map. The backbone should return a single Tensor or an OrderedDict[Tensor]. |
|
anchor_generator (DefaultBoxGenerator): module that generates the default boxes for a |
|
set of feature maps. |
|
size (Tuple[int, int]): the width and height to which images will be rescaled before feeding them |
|
to the backbone. |
|
num_classes (int): number of output classes of the model (including the background). |
|
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone has been trained.
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone has been trained.
|
        head (nn.Module, optional): Module run on top of the backbone features. Defaults to a module
            containing classification and regression heads.
|
score_thresh (float): Score threshold used for postprocessing the detections. |
|
nms_thresh (float): NMS threshold used for postprocessing the detections. |
|
detections_per_img (int): Number of best detections to keep after NMS. |
|
        iou_thresh (float): minimum IoU between an anchor and a ground-truth box for the anchor to be
            considered positive during training.
|
topk_candidates (int): Number of best detections to keep before NMS. |
|
        positive_fraction (float): a number between 0 and 1 indicating the proportion of positive
            anchors used during the training of the classification head. It determines the
            negative-to-positive ratio used for hard negative mining.
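
    Example (a minimal sketch with a tiny single-feature-map backbone; see
    ``ssd300_vgg16`` below for a realistic configuration):

        >>> import torch
        >>> from torch import nn
        >>> from torchvision.models.detection.ssd import SSD
        >>> from torchvision.models.detection.anchor_utils import DefaultBoxGenerator
        >>>
        >>> # The backbone must expose an out_channels attribute with one
        >>> # entry per feature map it returns.
        >>> backbone = nn.Sequential(nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1), nn.ReLU())
        >>> backbone.out_channels = [64]
        >>> anchor_generator = DefaultBoxGenerator([[2]])
        >>> model = SSD(backbone, anchor_generator, size=(300, 300), num_classes=2).eval()
        >>> predictions = model([torch.rand(3, 300, 300)])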
|
""" |
|
|
|
__annotations__ = { |
|
"box_coder": det_utils.BoxCoder, |
|
"proposal_matcher": det_utils.Matcher, |
|
} |
|
|
|
def __init__( |
|
self, |
|
backbone: nn.Module, |
|
anchor_generator: DefaultBoxGenerator, |
|
size: Tuple[int, int], |
|
num_classes: int, |
|
image_mean: Optional[List[float]] = None, |
|
image_std: Optional[List[float]] = None, |
|
head: Optional[nn.Module] = None, |
|
score_thresh: float = 0.01, |
|
nms_thresh: float = 0.45, |
|
detections_per_img: int = 200, |
|
iou_thresh: float = 0.5, |
|
topk_candidates: int = 400, |
|
positive_fraction: float = 0.25, |
|
**kwargs: Any, |
|
): |
|
super().__init__() |
|
_log_api_usage_once(self) |
|
|
|
self.backbone = backbone |
|
|
|
self.anchor_generator = anchor_generator |
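
        # The weights (10, 10, 5, 5) are the reciprocals of the usual SSD
        # variances (0.1, 0.1, 0.2, 0.2) used to encode box offsets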
|
|
|
self.box_coder = det_utils.BoxCoder(weights=(10.0, 10.0, 5.0, 5.0)) |
|
|
|
if head is None: |
|
if hasattr(backbone, "out_channels"): |
|
out_channels = backbone.out_channels |
|
else: |
|
out_channels = det_utils.retrieve_out_channels(backbone, size) |
|
|
|
            if len(out_channels) != len(anchor_generator.aspect_ratios):
                raise ValueError(
                    f"The length of the output channels from the backbone ({len(out_channels)}) does not match "
                    f"the length of the anchor generator aspect ratios ({len(anchor_generator.aspect_ratios)})"
                )
|
|
|
num_anchors = self.anchor_generator.num_anchors_per_location() |
|
head = SSDHead(out_channels, num_anchors, num_classes) |
|
self.head = head |
|
|
|
self.proposal_matcher = det_utils.SSDMatcher(iou_thresh) |
|
|
|
if image_mean is None: |
|
image_mean = [0.485, 0.456, 0.406] |
|
if image_std is None: |
|
image_std = [0.229, 0.224, 0.225] |
|
self.transform = GeneralizedRCNNTransform( |
|
min(size), max(size), image_mean, image_std, size_divisible=1, fixed_size=size, **kwargs |
|
) |
|
|
|
self.score_thresh = score_thresh |
|
self.nms_thresh = nms_thresh |
|
self.detections_per_img = detections_per_img |
|
self.topk_candidates = topk_candidates |
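
        # Negative:positive ratio enforced by hard negative mining in compute_loss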
|
self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction |
|
|
|
|
|
self._has_warned = False |
|
|
|
@torch.jit.unused |
|
def eager_outputs( |
|
self, losses: Dict[str, Tensor], detections: List[Dict[str, Tensor]] |
|
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: |
|
if self.training: |
|
return losses |
|
|
|
return detections |
|
|
|
def compute_loss( |
|
self, |
|
targets: List[Dict[str, Tensor]], |
|
head_outputs: Dict[str, Tensor], |
|
anchors: List[Tensor], |
|
matched_idxs: List[Tensor], |
|
) -> Dict[str, Tensor]: |
|
bbox_regression = head_outputs["bbox_regression"] |
|
cls_logits = head_outputs["cls_logits"] |
|
|
|
|
|
num_foreground = 0 |
|
bbox_loss = [] |
|
cls_targets = [] |
|
for ( |
|
targets_per_image, |
|
bbox_regression_per_image, |
|
cls_logits_per_image, |
|
anchors_per_image, |
|
matched_idxs_per_image, |
|
) in zip(targets, bbox_regression, cls_logits, anchors, matched_idxs): |
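            # Anchors with matched_idxs_per_image >= 0 were matched to a ground-truth box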
|
|
|
foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0] |
|
foreground_matched_idxs_per_image = matched_idxs_per_image[foreground_idxs_per_image] |
|
num_foreground += foreground_matched_idxs_per_image.numel() |
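
            # Regression loss is computed on the foreground anchors only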
|
|
|
|
|
matched_gt_boxes_per_image = targets_per_image["boxes"][foreground_matched_idxs_per_image] |
|
bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :] |
|
anchors_per_image = anchors_per_image[foreground_idxs_per_image, :] |
|
target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) |
|
bbox_loss.append( |
|
torch.nn.functional.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction="sum") |
|
) |
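
            # Classification target for every anchor: matched anchors get their
            # ground-truth label, all others stay background (class 0)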
|
|
|
|
|
gt_classes_target = torch.zeros( |
|
(cls_logits_per_image.size(0),), |
|
dtype=targets_per_image["labels"].dtype, |
|
device=targets_per_image["labels"].device, |
|
) |
|
gt_classes_target[foreground_idxs_per_image] = targets_per_image["labels"][ |
|
foreground_matched_idxs_per_image |
|
] |
|
cls_targets.append(gt_classes_target) |
|
|
|
bbox_loss = torch.stack(bbox_loss) |
|
cls_targets = torch.stack(cls_targets) |
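
        # Compute the per-anchor classification loss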
|
|
|
|
|
num_classes = cls_logits.size(-1) |
|
cls_loss = F.cross_entropy(cls_logits.view(-1, num_classes), cls_targets.view(-1), reduction="none").view( |
|
cls_targets.size() |
|
) |
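
        # Hard negative mining: per image, keep only the highest-loss background
        # anchors, at most neg_to_pos_ratio for every foreground anchor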
|
|
|
|
|
foreground_idxs = cls_targets > 0 |
|
num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True) |
|
|
|
negative_loss = cls_loss.clone() |
|
negative_loss[foreground_idxs] = -float("inf") |
|
values, idx = negative_loss.sort(1, descending=True) |
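        # idx.sort(1)[1] recovers each anchor's rank in the descending-loss
        # ordering; ranks below num_negative select the hardest negatives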
|
|
|
background_idxs = idx.sort(1)[1] < num_negative |
|
|
|
N = max(1, num_foreground) |
|
return { |
|
"bbox_regression": bbox_loss.sum() / N, |
|
"classification": (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / N, |
|
} |
|
|
|
def forward( |
|
self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None |
|
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: |
|
if self.training: |
|
if targets is None: |
|
torch._assert(False, "targets should not be none when in training mode") |
|
else: |
|
for target in targets: |
|
boxes = target["boxes"] |
|
if isinstance(boxes, torch.Tensor): |
|
torch._assert( |
|
len(boxes.shape) == 2 and boxes.shape[-1] == 4, |
|
f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.", |
|
) |
|
else: |
|
torch._assert(False, f"Expected target boxes to be of type Tensor, got {type(boxes)}.") |
|
|
|
|
|
original_image_sizes: List[Tuple[int, int]] = [] |
|
for img in images: |
|
val = img.shape[-2:] |
|
            torch._assert(
                len(val) == 2,
                f"expecting the last two dimensions of the Tensor to be H and W, instead got {img.shape[-2:]}",
            )
|
original_image_sizes.append((val[0], val[1])) |
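
        # Normalize and resize the images (and targets) to the fixed input size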
|
|
|
|
|
images, targets = self.transform(images, targets) |
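
        # Check for degenerate boxes (non-positive height or width)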
|
|
|
|
|
if targets is not None: |
|
for target_idx, target in enumerate(targets): |
|
boxes = target["boxes"] |
|
degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] |
|
if degenerate_boxes.any(): |
|
bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0] |
|
degen_bb: List[float] = boxes[bb_idx].tolist() |
|
torch._assert( |
|
False, |
|
"All bounding boxes should have positive height and width." |
|
f" Found invalid box {degen_bb} for target at index {target_idx}.", |
|
) |
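
        # Get the features from the backbone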
|
|
|
|
|
features = self.backbone(images.tensors) |
|
if isinstance(features, torch.Tensor): |
|
features = OrderedDict([("0", features)]) |
|
|
|
features = list(features.values()) |
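
        # Run the SSD heads on the feature maps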
|
|
|
|
|
head_outputs = self.head(features) |
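
        # Generate the default boxes (anchors) for the transformed images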
|
|
|
|
|
anchors = self.anchor_generator(images, features) |
|
|
|
losses = {} |
|
detections: List[Dict[str, Tensor]] = [] |
|
if self.training: |
|
matched_idxs = [] |
|
if targets is None: |
|
torch._assert(False, "targets should not be none when in training mode") |
|
else: |
|
for anchors_per_image, targets_per_image in zip(anchors, targets): |
|
if targets_per_image["boxes"].numel() == 0: |
|
matched_idxs.append( |
|
torch.full( |
|
(anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device |
|
) |
|
) |
|
continue |
|
|
|
match_quality_matrix = box_ops.box_iou(targets_per_image["boxes"], anchors_per_image) |
|
matched_idxs.append(self.proposal_matcher(match_quality_matrix)) |
|
|
|
losses = self.compute_loss(targets, head_outputs, anchors, matched_idxs) |
|
else: |
|
detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes) |
|
detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes) |
|
|
|
if torch.jit.is_scripting(): |
|
if not self._has_warned: |
|
warnings.warn("SSD always returns a (Losses, Detections) tuple in scripting") |
|
self._has_warned = True |
|
return losses, detections |
|
return self.eager_outputs(losses, detections) |
|
|
|
def postprocess_detections( |
|
self, head_outputs: Dict[str, Tensor], image_anchors: List[Tensor], image_shapes: List[Tuple[int, int]] |
|
) -> List[Dict[str, Tensor]]: |
|
bbox_regression = head_outputs["bbox_regression"] |
|
pred_scores = F.softmax(head_outputs["cls_logits"], dim=-1) |
|
|
|
num_classes = pred_scores.size(-1) |
|
device = pred_scores.device |
|
|
|
detections: List[Dict[str, Tensor]] = [] |
|
|
|
for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors, image_shapes): |
|
boxes = self.box_coder.decode_single(boxes, anchors) |
|
boxes = box_ops.clip_boxes_to_image(boxes, image_shape) |
|
|
|
image_boxes = [] |
|
image_scores = [] |
|
image_labels = [] |
|
for label in range(1, num_classes): |
|
score = scores[:, label] |
|
|
|
keep_idxs = score > self.score_thresh |
|
score = score[keep_idxs] |
|
box = boxes[keep_idxs] |
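
                # Keep only the top-k scoring predictions for this class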
|
|
|
|
|
num_topk = det_utils._topk_min(score, self.topk_candidates, 0) |
|
score, idxs = score.topk(num_topk) |
|
box = box[idxs] |
|
|
|
image_boxes.append(box) |
|
image_scores.append(score) |
|
image_labels.append(torch.full_like(score, fill_value=label, dtype=torch.int64, device=device)) |
|
|
|
image_boxes = torch.cat(image_boxes, dim=0) |
|
image_scores = torch.cat(image_scores, dim=0) |
|
image_labels = torch.cat(image_labels, dim=0) |
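
            # Non-maximum suppression, performed independently per class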
|
|
|
|
|
keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh) |
|
keep = keep[: self.detections_per_img] |
|
|
|
detections.append( |
|
{ |
|
"boxes": image_boxes[keep], |
|
"scores": image_scores[keep], |
|
"labels": image_labels[keep], |
|
} |
|
) |
|
return detections |
|
|
|
|
|
class SSDFeatureExtractorVGG(nn.Module): |
|
def __init__(self, backbone: nn.Module, highres: bool): |
|
super().__init__() |
|
|
|
_, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) |
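
        # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper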
|
|
|
|
|
backbone[maxpool3_pos].ceil_mode = True |
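
        # Parameters used for the L2 normalization + rescaling of conv4_3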
|
|
|
|
|
self.scale_weight = nn.Parameter(torch.ones(512) * 20) |
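
        # First feature map: the VGG layers up to conv4_3 (maxpool4 excluded)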
|
|
|
|
|
self.features = nn.Sequential(*backbone[:maxpool4_pos]) |
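
        # Extra feature blocks producing the remaining feature maps of SSD300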
|
|
|
|
|
extra = nn.ModuleList( |
|
[ |
|
nn.Sequential( |
|
nn.Conv2d(1024, 256, kernel_size=1), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), |
|
nn.ReLU(inplace=True), |
|
), |
|
nn.Sequential( |
|
nn.Conv2d(512, 128, kernel_size=1), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), |
|
nn.ReLU(inplace=True), |
|
), |
|
nn.Sequential( |
|
nn.Conv2d(256, 128, kernel_size=1), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(128, 256, kernel_size=3), |
|
nn.ReLU(inplace=True), |
|
), |
|
nn.Sequential( |
|
nn.Conv2d(256, 128, kernel_size=1), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(128, 256, kernel_size=3), |
|
nn.ReLU(inplace=True), |
|
), |
|
] |
|
) |
|
if highres: |
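            # Additional block for the high-resolution SSD512 case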
|
|
|
extra.append( |
|
nn.Sequential( |
|
nn.Conv2d(256, 128, kernel_size=1), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(128, 256, kernel_size=4), |
|
nn.ReLU(inplace=True), |
|
) |
|
) |
|
_xavier_init(extra) |
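
        # Replace VGG's fc6 and fc7 with convolutions; fc6 becomes an atrous
        # (dilated) convolution, as described in Section 3 of the SSD paper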
|
|
|
fc = nn.Sequential( |
|
nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=False), |
|
nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), |
|
nn.ReLU(inplace=True), |
|
) |
|
_xavier_init(fc) |
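
        # Prepend the block that runs maxpool4 through conv5_3 (the original
        # maxpool5 is replaced by the modified pool inside fc) plus fc6/fc7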
|
extra.insert( |
|
0, |
|
nn.Sequential( |
|
*backbone[maxpool4_pos:-1], |
|
fc, |
|
), |
|
) |
|
self.extra = extra |
|
|
|
def forward(self, x: Tensor) -> Dict[str, Tensor]: |
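        # L2 normalization + rescaling of the first feature map (conv4_3)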
|
|
|
x = self.features(x) |
|
rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) |
|
output = [rescaled] |
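
        # Compute the remaining feature maps from the extra blocks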
|
|
|
|
|
for block in self.extra: |
|
x = block(x) |
|
output.append(x) |
|
|
|
return OrderedDict([(str(i), v) for i, v in enumerate(output)]) |
|
|
|
|
|
def _vgg_extractor(backbone: VGG, highres: bool, trainable_layers: int): |
|
backbone = backbone.features |
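
    # Gather the indices of the maxpools; these delimit the VGG stages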
|
|
|
stage_indices = [0] + [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)][:-1] |
|
num_stages = len(stage_indices) |
|
|
|
|
|
torch._assert( |
|
0 <= trainable_layers <= num_stages, |
|
f"trainable_layers should be in the range [0, {num_stages}]. Instead got {trainable_layers}", |
|
) |
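
    # Freeze the parameters of every stage before the first trainable one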
|
freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers] |
|
|
|
for b in backbone[:freeze_before]: |
|
for parameter in b.parameters(): |
|
parameter.requires_grad_(False) |
|
|
|
return SSDFeatureExtractorVGG(backbone, highres) |
|
|
|
|
|
@register_model() |
|
@handle_legacy_interface( |
|
weights=("pretrained", SSD300_VGG16_Weights.COCO_V1), |
|
weights_backbone=("pretrained_backbone", VGG16_Weights.IMAGENET1K_FEATURES), |
|
) |
|
def ssd300_vgg16( |
|
*, |
|
weights: Optional[SSD300_VGG16_Weights] = None, |
|
progress: bool = True, |
|
num_classes: Optional[int] = None, |
|
weights_backbone: Optional[VGG16_Weights] = VGG16_Weights.IMAGENET1K_FEATURES, |
|
trainable_backbone_layers: Optional[int] = None, |
|
**kwargs: Any, |
|
) -> SSD: |
|
"""The SSD300 model is based on the `SSD: Single Shot MultiBox Detector |
|
<https://arxiv.org/abs/1512.02325>`_ paper. |
|
|
|
.. betastatus:: detection module |
|
|
|
    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
    image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
    to a fixed size before being passed to the backbone.
|
|
|
    The behavior of the model changes depending on whether it is in training or evaluation mode.
|
|
|
    During training, the model expects both the input tensors and a targets argument (a list of
    dictionaries), containing:
|
|
|
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with |
|
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. |
|
    - labels (``Int64Tensor[N]``): the class label for each ground-truth box
|
|
|
    The model returns a ``Dict[str, Tensor]`` during training, containing the classification and regression
    losses.
|
|
|
    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a ``List[Dict[str, Tensor]]``, one for each input image. The fields of the ``Dict`` are as
    follows, where ``N`` is the number of detections:
|
|
|
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with |
|
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. |
|
    - labels (``Int64Tensor[N]``): the predicted labels for each detection
    - scores (``Tensor[N]``): the scores for each detection
|
|
|
Example: |
|
|
|
>>> model = torchvision.models.detection.ssd300_vgg16(weights=SSD300_VGG16_Weights.DEFAULT) |
|
>>> model.eval() |
|
>>> x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)] |
|
>>> predictions = model(x) |
|
|
|
Args: |
|
weights (:class:`~torchvision.models.detection.SSD300_VGG16_Weights`, optional): The pretrained |
|
weights to use. See |
|
:class:`~torchvision.models.detection.SSD300_VGG16_Weights` |
|
below for more details, and possible values. By default, no |
|
pre-trained weights are used. |
|
        progress (bool, optional): If True, displays a progress bar of the download to stderr.
            Default is True.
|
num_classes (int, optional): number of output classes of the model (including the background) |
|
weights_backbone (:class:`~torchvision.models.VGG16_Weights`, optional): The pretrained weights for the |
|
backbone |
|
trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block. |
|
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is |
|
passed (the default) this value is set to 4. |
|
**kwargs: parameters passed to the ``torchvision.models.detection.SSD`` |
|
base class. Please refer to the `source code |
|
<https://github.com/pytorch/vision/blob/main/torchvision/models/detection/ssd.py>`_ |
|
for more details about this class. |
|
|
|
.. autoclass:: torchvision.models.detection.SSD300_VGG16_Weights |
|
:members: |
|
""" |
|
weights = SSD300_VGG16_Weights.verify(weights) |
|
weights_backbone = VGG16_Weights.verify(weights_backbone) |
|
|
|
if "size" in kwargs: |
|
warnings.warn("The size of the model is already fixed; ignoring the parameter.") |
|
|
|
if weights is not None: |
|
weights_backbone = None |
|
num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"])) |
|
elif num_classes is None: |
|
num_classes = 91 |
|
|
|
trainable_backbone_layers = _validate_trainable_layers( |
|
weights is not None or weights_backbone is not None, trainable_backbone_layers, 5, 4 |
|
) |
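
    # Build the truncated VGG16 backbone with the SSD extra feature blocks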
|
|
|
|
|
backbone = vgg16(weights=weights_backbone, progress=progress) |
|
backbone = _vgg_extractor(backbone, False, trainable_backbone_layers) |
|
anchor_generator = DefaultBoxGenerator( |
|
[[2], [2, 3], [2, 3], [2, 3], [2], [2]], |
|
scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05], |
|
steps=[8, 16, 32, 64, 100, 300], |
|
) |
|
|
|
defaults = { |
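        # Rescale the input in a way compatible with the backbone:
        # a std of 1/255 undoes the 0-1 scaling applied by the transforms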
|
|
|
"image_mean": [0.48235, 0.45882, 0.40784], |
|
"image_std": [1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0], |
|
} |
|
kwargs: Any = {**defaults, **kwargs} |
|
model = SSD(backbone, anchor_generator, (300, 300), num_classes, **kwargs) |
|
|
|
if weights is not None: |
|
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True)) |
|
|
|
return model |
|
|