|
import math |
|
import warnings |
|
from collections import OrderedDict |
|
from functools import partial |
|
from typing import Any, Callable, Dict, List, Optional, Tuple |
|
|
|
import torch |
|
from torch import nn, Tensor |
|
|
|
from ...ops import boxes as box_ops, generalized_box_iou_loss, misc as misc_nn_ops, sigmoid_focal_loss |
|
from ...ops.feature_pyramid_network import LastLevelP6P7 |
|
from ...transforms._presets import ObjectDetection |
|
from ...utils import _log_api_usage_once |
|
from .._api import register_model, Weights, WeightsEnum |
|
from .._meta import _COCO_CATEGORIES |
|
from .._utils import _ovewrite_value_param, handle_legacy_interface |
|
from ..resnet import resnet50, ResNet50_Weights |
|
from . import _utils as det_utils |
|
from .anchor_utils import AnchorGenerator |
|
from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers |
|
from .transform import GeneralizedRCNNTransform |
|
|
|
|
|
# Public names exported by a star-import of this module: the model class,
# its pretrained-weights enum, and the builder function.
__all__ = [
    "FCOS",
    "FCOS_ResNet50_FPN_Weights",
    "fcos_resnet50_fpn",
]
|
|
|
|
|
class FCOSHead(nn.Module):
    """
    A regression and classification head for use in FCOS.

    Bundles an :class:`FCOSClassificationHead` and an :class:`FCOSRegressionHead`
    and implements the training losses on top of their outputs.

    Args:
        in_channels (int): number of channels of the input feature
        num_anchors (int): number of anchors to be predicted
        num_classes (int): number of classes to be predicted
        num_convs (Optional[int]): number of conv layer of head. ``None`` is
            treated as the default. Default: 4.
    """

    # Declares the attribute type explicitly (the box coder is not an nn.Module).
    __annotations__ = {
        "box_coder": det_utils.BoxLinearCoder,
    }

    def __init__(self, in_channels: int, num_anchors: int, num_classes: int, num_convs: Optional[int] = 4) -> None:
        super().__init__()
        if num_convs is None:
            # The signature advertises ``Optional[int]`` but both sub-heads feed the
            # value straight into ``range()``; fall back to the documented default
            # instead of failing with a TypeError.
            num_convs = 4
        self.box_coder = det_utils.BoxLinearCoder(normalize_by_size=True)
        self.classification_head = FCOSClassificationHead(in_channels, num_anchors, num_classes, num_convs)
        self.regression_head = FCOSRegressionHead(in_channels, num_anchors, num_convs)

    def compute_loss(
        self,
        targets: List[Dict[str, Tensor]],
        head_outputs: Dict[str, Tensor],
        anchors: List[Tensor],
        matched_idxs: List[Tensor],
    ) -> Dict[str, Tensor]:
        """
        Compute the classification, box-regression and centerness losses.

        Args:
            targets (List[Dict[str, Tensor]]): per-image ground truth with ``"boxes"`` and ``"labels"``.
            head_outputs (Dict[str, Tensor]): output of :meth:`forward`
                (``"cls_logits"``, ``"bbox_regression"``, ``"bbox_ctrness"``).
            anchors (List[Tensor]): per-image anchors; they are stacked, so every image
                must have the same number of anchors.
            matched_idxs (List[Tensor]): per-image index of the ground-truth box assigned
                to each anchor; negative values mark background anchors.

        Returns:
            Dict[str, Tensor]: losses under the keys ``"classification"``,
            ``"bbox_regression"`` and ``"bbox_ctrness"``, each summed and then divided
            by ``max(1, number of foreground anchors)``.
        """
        cls_logits = head_outputs["cls_logits"]  # [N, HWA, C]
        bbox_regression = head_outputs["bbox_regression"]  # [N, HWA, 4]
        bbox_ctrness = head_outputs["bbox_ctrness"]  # [N, HWA, 1]

        all_gt_classes_targets = []
        all_gt_boxes_targets = []
        for targets_per_image, matched_idxs_per_image in zip(targets, matched_idxs):
            if len(targets_per_image["labels"]) == 0:
                # No ground truth in this image: use placeholders of the right size.
                gt_classes_targets = targets_per_image["labels"].new_zeros((len(matched_idxs_per_image),))
                gt_boxes_targets = targets_per_image["boxes"].new_zeros((len(matched_idxs_per_image), 4))
            else:
                # Clip so that background entries (-1) still index a valid row; their
                # class is overwritten with -1 right below.
                gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
                gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]
            gt_classes_targets[matched_idxs_per_image < 0] = -1  # background
            all_gt_classes_targets.append(gt_classes_targets)
            all_gt_boxes_targets.append(gt_boxes_targets)

        # List[Tensor] -> Tensor for the per-image targets and the anchors.
        all_gt_boxes_targets, all_gt_classes_targets, anchors = (
            torch.stack(all_gt_boxes_targets),
            torch.stack(all_gt_classes_targets),
            torch.stack(anchors),
        )

        # Foreground anchors are those with a non-negative class target.
        foreground_mask = all_gt_classes_targets >= 0
        num_foreground = foreground_mask.sum().item()

        # Classification loss: focal loss against one-hot targets (all-zero rows for background).
        gt_classes_targets = torch.zeros_like(cls_logits)
        gt_classes_targets[foreground_mask, all_gt_classes_targets[foreground_mask]] = 1.0
        loss_cls = sigmoid_focal_loss(cls_logits, gt_classes_targets, reduction="sum")

        # Decode the regression output into boxes so the loss is computed on box coordinates.
        pred_boxes = self.box_coder.decode(bbox_regression, anchors)

        # Regression loss: generalized IoU loss on foreground anchors only.
        loss_bbox_reg = generalized_box_iou_loss(
            pred_boxes[foreground_mask],
            all_gt_boxes_targets[foreground_mask],
            reduction="sum",
        )

        # Centerness loss.
        # NOTE(review): columns 0/2 and 1/3 of the encoded targets are presumably the
        # left/right and top/bottom distances to the box sides -- confirm against BoxLinearCoder.
        bbox_reg_targets = self.box_coder.encode(anchors, all_gt_boxes_targets)

        if len(bbox_reg_targets) == 0:
            gt_ctrness_targets = bbox_reg_targets.new_zeros(bbox_reg_targets.size()[:-1])
        else:
            left_right = bbox_reg_targets[:, :, [0, 2]]
            top_bottom = bbox_reg_targets[:, :, [1, 3]]
            gt_ctrness_targets = torch.sqrt(
                (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0])
                * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
            )
        pred_centerness = bbox_ctrness.squeeze(dim=2)
        # Only foreground entries are used; values computed for background anchors are discarded.
        loss_bbox_ctrness = nn.functional.binary_cross_entropy_with_logits(
            pred_centerness[foreground_mask], gt_ctrness_targets[foreground_mask], reduction="sum"
        )

        # max(1, ...) avoids a division by zero when no anchor is foreground.
        return {
            "classification": loss_cls / max(1, num_foreground),
            "bbox_regression": loss_bbox_reg / max(1, num_foreground),
            "bbox_ctrness": loss_bbox_ctrness / max(1, num_foreground),
        }

    def forward(self, x: List[Tensor]) -> Dict[str, Tensor]:
        """
        Run both sub-heads on the list of feature maps.

        Returns:
            Dict[str, Tensor]: ``"cls_logits"``, ``"bbox_regression"`` and ``"bbox_ctrness"``,
            each concatenated over all feature levels along dimension 1.
        """
        cls_logits = self.classification_head(x)
        bbox_regression, bbox_ctrness = self.regression_head(x)
        return {
            "cls_logits": cls_logits,
            "bbox_regression": bbox_regression,
            "bbox_ctrness": bbox_ctrness,
        }
|
|
|
|
|
class FCOSClassificationHead(nn.Module):
    """
    A classification head for use in FCOS.

    A tower of ``num_convs`` (3x3 conv -> norm -> ReLU) stages followed by a
    3x3 conv that emits ``num_anchors * num_classes`` logits per location.
    The same weights are applied to every feature level.

    Args:
        in_channels (int): number of channels of the input feature.
        num_anchors (int): number of anchors to be predicted.
        num_classes (int): number of classes to be predicted.
        num_convs (int): number of conv layer. Default: 4.
        prior_probability (float): probability of prior, used to initialise
            the bias of the final conv. Default: 0.01.
        norm_layer: Module specifying the normalization layer to use. When ``None``,
            ``nn.GroupNorm`` with 32 groups is used.
    """

    def __init__(
        self,
        in_channels: int,
        num_anchors: int,
        num_classes: int,
        num_convs: int = 4,
        prior_probability: float = 0.01,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()

        self.num_classes = num_classes
        self.num_anchors = num_anchors

        make_norm = partial(nn.GroupNorm, 32) if norm_layer is None else norm_layer

        # Shared tower: num_convs x (conv3x3, norm, ReLU), channel count unchanged.
        tower = []
        for _ in range(num_convs):
            tower += [
                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1),
                make_norm(in_channels),
                nn.ReLU(),
            ]
        self.conv = nn.Sequential(*tower)

        # Re-initialise only the convolutions of the tower.
        for module in self.conv:
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.normal_(module.weight, std=0.01)
            nn.init.constant_(module.bias, 0)

        # Final projection; the bias is chosen so that sigmoid(bias) == prior_probability.
        self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
        nn.init.normal_(self.cls_logits.weight, std=0.01)
        prior_bias = -math.log((1 - prior_probability) / prior_probability)
        nn.init.constant_(self.cls_logits.bias, prior_bias)

    def forward(self, x: List[Tensor]) -> Tensor:
        """
        Args:
            x (List[Tensor]): one feature map of shape (N, in_channels, H, W) per level.

        Returns:
            Tensor: class logits of shape (N, sum over levels of H*W*A, num_classes).
        """
        logits_per_level = []

        for level_features in x:
            logits = self.cls_logits(self.conv(level_features))

            # (N, A * K, H, W) -> (N, A, K, H, W) -> (N, H, W, A, K) -> (N, HWA, K)
            batch, _, height, width = logits.shape
            logits = (
                logits.view(batch, -1, self.num_classes, height, width)
                .permute(0, 3, 4, 1, 2)
                .reshape(batch, -1, self.num_classes)
            )

            logits_per_level.append(logits)

        return torch.cat(logits_per_level, dim=1)
|
|
|
|
|
class FCOSRegressionHead(nn.Module):
    """
    A regression head for use in FCOS, which combines regression branch and center-ness branch.
    This can obtain better performance.

    Both branches share one conv tower; the box branch output is passed through a
    ReLU, so predicted regression values are never negative.

    Reference: `FCOS: A simple and strong anchor-free object detector <https://arxiv.org/abs/2006.09214>`_.

    Args:
        in_channels (int): number of channels of the input feature
        num_anchors (int): number of anchors to be predicted
        num_convs (int): number of conv layer. Default: 4.
        norm_layer: Module specifying the normalization layer to use. When ``None``,
            ``nn.GroupNorm`` with 32 groups is used.
    """

    def __init__(
        self,
        in_channels: int,
        num_anchors: int,
        num_convs: int = 4,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ):
        super().__init__()

        make_norm = partial(nn.GroupNorm, 32) if norm_layer is None else norm_layer

        # Shared tower: num_convs x (conv3x3, norm, ReLU), channel count unchanged.
        tower = []
        for _ in range(num_convs):
            tower += [
                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1),
                make_norm(in_channels),
                nn.ReLU(),
            ]
        self.conv = nn.Sequential(*tower)

        # Output projections: 4 box values and 1 centerness logit per anchor.
        self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
        self.bbox_ctrness = nn.Conv2d(in_channels, num_anchors * 1, kernel_size=3, stride=1, padding=1)
        for projection in (self.bbox_reg, self.bbox_ctrness):
            nn.init.normal_(projection.weight, std=0.01)
            nn.init.zeros_(projection.bias)

        # Re-initialise only the convolutions of the tower.
        for module in self.conv:
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.normal_(module.weight, std=0.01)
            nn.init.zeros_(module.bias)

    def forward(self, x: List[Tensor]) -> Tuple[Tensor, Tensor]:
        """
        Args:
            x (List[Tensor]): one feature map of shape (N, in_channels, H, W) per level.

        Returns:
            Tuple[Tensor, Tensor]: box regression of shape (N, sum of H*W*A, 4) and
            centerness logits of shape (N, sum of H*W*A, 1).
        """
        regression_per_level = []
        ctrness_per_level = []

        for level_features in x:
            tower_out = self.conv(level_features)
            regression = nn.functional.relu(self.bbox_reg(tower_out))
            ctrness = self.bbox_ctrness(tower_out)

            batch, _, height, width = regression.shape

            # (N, A * 4, H, W) -> (N, A, 4, H, W) -> (N, H, W, A, 4) -> (N, HWA, 4)
            regression = (
                regression.view(batch, -1, 4, height, width).permute(0, 3, 4, 1, 2).reshape(batch, -1, 4)
            )
            regression_per_level.append(regression)

            # Same layout change for the single centerness channel.
            ctrness = ctrness.view(batch, -1, 1, height, width).permute(0, 3, 4, 1, 2).reshape(batch, -1, 1)
            ctrness_per_level.append(ctrness)

        return torch.cat(regression_per_level, dim=1), torch.cat(ctrness_per_level, dim=1)
|
|
|
|
|
class FCOS(nn.Module):
    """
    Implements FCOS.

    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
    image, and should be in 0-1 range. Different images can have different sizes.

    The behavior of the model changes depending on if it is in training or evaluation mode.

    During training, the model expects both the input tensors and targets (list of dictionary),
    containing:
        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the class label for each ground-truth box

    The model returns a Dict[Tensor] during training, containing the classification, regression
    and centerness losses.

    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
    follows:
        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the predicted labels for each image
        - scores (Tensor[N]): the scores for each prediction

    Args:
        backbone (nn.Module): the network used to compute the features for the model.
            It should contain an out_channels attribute, which indicates the number of output
            channels that each feature map has (and it should be the same for all feature maps).
            The backbone should return a single Tensor or an OrderedDict[Tensor].
        num_classes (int): number of output classes of the model (including the background).
        min_size (int): Images are rescaled before feeding them to the backbone:
            we attempt to preserve the aspect ratio and scale the shorter edge
            to ``min_size``. If the resulting longer edge exceeds ``max_size``,
            then downscale so that the longer edge does not exceed ``max_size``.
            This may result in the shorter edge being lower than ``min_size``.
        max_size (int): See ``min_size``.
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone has been trained
            on
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone has been trained on
        anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
            maps. For FCOS, only set one anchor for per position of each level, the width and height equal to
            the stride of feature map, and set aspect ratio = 1.0, so the center of anchor is equivalent to the point
            in FCOS paper.
        head (nn.Module): Module run on top of the feature pyramid.
            Defaults to a module containing a classification and regression module.
        center_sampling_radius (float): radius of the "center" of a groundtruth box,
            within which all anchor points are labeled positive.
        score_thresh (float): Score threshold used for postprocessing the detections.
        nms_thresh (float): NMS threshold used for postprocessing the detections.
        detections_per_img (int): Number of best detections to keep after NMS.
        topk_candidates (int): Number of best detections to keep before NMS.

    Example:

        >>> import torch
        >>> import torchvision
        >>> from torchvision.models.detection import FCOS
        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
        >>> # load a pre-trained model for classification and return
        >>> # only the features
        >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
        >>> # FCOS needs to know the number of
        >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
        >>> # so we need to add it here
        >>> backbone.out_channels = 1280
        >>>
        >>> # FCOS places exactly one anchor at each spatial location
        >>> # (the constructor rejects generators that produce more), so
        >>> # every level gets a single size and a single aspect ratio.
        >>> anchor_generator = AnchorGenerator(
        >>>     sizes=((8,), (16,), (32,), (64,), (128,)),
        >>>     aspect_ratios=((1.0,),)
        >>> )
        >>>
        >>> # put the pieces together inside a FCOS model
        >>> model = FCOS(
        >>>     backbone,
        >>>     num_classes=80,
        >>>     anchor_generator=anchor_generator,
        >>> )
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
    """

    # Declares the attribute type explicitly (the box coder is not an nn.Module).
    __annotations__ = {
        "box_coder": det_utils.BoxLinearCoder,
    }

    def __init__(
        self,
        backbone: nn.Module,
        num_classes: int,
        # transform parameters
        min_size: int = 800,
        max_size: int = 1333,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        # Anchor parameters
        anchor_generator: Optional[AnchorGenerator] = None,
        head: Optional[nn.Module] = None,
        center_sampling_radius: float = 1.5,
        score_thresh: float = 0.2,
        nms_thresh: float = 0.6,
        detections_per_img: int = 100,
        topk_candidates: int = 1000,
        **kwargs,
    ):
        super().__init__()
        _log_api_usage_once(self)

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )
        self.backbone = backbone

        if anchor_generator is not None and not isinstance(anchor_generator, AnchorGenerator):
            raise TypeError(
                f"anchor_generator should be of type AnchorGenerator or None, instead got {type(anchor_generator)}"
            )

        if anchor_generator is None:
            # One square anchor per location on each of five levels.
            default_sizes = ((8,), (16,), (32,), (64,), (128,))
            default_ratios = ((1.0,),) * len(default_sizes)
            anchor_generator = AnchorGenerator(default_sizes, default_ratios)
        self.anchor_generator = anchor_generator

        # FCOS is anchor-free: only a single anchor per location is supported.
        anchors_per_location = self.anchor_generator.num_anchors_per_location()[0]
        if anchors_per_location != 1:
            raise ValueError(
                f"anchor_generator.num_anchors_per_location()[0] should be 1 instead of {anchors_per_location}"
            )

        if head is None:
            head = FCOSHead(backbone.out_channels, anchors_per_location, num_classes)
        self.head = head

        self.box_coder = det_utils.BoxLinearCoder(normalize_by_size=True)

        # Fall back to the default normalisation statistics when none are given.
        mean = [0.485, 0.456, 0.406] if image_mean is None else image_mean
        std = [0.229, 0.224, 0.225] if image_std is None else image_std
        self.transform = GeneralizedRCNNTransform(min_size, max_size, mean, std, **kwargs)

        self.center_sampling_radius = center_sampling_radius
        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img
        self.topk_candidates = topk_candidates

        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(
        self, losses: Dict[str, Tensor], detections: List[Dict[str, Tensor]]
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        """
        Select the eager-mode result: the losses while training, the detections otherwise.

        The return annotation intentionally mirrors :meth:`forward` even though only
        one of the two values is returned.
        """
        return losses if self.training else detections

    def compute_loss(
        self,
        targets: List[Dict[str, Tensor]],
        head_outputs: Dict[str, Tensor],
        anchors: List[Tensor],
        num_anchors_per_level: List[int],
    ) -> Dict[str, Tensor]:
        """
        Assign a ground-truth box to every anchor and delegate the loss to the head.

        An anchor is matched to a box when (a) its center lies within
        ``center_sampling_radius * anchor_size`` of the box center, (b) its center is
        strictly inside the box, and (c) the largest distance to a box side falls in
        the scale range of the anchor's level. Among several candidates the box with
        the smallest area wins; anchors without a candidate get index ``-1``.

        Args:
            targets (List[Dict[str, Tensor]]): per-image ground truth.
            head_outputs (Dict[str, Tensor]): outputs of the head.
            anchors (List[Tensor]): per-image anchors in ``[x1, y1, x2, y2]`` format.
            num_anchors_per_level (List[int]): number of anchors on each feature level.

        Returns:
            Dict[str, Tensor]: the losses computed by ``self.head.compute_loss``.
        """
        matched_idxs = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            gt_boxes = targets_per_image["boxes"]
            if gt_boxes.numel() == 0:
                # No ground truth: every anchor is background.
                matched_idxs.append(
                    torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device)
                )
                continue

            gt_centers = (gt_boxes[:, :2] + gt_boxes[:, 2:]) / 2  # (M, 2)
            anchor_centers = (anchors_per_image[:, :2] + anchors_per_image[:, 2:]) / 2  # (N, 2)
            anchor_sizes = anchors_per_image[:, 2] - anchors_per_image[:, 0]  # (N,)

            # center sampling: anchor point must be close enough to gt center.
            center_offset = (anchor_centers[:, None, :] - gt_centers[None, :, :]).abs().max(dim=2).values
            is_match = center_offset < self.center_sampling_radius * anchor_sizes[:, None]

            # signed distances from each of the N points to the four sides of each of the M boxes
            x, y = anchor_centers.unsqueeze(dim=2).unbind(dim=1)  # (N, 1)
            x0, y0, x1, y1 = gt_boxes.unsqueeze(dim=0).unbind(dim=2)  # (1, M)
            side_dists = torch.stack([x - x0, y - y0, x1 - x, y1 - y], dim=2)  # (N, M, 4)

            # anchor point must be inside gt
            is_match = is_match & (side_dists.min(dim=2).values > 0)

            # each anchor is only responsible for certain scale range;
            # the first level has no lower bound and the last level no upper bound.
            lower_bound = anchor_sizes * 4
            lower_bound[: num_anchors_per_level[0]] = 0
            upper_bound = anchor_sizes * 8
            upper_bound[-num_anchors_per_level[-1] :] = float("inf")
            max_side_dist = side_dists.max(dim=2).values
            is_match = is_match & (max_side_dist > lower_bound[:, None]) & (max_side_dist < upper_bound[:, None])

            # match the GT box with minimum area, if there are multiple GT matches:
            # smaller boxes receive a larger score, non-matches a score of zero.
            gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])  # (M,)
            match_score = is_match.to(torch.float32) * (1e8 - gt_areas[None, :])
            best_score, matched_idx = match_score.max(dim=1)
            matched_idx[best_score < 1e-5] = -1  # unmatched anchors are assigned -1

            matched_idxs.append(matched_idx)

        return self.head.compute_loss(targets, head_outputs, anchors, matched_idxs)

    def postprocess_detections(
        self, head_outputs: Dict[str, List[Tensor]], anchors: List[List[Tensor]], image_shapes: List[Tuple[int, int]]
    ) -> List[Dict[str, Tensor]]:
        """
        Turn raw head outputs into final detections.

        Per image and per level, candidates are scored with
        ``sqrt(sigmoid(class logit) * sigmoid(centerness))``, filtered by
        ``score_thresh``, limited to ``topk_candidates``, decoded and clipped to the
        image. The candidates of all levels are then merged, passed through
        class-wise NMS and truncated to ``detections_per_img``.

        Args:
            head_outputs (Dict[str, List[Tensor]]): head outputs split per feature level.
            anchors (List[List[Tensor]]): anchors per image, split per feature level.
            image_shapes (List[Tuple[int, int]]): size of every (transformed) image.

        Returns:
            List[Dict[str, Tensor]]: one dict with ``"boxes"``, ``"scores"`` and ``"labels"`` per image.
        """
        class_logits = head_outputs["cls_logits"]
        box_regression = head_outputs["bbox_regression"]
        box_ctrness = head_outputs["bbox_ctrness"]

        detections: List[Dict[str, Tensor]] = []

        for index in range(len(image_shapes)):
            # Pick this image's slice out of every level.
            box_regression_per_image = [br[index] for br in box_regression]
            logits_per_image = [cl[index] for cl in class_logits]
            box_ctrness_per_image = [bc[index] for bc in box_ctrness]
            anchors_per_image = anchors[index]
            image_shape = image_shapes[index]

            level_boxes = []
            level_scores = []
            level_labels = []

            for box_regression_per_level, logits_per_level, box_ctrness_per_level, anchors_per_level in zip(
                box_regression_per_image, logits_per_image, box_ctrness_per_image, anchors_per_image
            ):
                num_classes = logits_per_level.shape[-1]

                # remove low scoring boxes
                combined_scores = torch.sqrt(
                    torch.sigmoid(logits_per_level) * torch.sigmoid(box_ctrness_per_level)
                ).flatten()
                above_thresh = combined_scores > self.score_thresh
                kept_scores = combined_scores[above_thresh]
                candidate_idxs = torch.where(above_thresh)[0]

                # keep only topk scoring predictions
                num_topk = det_utils._topk_min(candidate_idxs, self.topk_candidates, 0)
                kept_scores, order = kept_scores.topk(num_topk)
                candidate_idxs = candidate_idxs[order]

                # A flat index encodes (anchor, class) as anchor * num_classes + class.
                anchor_idxs = torch.div(candidate_idxs, num_classes, rounding_mode="floor")
                labels_per_level = candidate_idxs % num_classes

                boxes_per_level = self.box_coder.decode(
                    box_regression_per_level[anchor_idxs], anchors_per_level[anchor_idxs]
                )
                boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)

                level_boxes.append(boxes_per_level)
                level_scores.append(kept_scores)
                level_labels.append(labels_per_level)

            image_boxes = torch.cat(level_boxes, dim=0)
            image_scores = torch.cat(level_scores, dim=0)
            image_labels = torch.cat(level_labels, dim=0)

            # non-maximum suppression
            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
            keep = keep[: self.detections_per_img]

            detections.append(
                {
                    "boxes": image_boxes[keep],
                    "scores": image_scores[keep],
                    "labels": image_labels[keep],
                }
            )

        return detections

    def forward(
        self,
        images: List[Tensor],
        targets: Optional[List[Dict[str, Tensor]]] = None,
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        """
        Args:
            images (list[Tensor]): images to be processed
            targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[Dict[Tensor]] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns one dict per image with the fields
                ``boxes``, ``scores`` and ``labels``.
                When scripted, a ``(losses, detections)`` tuple is always returned.
        """
        if self.training:

            if targets is None:
                torch._assert(False, "targets should not be none when in training mode")
            else:
                for target in targets:
                    boxes = target["boxes"]
                    torch._assert(isinstance(boxes, torch.Tensor), "Expected target boxes to be of type Tensor.")
                    torch._assert(
                        len(boxes.shape) == 2 and boxes.shape[-1] == 4,
                        f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.",
                    )

        # Remember the input sizes so detections can be mapped back after the transform.
        original_image_sizes: List[Tuple[int, int]] = []
        for img in images:
            height_width = img.shape[-2:]
            torch._assert(
                len(height_width) == 2,
                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
            )
            original_image_sizes.append((height_width[0], height_width[1]))

        # transform the input
        images, targets = self.transform(images, targets)

        # Check for degenerate boxes
        if targets is not None:
            for target_idx, target in enumerate(targets):
                boxes = target["boxes"]
                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
                if degenerate_boxes.any():
                    # report the first degenerate box
                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                    degen_bb: List[float] = boxes[bb_idx].tolist()
                    torch._assert(
                        False,
                        f"All bounding boxes should have positive height and width. Found invalid box {degen_bb} for target at index {target_idx}.",
                    )

        # get the features from the backbone
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            # A single feature map is treated as a one-level pyramid.
            features = OrderedDict([("0", features)])

        features = list(features.values())

        # compute the fcos heads outputs using the features
        head_outputs = self.head(features)

        # create the set of anchors
        anchors = self.anchor_generator(images, features)
        # recover level sizes (H * W of every feature map; one anchor per location)
        num_anchors_per_level = [feature_map.size(2) * feature_map.size(3) for feature_map in features]

        losses = {}
        detections: List[Dict[str, Tensor]] = []
        if self.training:
            if targets is None:
                torch._assert(False, "targets should not be none when in training mode")
            else:
                # compute the losses
                losses = self.compute_loss(targets, head_outputs, anchors, num_anchors_per_level)
        else:
            # split outputs per level
            split_head_outputs: Dict[str, List[Tensor]] = {}
            for k in head_outputs:
                split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
            split_anchors = [list(a.split(num_anchors_per_level)) for a in anchors]

            # compute the detections, then map them back to the original image sizes
            detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
            detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn("FCOS always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        return self.eager_outputs(losses, detections)
|
|
|
|
|
class FCOS_ResNet50_FPN_Weights(WeightsEnum):
    # Checkpoint trained on COCO; its metadata records the parameter count,
    # the category list, the training recipe and the reported validation metric.
    COCO_V1 = Weights(
        url="https://download.pytorch.org/models/fcos_resnet50_fpn_coco-99b0c9b7.pth",
        transforms=ObjectDetection,
        meta={
            "num_params": 32269600,
            "categories": _COCO_CATEGORIES,
            "min_size": (1, 1),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#fcos-resnet-50-fpn",
            "_metrics": {
                "COCO-val2017": {
                    "box_map": 39.2,
                }
            },
            "_ops": 128.207,
            "_file_size": 123.608,
            "_docs": "These weights were produced by following a similar training recipe as on the paper.",
        },
    )
    # Alias selected when the caller asks for the default weights.
    DEFAULT = COCO_V1
|
|
|
|
|
@register_model()
@handle_legacy_interface(
    weights=("pretrained", FCOS_ResNet50_FPN_Weights.COCO_V1),
    weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
)
def fcos_resnet50_fpn(
    *,
    weights: Optional[FCOS_ResNet50_FPN_Weights] = None,
    progress: bool = True,
    num_classes: Optional[int] = None,
    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
    trainable_backbone_layers: Optional[int] = None,
    **kwargs: Any,
) -> FCOS:
    """
    Constructs a FCOS model with a ResNet-50-FPN backbone.

    .. betastatus:: detection module

    Reference: `FCOS: Fully Convolutional One-Stage Object Detection <https://arxiv.org/abs/1904.01355>`_.
    `FCOS: A simple and strong anchor-free object detector <https://arxiv.org/abs/2006.09214>`_.

    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
    image, and should be in ``0-1`` range. Different images can have different sizes.

    The behavior of the model changes depending on if it is in training or evaluation mode.

    During training, the model expects both the input tensors and targets (list of dictionary),
    containing:

        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the class label for each ground-truth box

    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
    losses.

    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
    follows, where ``N`` is the number of detections:

        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the predicted labels for each detection
        - scores (``Tensor[N]``): the scores of each detection

    For more details on the output, you may refer to :ref:`instance_seg_output`.

    Example:

        >>> model = torchvision.models.detection.fcos_resnet50_fpn(weights=FCOS_ResNet50_FPN_Weights.DEFAULT)
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)

    Args:
        weights (:class:`~torchvision.models.detection.FCOS_ResNet50_FPN_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.detection.FCOS_ResNet50_FPN_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr
        num_classes (int, optional): number of output classes of the model (including the background)
        weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The pretrained weights for
            the backbone.
        trainable_backbone_layers (int, optional): number of trainable (not frozen) resnet layers starting
            from final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
            trainable. If ``None`` is passed (the default) this value is set to 3. Default: None
        **kwargs: parameters passed to the ``torchvision.models.detection.FCOS``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/fcos.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.detection.FCOS_ResNet50_FPN_Weights
        :members:
    """
    weights = FCOS_ResNet50_FPN_Weights.verify(weights)
    weights_backbone = ResNet50_Weights.verify(weights_backbone)

    load_detector_weights = weights is not None
    if load_detector_weights:
        # The detector checkpoint is loaded into the whole model below, so separate
        # backbone weights are not requested; the class count must agree with the checkpoint.
        weights_backbone = None
        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
    elif num_classes is None:
        num_classes = 91

    is_trained = load_detector_weights or weights_backbone is not None
    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
    # Pretrained backbones keep their batch-norm layers frozen.
    if is_trained:
        norm_layer = misc_nn_ops.FrozenBatchNorm2d
    else:
        norm_layer = nn.BatchNorm2d

    trunk = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
    backbone = _resnet_fpn_extractor(
        trunk,
        trainable_backbone_layers,
        returned_layers=[2, 3, 4],
        extra_blocks=LastLevelP6P7(256, 256),
    )
    model = FCOS(backbone, num_classes, **kwargs)

    if load_detector_weights:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model
|
|