|
import torch.nn.functional as F |
|
from torch import Tensor |
|
from torch.nn.common_types import _size_any_t |
|
|
|
from .module import Module |
|
|
|
|
|
__all__ = ["Fold", "Unfold"] |
|
|
|
|
|
class Fold(Module):
    r"""Combines an array of sliding local blocks into a large containing tensor.

    Consider a batched :attr:`input` tensor containing sliding local blocks,
    e.g., patches of images, of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`,
    where :math:`N` is batch dimension, :math:`C \times \prod(\text{kernel\_size})`
    is the number of values within a block (a block has :math:`\prod(\text{kernel\_size})`
    spatial locations each containing a :math:`C`-channeled vector), and
    :math:`L` is the total number of blocks. (This is exactly the
    same specification as the output shape of :class:`~torch.nn.Unfold`.) This
    operation combines these local blocks into the large :attr:`output` tensor
    of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
    by summing the overlapping values. Similar to :class:`~torch.nn.Unfold`, the
    arguments must satisfy

    .. math::
        L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] %
            - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,

    where :math:`d` is over all spatial dimensions.

    * :attr:`output_size` describes the spatial shape of the large containing
      tensor of the sliding local blocks. It is useful to resolve the ambiguity
      when multiple input shapes map to same number of sliding blocks, e.g.,
      with ``stride > 0``.

    The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
    how the sliding blocks are retrieved.

    * :attr:`stride` controls the stride for the sliding blocks.

    * :attr:`padding` controls the amount of implicit zero-paddings on both
      sides for :attr:`padding` number of points for each dimension before
      reshaping.

    * :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.

    Args:
        output_size (int or tuple): the shape of the spatial dimensions of the
                                    output (i.e., ``output.sizes()[2:]``)
        kernel_size (int or tuple): the size of the sliding blocks
        dilation (int or tuple, optional): a parameter that controls the
                                           stride of elements within the
                                           neighborhood. Default: 1
        padding (int or tuple, optional): implicit zero padding to be added on
                                          both sides of input. Default: 0
        stride (int or tuple): the stride of the sliding blocks in the input
                               spatial dimensions. Default: 1

    * If :attr:`output_size`, :attr:`kernel_size`, :attr:`dilation`,
      :attr:`padding` or :attr:`stride` is an int or a tuple of length 1 then
      their values will be replicated across all spatial dimensions.

    * For the case of two output spatial dimensions this operation is sometimes
      called ``col2im``.

    .. note::
        :class:`~torch.nn.Fold` calculates each combined value in the resulting
        large tensor by summing all values from all containing blocks.
        :class:`~torch.nn.Unfold` extracts the values in the local blocks by
        copying from the large tensor. So, if the blocks overlap, they are not
        inverses of each other.

        In general, folding and unfolding operations are related as
        follows. Consider :class:`~torch.nn.Fold` and
        :class:`~torch.nn.Unfold` instances created with the same
        parameters:

        >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
        >>> fold = nn.Fold(output_size=..., **fold_params)
        >>> unfold = nn.Unfold(**fold_params)

        Then for any (supported) ``input`` tensor the following
        equality holds:

        ::

            fold(unfold(input)) == divisor * input

        where ``divisor`` is a tensor that depends only on the shape
        and dtype of the ``input``:

        >>> # xdoctest: +SKIP
        >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
        >>> divisor = fold(unfold(input_ones))

        When the ``divisor`` tensor contains no zero elements, then
        ``fold`` and ``unfold`` operations are inverses of each
        other (up to constant divisor).

    .. warning::
        Currently, only unbatched (3D) or batched (4D) image-like output tensors are supported.

    Shape:
        - Input: :math:`(N, C \times \prod(\text{kernel\_size}), L)` or :math:`(C \times \prod(\text{kernel\_size}), L)`
        - Output: :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
          or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above

    Examples::

        >>> fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 2))
        >>> input = torch.randn(1, 3 * 2 * 2, 12)
        >>> output = fold(input)
        >>> output.size()
        torch.Size([1, 3, 4, 5])

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md

    """

    __constants__ = ["output_size", "kernel_size", "dilation", "padding", "stride"]
    # Hyperparameters are stored exactly as given; int-to-tuple expansion
    # happens inside the functional call, not here.
    output_size: _size_any_t
    kernel_size: _size_any_t
    dilation: _size_any_t
    padding: _size_any_t
    stride: _size_any_t

    def __init__(
        self,
        output_size: _size_any_t,
        kernel_size: _size_any_t,
        dilation: _size_any_t = 1,
        padding: _size_any_t = 0,
        stride: _size_any_t = 1,
    ) -> None:
        super().__init__()
        # Record every sliding-block hyperparameter verbatim.
        self.output_size = output_size
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation

    def forward(self, input: Tensor) -> Tensor:
        """Combine the sliding blocks in ``input`` via :func:`torch.nn.functional.fold`."""
        return F.fold(
            input,
            self.output_size,
            self.kernel_size,
            dilation=self.dilation,
            padding=self.padding,
            stride=self.stride,
        )

    def extra_repr(self) -> str:
        """Return the hyperparameter summary shown in ``repr(module)``."""
        return (
            f"output_size={self.output_size}, kernel_size={self.kernel_size}, "
            f"dilation={self.dilation}, padding={self.padding}, stride={self.stride}"
        )
|
|
|
|
|
class Unfold(Module):
    r"""Extracts sliding local blocks from a batched input tensor.

    Consider a batched :attr:`input` tensor of shape :math:`(N, C, *)`,
    where :math:`N` is the batch dimension, :math:`C` is the channel dimension,
    and :math:`*` represent arbitrary spatial dimensions. This operation flattens
    each sliding :attr:`kernel_size`-sized block within the spatial dimensions
    of :attr:`input` into a column (i.e., last dimension) of a 3-D :attr:`output`
    tensor of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`, where
    :math:`C \times \prod(\text{kernel\_size})` is the total number of values
    within each block (a block has :math:`\prod(\text{kernel\_size})` spatial
    locations each containing a :math:`C`-channeled vector), and :math:`L` is
    the total number of such blocks:

    .. math::
        L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] %
            - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,

    where :math:`\text{spatial\_size}` is formed by the spatial dimensions
    of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial
    dimensions.

    Therefore, indexing :attr:`output` at the last dimension (column dimension)
    gives all values within a certain block.

    The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
    how the sliding blocks are retrieved.

    * :attr:`stride` controls the stride for the sliding blocks.

    * :attr:`padding` controls the amount of implicit zero-paddings on both
      sides for :attr:`padding` number of points for each dimension before
      reshaping.

    * :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.

    Args:
        kernel_size (int or tuple): the size of the sliding blocks
        dilation (int or tuple, optional): a parameter that controls the
                                           stride of elements within the
                                           neighborhood. Default: 1
        padding (int or tuple, optional): implicit zero padding to be added on
                                          both sides of input. Default: 0
        stride (int or tuple, optional): the stride of the sliding blocks in the input
                                         spatial dimensions. Default: 1

    * If :attr:`kernel_size`, :attr:`dilation`, :attr:`padding` or
      :attr:`stride` is an int or a tuple of length 1, their values will be
      replicated across all spatial dimensions.

    * For the case of two input spatial dimensions this operation is sometimes
      called ``im2col``.

    .. note::
        :class:`~torch.nn.Fold` calculates each combined value in the resulting
        large tensor by summing all values from all containing blocks.
        :class:`~torch.nn.Unfold` extracts the values in the local blocks by
        copying from the large tensor. So, if the blocks overlap, they are not
        inverses of each other.

        In general, folding and unfolding operations are related as
        follows. Consider :class:`~torch.nn.Fold` and
        :class:`~torch.nn.Unfold` instances created with the same
        parameters:

        >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
        >>> fold = nn.Fold(output_size=..., **fold_params)
        >>> unfold = nn.Unfold(**fold_params)

        Then for any (supported) ``input`` tensor the following
        equality holds:

        ::

            fold(unfold(input)) == divisor * input

        where ``divisor`` is a tensor that depends only on the shape
        and dtype of the ``input``:

        >>> # xdoctest: +SKIP
        >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
        >>> divisor = fold(unfold(input_ones))

        When the ``divisor`` tensor contains no zero elements, then
        ``fold`` and ``unfold`` operations are inverses of each
        other (up to constant divisor).

    .. warning::
        Currently, only 4-D input tensors (batched image-like tensors) are
        supported.

    Shape:
        - Input: :math:`(N, C, *)`
        - Output: :math:`(N, C \times \prod(\text{kernel\_size}), L)` as described above

    Examples::

        >>> unfold = nn.Unfold(kernel_size=(2, 3))
        >>> input = torch.randn(2, 5, 3, 4)
        >>> output = unfold(input)
        >>> # each patch contains 30 values (2x3=6 vectors, each of 5 channels)
        >>> # 4 blocks (2x3 kernels) in total in the 3x4 input
        >>> output.size()
        torch.Size([2, 30, 4])

        >>> # xdoctest: +IGNORE_WANT
        >>> # Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape)
        >>> inp = torch.randn(1, 3, 10, 12)
        >>> w = torch.randn(2, 3, 4, 5)
        >>> inp_unf = torch.nn.functional.unfold(inp, (4, 5))
        >>> out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2)
        >>> out = torch.nn.functional.fold(out_unf, (7, 8), (1, 1))
        >>> # or equivalently (and avoiding a copy),
        >>> # out = out_unf.view(1, 2, 7, 8)
        >>> (torch.nn.functional.conv2d(inp, w) - out).abs().max()
        tensor(1.9073e-06)

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md

    """

    __constants__ = ["kernel_size", "dilation", "padding", "stride"]
    # Hyperparameters are stored exactly as given; int-to-tuple expansion
    # happens inside the functional call, not here.
    kernel_size: _size_any_t
    dilation: _size_any_t
    padding: _size_any_t
    stride: _size_any_t

    def __init__(
        self,
        kernel_size: _size_any_t,
        dilation: _size_any_t = 1,
        padding: _size_any_t = 0,
        stride: _size_any_t = 1,
    ) -> None:
        super().__init__()
        # Record every sliding-block hyperparameter verbatim.
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation

    def forward(self, input: Tensor) -> Tensor:
        """Extract sliding blocks from ``input`` via :func:`torch.nn.functional.unfold`."""
        return F.unfold(
            input,
            self.kernel_size,
            dilation=self.dilation,
            padding=self.padding,
            stride=self.stride,
        )

    def extra_repr(self) -> str:
        """Return the hyperparameter summary shown in ``repr(module)``."""
        return (
            f"kernel_size={self.kernel_size}, dilation={self.dilation}, "
            f"padding={self.padding}, stride={self.stride}"
        )
|
|