DistributedDataParallel( (module): SHViT( (patch_embed): Sequential( (0): Conv2d_BN( (c): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (1): ReLU() (2): Conv2d_BN( (c): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (3): ReLU() (4): Conv2d_BN( (c): Conv2d(32, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (blocks1): Sequential( (0): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Identity() (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (1): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Identity() (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) ) (blocks2): Sequential( (0): Sequential( (0): Residual( (m): Conv2d_BN( (c): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (1): PatchMerging( (conv1): Conv2d_BN( (c): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (conv2): Conv2d_BN( (c): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=512, bias=False) (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (se): SEModule( (fc1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1)) (bn): Identity() (act): ReLU(inplace=True) (fc2): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1)) (gate): Sigmoid() ) (conv3): Conv2d_BN( (c): Conv2d(512, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): Sequential( (0): Residual( (m): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=308, bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(308, 616, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(616, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(616, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (3): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=308, bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 66, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(66, 98, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(98, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(308, 616, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(616, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(616, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (4): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=308, bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 66, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(66, 98, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(98, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(308, 616, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(616, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(616, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (5): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=308, bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 66, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(66, 98, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(98, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(308, 616, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(616, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(616, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (6): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=308, bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 66, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(66, 98, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(98, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(308, 616, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(616, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(616, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) ) (blocks3): Sequential( (0): Sequential( (0): Residual( (m): Conv2d_BN( (c): Conv2d(308, 308, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=308, bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(308, 616, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(616, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(616, 308, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(308, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (1): PatchMerging( (conv1): Conv2d_BN( (c): Conv2d(308, 1232, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(1232, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (conv2): Conv2d_BN( (c): Conv2d(1232, 1232, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1232, bias=False) (bn): BatchNorm2d(1232, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (se): SEModule( (fc1): Conv2d(1232, 312, kernel_size=(1, 1), stride=(1, 1)) (bn): Identity() (act): ReLU(inplace=True) (fc2): Conv2d(312, 1232, kernel_size=(1, 1), stride=(1, 1)) (gate): Sigmoid() ) (conv3): Conv2d_BN( (c): Conv2d(1232, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): Sequential( (0): Residual( (m): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=448, bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(448, 896, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(896, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(896, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (3): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=448, bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 96, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(96, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(448, 896, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(896, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(896, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (4): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=448, bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 96, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(96, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(448, 896, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(896, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(896, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (5): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=448, bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 96, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(96, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(448, 896, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(896, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(896, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (6): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=448, bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 96, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(96, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(448, 896, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(896, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(896, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (7): BasicBlock( (conv): Residual( (m): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=448, bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (mixer): Residual( (m): SHSA( (pre_norm): GroupNorm(1, 96, eps=1e-05, affine=True) (qkv): Conv2d_BN( (c): Conv2d(96, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (proj): Sequential( (0): ReLU() (1): Conv2d_BN( (c): Conv2d(448, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (ffn): Residual( (m): FFN( (pw1): Conv2d_BN( (c): Conv2d(448, 896, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(896, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (act): ReLU() (pw2): Conv2d_BN( (c): Conv2d(896, 448, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) ) (head): BN_Linear( (bn): BatchNorm1d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (l): Linear(in_features=448, out_features=10, bias=True) ) ) )