gabrielsemiceki9 committed on
Commit b72e09b · verified · 1 Parent(s): 5f7ebb7

Upload 125 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +2 -0
  2. activation.py +18 -0
  3. app_gradio.py +137 -0
  4. assets/biome_image.png +0 -0
  5. assets/sample_traj.gif +3 -0
  6. assets/teaser.gif +3 -0
  7. configs/img2lmdb.yaml +177 -0
  8. configs/landscape1m.yaml +175 -0
  9. configs/scenedreamer_inference.yaml +93 -0
  10. configs/scenedreamer_train.yaml +223 -0
  11. encoding.py +67 -0
  12. environment.yaml +44 -0
  13. gridencoder/__init__.py +1 -0
  14. gridencoder/backend.py +40 -0
  15. gridencoder/grid.py +224 -0
  16. gridencoder/setup.py +50 -0
  17. gridencoder/src/bindings.cpp +8 -0
  18. gridencoder/src/gridencoder.cu +478 -0
  19. gridencoder/src/gridencoder.h +15 -0
  20. imaginaire/__init__.py +4 -0
  21. imaginaire/config.py +238 -0
  22. imaginaire/discriminators/__init__.py +0 -0
  23. imaginaire/discriminators/gancraft.py +278 -0
  24. imaginaire/generators/__init__.py +4 -0
  25. imaginaire/generators/gancraft_base.py +603 -0
  26. imaginaire/generators/scenedreamer.py +851 -0
  27. imaginaire/generators/spade.py +571 -0
  28. imaginaire/layers/__init__.py +27 -0
  29. imaginaire/layers/activation_norm.py +629 -0
  30. imaginaire/layers/conv.py +1377 -0
  31. imaginaire/layers/misc.py +61 -0
  32. imaginaire/layers/non_local.py +88 -0
  33. imaginaire/layers/nonlinearity.py +65 -0
  34. imaginaire/layers/residual.py +1411 -0
  35. imaginaire/layers/residual_deep.py +346 -0
  36. imaginaire/layers/vit.py +204 -0
  37. imaginaire/layers/weight_norm.py +267 -0
  38. imaginaire/losses/__init__.py +18 -0
  39. imaginaire/losses/feature_matching.py +38 -0
  40. imaginaire/losses/gan.py +173 -0
  41. imaginaire/losses/info_nce.py +87 -0
  42. imaginaire/losses/kl.py +23 -0
  43. imaginaire/losses/perceptual.py +395 -0
  44. imaginaire/losses/weighted_mse.py +28 -0
  45. imaginaire/model_utils/__init__.py +4 -0
  46. imaginaire/model_utils/gancraft/camctl.py +679 -0
  47. imaginaire/model_utils/gancraft/gaugan_lbl2col.csv +182 -0
  48. imaginaire/model_utils/gancraft/gaugan_reduction.csv +182 -0
  49. imaginaire/model_utils/gancraft/id2name_gg.csv +680 -0
  50. imaginaire/model_utils/gancraft/loss.py +96 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/sample_traj.gif filter=lfs diff=lfs merge=lfs -text
+assets/teaser.gif filter=lfs diff=lfs merge=lfs -text
activation.py ADDED
@@ -0,0 +1,18 @@
import torch
from torch.autograd import Function
from torch.cuda.amp import custom_bwd, custom_fwd

class _trunc_exp(Function):
    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)  # cast to float32
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return torch.exp(x)

    @staticmethod
    @custom_bwd
    def backward(ctx, g):
        x = ctx.saved_tensors[0]
        return g * torch.exp(x.clamp(-15, 15))

trunc_exp = _trunc_exp.apply
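`trunc_exp` behaves like `torch.exp` in the forward pass (always evaluated in float32 under autocast) but clamps the input to [-15, 15] when computing the gradient, which keeps mixed-precision training from producing inf/NaN gradients on large activations. A minimal usage sketch (the tensor names are illustrative, not from the repo):

import torch
from activation import trunc_exp  # assuming this file is importable as `activation`

raw_density = torch.randn(4096, device='cuda', requires_grad=True)
density = trunc_exp(raw_density)   # forward: exact exp
density.sum().backward()           # backward: gradient uses exp(clamp(x, -15, 15))
assert torch.isfinite(raw_density.grad).all()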
app_gradio.py ADDED
@@ -0,0 +1,137 @@
import os
import torch
import torch.nn as nn
import importlib
import argparse
from imaginaire.config import Config
from imaginaire.utils.cudnn import init_cudnn
import gradio as gr
from PIL import Image


class WrappedModel(nn.Module):
    r"""Dummy wrapper around the module."""

    def __init__(self, module):
        super(WrappedModel, self).__init__()
        self.module = module

    def forward(self, *args, **kwargs):
        r"""PyTorch module forward function overload."""
        return self.module(*args, **kwargs)


def parse_args():
    parser = argparse.ArgumentParser(description='Training')
    parser.add_argument('--config', type=str, default='./configs/scenedreamer_inference.yaml',
                        help='Path to the training config file.')
    parser.add_argument('--checkpoint', default='./scenedreamer_released.pt',
                        help='Checkpoint path.')
    parser.add_argument('--output_dir', type=str, default='./test/',
                        help='Location to save the image outputs')
    parser.add_argument('--seed', type=int, default=8888,
                        help='Random seed.')
    args = parser.parse_args()
    return args


args = parse_args()
cfg = Config(args.config)

# Initialize cudnn.
init_cudnn(cfg.cudnn.deterministic, cfg.cudnn.benchmark)

# Initialize data loaders and models.
lib_G = importlib.import_module(cfg.gen.type)
net_G = lib_G.Generator(cfg.gen, cfg.data)
net_G = net_G.to('cuda')
net_G = WrappedModel(net_G)

if args.checkpoint == '':
    raise NotImplementedError("No checkpoint is provided for inference!")

# Load checkpoint.
# trainer.load_checkpoint(cfg, args.checkpoint)
checkpoint = torch.load(args.checkpoint, map_location='cpu')
net_G.load_state_dict(checkpoint['net_G'])

# Do inference.
net_G = net_G.module
net_G.eval()
for name, param in net_G.named_parameters():
    param.requires_grad = False
torch.cuda.empty_cache()
world_dir = os.path.join(args.output_dir)
os.makedirs(world_dir, exist_ok=True)


def get_bev(seed):
    print('[PCGGenerator] Generating BEV scene representation...')
    os.system('python terrain_generator.py --size {} --seed {} --outdir {}'.format(
        net_G.voxel.sample_size, seed, world_dir))
    heightmap_path = os.path.join(world_dir, 'heightmap.png')
    semantic_path = os.path.join(world_dir, 'colormap.png')
    heightmap = Image.open(heightmap_path)
    semantic = Image.open(semantic_path)
    return semantic, heightmap


def get_video(seed, num_frames, reso_h, reso_w):
    device = torch.device('cuda')
    rng_cuda = torch.Generator(device=device)
    rng_cuda = rng_cuda.manual_seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    net_G.voxel.next_world(device, world_dir, checkpoint)
    cam_mode = cfg.inference_args.camera_mode
    cfg.inference_args.cam_maxstep = num_frames
    cfg.inference_args.resolution_hw = [reso_h, reso_w]
    current_outdir = os.path.join(world_dir, 'camera_{:02d}'.format(cam_mode))
    os.makedirs(current_outdir, exist_ok=True)
    z = torch.empty(1, net_G.style_dims, dtype=torch.float32, device=device)
    z.normal_(generator=rng_cuda)
    net_G.inference_givenstyle(z, current_outdir, **vars(cfg.inference_args))
    return os.path.join(current_outdir, 'rgb_render.mp4')


markdown = f'''
# SceneDreamer: Unbounded 3D Scene Generation from 2D Image Collections

Authored by Zhaoxi Chen, Guangcong Wang, Ziwei Liu
### Useful links:
- [Official Github Repo](https://github.com/FrozenBurning/SceneDreamer)
- [Project Page](https://scene-dreamer.github.io/)
- [arXiv Link](https://arxiv.org/abs/2302.01330)
Licensed under the S-Lab License.
We provide a pre-sampled scene whose BEV maps are shown on the right. You can also press the "Generate BEV" button to randomly sample a new 3D world, represented by a height map and a semantic map, but note that this takes a while.

To render a video, press the "Render" button to generate a camera trajectory flying through the world. You can adjust the rendering options shown below!
'''

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(markdown)
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    semantic = gr.Image(value='./test/colormap.png', type="pil", shape=(512, 512))
                with gr.Column():
                    height = gr.Image(value='./test/heightmap.png', type="pil", shape=(512, 512))
            with gr.Row():
                # with gr.Column():
                #     image = gr.Image(type='pil', shape=(540, 960))
                with gr.Column():
                    video = gr.Video()
            with gr.Row():
                num_frames = gr.Slider(minimum=10, maximum=200, value=20, step=1, label='Number of rendered frames')
                user_seed = gr.Slider(minimum=0, maximum=999999, value=8888, step=1, label='Random seed')
                resolution_h = gr.Slider(minimum=256, maximum=2160, value=270, step=1, label='Height of rendered image')
                resolution_w = gr.Slider(minimum=256, maximum=3840, value=480, step=1, label='Width of rendered image')

            with gr.Row():
                btn = gr.Button(value="Generate BEV")
                btn_2 = gr.Button(value="Render")

    btn.click(get_bev, [user_seed], [semantic, height])
    btn_2.click(get_video, [user_seed, num_frames, resolution_h, resolution_w], [video])

demo.launch(debug=True)
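Given the argparse defaults above, the demo can presumably be launched from the repository root with `python app_gradio.py --checkpoint ./scenedreamer_released.pt` (the config path defaults to `./configs/scenedreamer_inference.yaml`).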
assets/biome_image.png ADDED
assets/sample_traj.gif ADDED

Git LFS Details

  • SHA256: 9bff3115871f1a78fbe237b44ba51e1bf3eb0578c2850815b0b71dd012c3be6a
  • Pointer size: 132 Bytes
  • Size of remote file: 6.58 MB
assets/teaser.gif ADDED

Git LFS Details

  • SHA256: 12c4e75a99a9a5dc17b89fc1c272fa994068bdd83e0636202f001e875f203a05
  • Pointer size: 133 Bytes
  • Size of remote file: 24.5 MB
configs/img2lmdb.yaml ADDED
@@ -0,0 +1,177 @@
1
+ inference_args:
2
+ random_style: True
3
+ use_fixed_random_style: False
4
+ keep_original_size: True
5
+
6
+ image_save_iter: 5000
7
+ snapshot_save_epoch: 5
8
+ max_epoch: 400
9
+ logging_iter: 100
10
+ trainer:
11
+ type: imaginaire.trainers.spade
12
+ model_average_config:
13
+ enabled: True
14
+ beta: 0.9999
15
+ start_iteration: 1000
16
+ num_batch_norm_estimation_iterations: 30
17
+ amp_config:
18
+ enabled: True
19
+ gan_mode: hinge
20
+ gan_relativistic: False
21
+ perceptual_loss:
22
+ mode: 'vgg19'
23
+ layers: ['relu_1_1', 'relu_2_1', 'relu_3_1', 'relu_4_1', 'relu_5_1']
24
+ weights: [0.03125, 0.0625, 0.125, 0.25, 1.0]
25
+ fp16: True
26
+ loss_weight:
27
+ gan: 1.0
28
+ perceptual: 10.0
29
+ feature_matching: 10.0
30
+ kl: 0.05
31
+ init:
32
+ type: xavier
33
+ gain: 0.02
34
+ gen_opt:
35
+ type: adam
36
+ lr: 0.0001
37
+ adam_beta1: 0.
38
+ adam_beta2: 0.999
39
+ lr_policy:
40
+ iteration_mode: False
41
+ type: step
42
+ step_size: 400
43
+ gamma: 0.1
44
+ dis_opt:
45
+ type: adam
46
+ lr: 0.0004
47
+ adam_beta1: 0.
48
+ adam_beta2: 0.999
49
+ lr_policy:
50
+ iteration_mode: False
51
+ type: step
52
+ step_size: 400
53
+ gamma: 0.1
54
+ gen:
55
+ type: imaginaire.generators.spade
56
+ version: v20
57
+ style_dims: 256
58
+ num_filters: 128
59
+ kernel_size: 3
60
+ weight_norm_type: 'spectral'
61
+ use_posenc_in_input_layer: False
62
+ global_adaptive_norm_type: 'sync_batch'
63
+ activation_norm_params:
64
+ num_filters: 128
65
+ kernel_size: 5
66
+ separate_projection: True
67
+ activation_norm_type: 'sync_batch'
68
+ style_enc:
69
+ num_filters: 64
70
+ kernel_size: 3
71
+ dis:
72
+ type: imaginaire.discriminators.spade
73
+ kernel_size: 4
74
+ num_filters: 128
75
+ max_num_filters: 512
76
+ num_discriminators: 2
77
+ num_layers: 5
78
+ activation_norm_type: 'none'
79
+ weight_norm_type: 'spectral'
80
+
81
+ # Data options.
82
+ data:
83
+ type: imaginaire.datasets.paired_images
84
+ # How many data loading workers per GPU?
85
+ num_workers: 8
86
+ input_types:
87
+ - images:
88
+ ext: jpg
89
+ num_channels: 3
90
+ normalize: True
91
+ use_dont_care: False
92
+ - seg_maps:
93
+ ext: jpg
94
+ num_channels: 1
95
+ is_mask: True
96
+ normalize: False
97
+ # - edge_maps:
98
+ # ext: png
99
+ # num_channels: 1
100
+ # normalize: False
101
+
102
+ full_data_ops: imaginaire.model_utils.label::make_one_hot, imaginaire.model_utils.label::concat_labels
103
+ use_dont_care: True
104
+ one_hot_num_classes:
105
+ seg_maps: 183
106
+ input_labels:
107
+ - seg_maps
108
+ # - edge_maps
109
+
110
+ # Which lmdb contains the ground truth image.
111
+ input_image:
112
+ - images
113
+
114
+ # Train dataset details.
115
+ train:
116
+ # Input LMDBs.
117
+ roots:
118
+ - ./data/lhq/train
119
+ # Batch size per GPU.
120
+ batch_size: 4
121
+ # Data augmentations to be performed in given order.
122
+ augmentations:
123
+ resize_smallest_side: 256
124
+ # Rotate in (-rotate, rotate) in degrees.
125
+ rotate: 0
126
+ # Scale image by factor \in [1, 1+random_scale_limit].
127
+ random_scale_limit: 0.2
128
+ # Horizontal flip?
129
+ horizontal_flip: True
130
+ # Crop size.
131
+ random_crop_h_w: 256, 256
132
+ # Validation dataset details.
133
+ val:
134
+ # Input LMDBs.
135
+ roots:
136
+ - ./data/lhq/val
137
+ # Batch size per GPU.
138
+ batch_size: 4
139
+ # Data augmentations to be performed in given order.
140
+ augmentations:
141
+ # Crop size.
142
+ resize_h_w: 256, 256
143
+
144
+ test_data:
145
+ type: imaginaire.datasets.paired_images
146
+ num_workers: 8
147
+ input_types:
148
+ - seg_maps:
149
+ ext: jpg
150
+ num_channels: 1
151
+ is_mask: True
152
+ normalize: False
153
+ # - edge_maps:
154
+ # ext: png
155
+ # num_channels: 1
156
+ # normalize: False
157
+
158
+ full_data_ops: imaginaire.model_utils.label::make_one_hot, imaginaire.model_utils.label::concat_labels
159
+ use_dont_care: True
160
+ one_hot_num_classes:
161
+ seg_maps: 183
162
+ input_labels:
163
+ - seg_maps
164
+ # - edge_maps
165
+
166
+ paired: True
167
+ # Test dataset details.
168
+ test:
169
+ is_lmdb: False
170
+ roots:
171
+ - ./data/lhq/train
172
+ # Batch size per GPU.
173
+ batch_size: 1
174
+ # If resize_h_w is not given, then it is assumed to be same as crop_h_w.
175
+ augmentations:
176
+ resize_h_w: 256, 256
177
+ horizontal_flip: False
configs/landscape1m.yaml ADDED
@@ -0,0 +1,175 @@
1
+ pretrained_weight: ./landscape1m-segformer.pt
2
+
3
+ inference_args:
4
+ random_style: True
5
+ use_fixed_random_style: False
6
+ keep_original_size: True
7
+
8
+ image_save_iter: 5000
9
+ snapshot_save_epoch: 5
10
+ snapshot_save_iter: 30000
11
+ max_epoch: 400
12
+ logging_iter: 100
13
+ trainer:
14
+ type: imaginaire.trainers.spade
15
+ model_average_config:
16
+ enabled: True
17
+ beta: 0.9999
18
+ start_iteration: 1000
19
+ num_batch_norm_estimation_iterations: 30
20
+ amp_config:
21
+ enabled: True
22
+ gan_mode: hinge
23
+ gan_relativistic: False
24
+ perceptual_loss:
25
+ mode: 'vgg19'
26
+ layers: ['relu_1_1', 'relu_2_1', 'relu_3_1', 'relu_4_1', 'relu_5_1']
27
+ weights: [0.03125, 0.0625, 0.125, 0.25, 1.0]
28
+ fp16: True
29
+ loss_weight:
30
+ gan: 1.0
31
+ perceptual: 10.0
32
+ feature_matching: 10.0
33
+ kl: 0.05
34
+ init:
35
+ type: xavier
36
+ gain: 0.02
37
+ gen_opt:
38
+ type: adam
39
+ lr: 0.0001
40
+ adam_beta1: 0.
41
+ adam_beta2: 0.999
42
+ lr_policy:
43
+ iteration_mode: False
44
+ type: step
45
+ step_size: 400
46
+ gamma: 0.1
47
+ dis_opt:
48
+ type: adam
49
+ lr: 0.0004
50
+ adam_beta1: 0.
51
+ adam_beta2: 0.999
52
+ lr_policy:
53
+ iteration_mode: False
54
+ type: step
55
+ step_size: 400
56
+ gamma: 0.1
57
+ gen:
58
+ type: imaginaire.generators.spade
59
+ version: v20
60
+ output_multiplier: 0.5
61
+ image_channels: 3
62
+ num_labels: 184
63
+ style_dims: 256
64
+ num_filters: 128
65
+ kernel_size: 3
66
+ weight_norm_type: 'spectral'
67
+ use_posenc_in_input_layer: False
68
+ global_adaptive_norm_type: 'sync_batch'
69
+ activation_norm_params:
70
+ num_filters: 128
71
+ kernel_size: 5
72
+ separate_projection: True
73
+ activation_norm_type: 'sync_batch'
74
+ style_enc:
75
+ num_filters: 64
76
+ kernel_size: 3
77
+ dis:
78
+ type: imaginaire.discriminators.spade
79
+ kernel_size: 4
80
+ num_filters: 128
81
+ max_num_filters: 512
82
+ num_discriminators: 2
83
+ num_layers: 5
84
+ activation_norm_type: 'none'
85
+ weight_norm_type: 'spectral'
86
+
87
+ # Data options.
88
+ data:
89
+ type: imaginaire.datasets.paired_images
90
+ # How many data loading workers per GPU?
91
+ num_workers: 8
92
+ input_types:
93
+ - images:
94
+ ext: jpg
95
+ num_channels: 3
96
+ normalize: True
97
+ use_dont_care: False
98
+ - seg_maps:
99
+ ext: jpg
100
+ num_channels: 1
101
+ is_mask: True
102
+ normalize: False
103
+
104
+ full_data_ops: imaginaire.model_utils.label::make_one_hot, imaginaire.model_utils.label::concat_labels
105
+ use_dont_care: True
106
+ one_hot_num_classes:
107
+ seg_maps: 183
108
+ input_labels:
109
+ - seg_maps
110
+
111
+ # Which lmdb contains the ground truth image.
112
+ input_image:
113
+ - images
114
+
115
+ # Train dataset details.
116
+ train:
117
+ # Input LMDBs.
118
+ dataset_type: lmdb
119
+ roots:
120
+ - ./data/lhq_lmdb/train
121
+ # Batch size per GPU.
122
+ batch_size: 4
123
+ # Data augmentations to be performed in given order.
124
+ augmentations:
125
+ resize_smallest_side: 512
126
+ # Rotate in (-rotate, rotate) in degrees.
127
+ rotate: 0
128
+ # Scale image by factor \in [1, 1+random_scale_limit].
129
+ random_scale_limit: 0.2
130
+ # Horizontal flip?
131
+ horizontal_flip: True
132
+ # Crop size.
133
+ random_crop_h_w: 512, 512
134
+ # Validation dataset details.
135
+ val:
136
+ dataset_type: lmdb
137
+ # Input LMDBs.
138
+ roots:
139
+ - ./data/lhq_lmdb/val
140
+ # Batch size per GPU.
141
+ batch_size: 4
142
+ # Data augmentations to be performed in given order.
143
+ augmentations:
144
+ # Crop size.
145
+ resize_h_w: 512, 512
146
+
147
+ test_data:
148
+ type: imaginaire.datasets.paired_images
149
+ num_workers: 8
150
+ input_types:
151
+ - seg_maps:
152
+ ext: jpg
153
+ num_channels: 1
154
+ is_mask: True
155
+ normalize: False
156
+
157
+ full_data_ops: imaginaire.model_utils.label::make_one_hot, imaginaire.model_utils.label::concat_labels
158
+ use_dont_care: True
159
+ one_hot_num_classes:
160
+ seg_maps: 183
161
+ input_labels:
162
+ - seg_maps
163
+
164
+ paired: True
165
+ # Test dataset details.
166
+ test:
167
+ is_lmdb: True
168
+ roots:
169
+ - ./data/lhq_lmdb/val
170
+ # Batch size per GPU.
171
+ batch_size: 1
172
+ # If resize_h_w is not given, then it is assumed to be same as crop_h_w.
173
+ augmentations:
174
+ resize_h_w: 256, 256
175
+ horizontal_flip: False
configs/scenedreamer_inference.yaml ADDED
@@ -0,0 +1,93 @@
inference_args:
    # 0: Camera orbiting the scene & looking at the center
    # 1: Camera orbiting the scene & zooming in
    # 2: Camera orbiting the scene & coming closer and closer to the center
    # 3: Similar to 2, but the camera orbits in the opposite direction
    # 4: Similar to 2, but the camera stays further away from the center
    # 5: Camera sits at the center and looks outwards
    # 6: Camera rises while looking down
    # 7: Camera really far away, looking down at a 45deg angle
    # 8: Camera for perpetual view generation, non-sliding window
    # 9: Camera for infinite world generation, sliding window
    camera_mode: 4

    cam_maxstep: 40
    resolution_hw: [540, 960]
    num_samples: 40
    cam_ang: 72

gen:
    type: imaginaire.generators.scenedreamer
    pcg_dataset_path: None
    pcg_cache: False
    scene_size: 2048

    blk_feat_dim: 64

    pe_lvl_feat: 4
    pe_incl_orig_feat: False
    pe_no_pe_feat_dim: 40
    pe_lvl_raydir: 0
    pe_incl_orig_raydir: False
    style_dims: 128 # Set to 0 to disable style.
    interm_style_dims: 256
    final_feat_dim: 64

    # Number of pixels removed from each edge to reduce the boundary artifact of the CNN,
    # both sides combined (8 -> 4 on the left and 4 on the right).
    pad: 6

    # ======== Sky network ========
    pe_lvl_raydir_sky: 5
    pe_incl_orig_raydir_sky: True

    # ======== Style Encoder =========
    # Comment out to disable the style encoder.
    style_enc:
        num_filters: 64
        kernel_size: 3
        weight_norm_type: 'none'

    stylenet_model: StyleMLP
    stylenet_model_kwargs:
        normalize_input: True
        num_layers: 5

    mlp_model: RenderMLP
    mlp_model_kwargs:
        use_seg: True

    # ======== Ray Casting Params ========
    num_blocks_early_stop: 6
    num_samples: 24 # The original model uses 24. Reduce to 4 to allow training on 12GB GPUs (with a significant performance penalty).
    sample_depth: 3 # Stop the ray after a certain depth.
    coarse_deterministic_sampling: False
    sample_use_box_boundaries: False # Include voxel boundaries in the samples.

    # ======== Blender ========
    raw_noise_std: 0.0
    dists_scale: 0.25
    clip_feat_map: True
    # Prevent the sky from leaking into the foreground.
    keep_sky_out: True
    keep_sky_out_avgpool: True
    sky_global_avgpool: True

    # ======== Label translator ========
    reduced_label_set: True
    use_label_smooth: True
    use_label_smooth_real: True
    use_label_smooth_pgt: True
    label_smooth_dia: 11

    # ======== Camera sampler ========
    camera_sampler_type: 'traditional'
    cam_res: [360, 640] # Camera resolution before cropping.
    crop_size: [256, 256] # The actual crop size is crop_size + pad. It should generally match random_crop_h_w in the dataloader.

    # Threshold for rejecting camera poses that would result in a seg mask with low entropy.
    # Generally, 0.5 min, 0.8 max.
    camera_min_entropy: 0.75

    # Threshold for rejecting camera poses that are too close to the objects.
    camera_rej_avg_depth: 2.0
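The fields under `inference_args` are exactly what `app_gradio.py` forwards to the generator: the config is loaded with `imaginaire.config.Config` and the (possibly overridden) arguments are splatted into `inference_givenstyle`. A minimal sketch of that flow, assuming a constructed `net_G` as in `app_gradio.py`:

from imaginaire.config import Config

cfg = Config('./configs/scenedreamer_inference.yaml')

# Override a few knobs before rendering, as the Gradio callbacks do.
cfg.inference_args.camera_mode = 9           # "infinite world generation, sliding window"
cfg.inference_args.cam_maxstep = 100         # number of frames along the trajectory
cfg.inference_args.resolution_hw = [540, 960]

# net_G.inference_givenstyle(z, output_dir, **vars(cfg.inference_args))  # as in app_gradio.py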
configs/scenedreamer_train.yaml ADDED
@@ -0,0 +1,223 @@
1
+ image_save_iter: 5000
2
+ snapshot_save_epoch: 5
3
+ snapshot_save_iter: 10000
4
+ max_epoch: 400
5
+ logging_iter: 10
6
+
7
+ trainer:
8
+ type: imaginaire.trainers.gancraft
9
+ model_average_config:
10
+ enabled: False
11
+ amp_config:
12
+ enabled: False
13
+ perceptual_loss:
14
+ mode: 'vgg19'
15
+ layers: ['relu_3_1', 'relu_4_1', 'relu_5_1']
16
+ weights: [0.125, 0.25, 1.0]
17
+ loss_weight:
18
+ l2: 10.0
19
+ gan: 0.5
20
+ pseudo_gan: 0.5
21
+ perceptual: 10.0
22
+ kl: 0.05
23
+ init:
24
+ type: xavier
25
+ gain: 0.02
26
+
27
+ # SPADE/GauGAN model for pseudo-GT generation.
28
+ gaugan_loader:
29
+ config: configs/landscape1m.yaml
30
+
31
+ image_to_tensorboard: True
32
+ distributed_data_parallel_params:
33
+ find_unused_parameters: False
34
+ broadcast_buffers: False
35
+
36
+ gen_opt:
37
+ type: adam
38
+ lr: 0.0001
39
+ eps: 1.e-7
40
+ adam_beta1: 0.
41
+ adam_beta2: 0.999
42
+ lr_policy:
43
+ iteration_mode: False
44
+ type: step
45
+ step_size: 400
46
+ gamma: 0.1
47
+ param_groups:
48
+ world_encoder:
49
+ lr: 0.0005
50
+ hash_encoder:
51
+ lr: 0.0001
52
+ render_net:
53
+ lr: 0.0001
54
+ sky_net:
55
+ lr: 0.0001
56
+ style_net:
57
+ lr: 0.0001
58
+ style_encoder:
59
+ lr: 0.0001
60
+ denoiser:
61
+ lr: 0.0001
62
+
63
+ dis_opt:
64
+ type: adam
65
+ lr: 0.0004
66
+ eps: 1.e-7
67
+ adam_beta1: 0.
68
+ adam_beta2: 0.999
69
+ lr_policy:
70
+ iteration_mode: False
71
+ type: step
72
+ step_size: 400
73
+ gamma: 0.1
74
+
75
+ gen:
76
+ type: imaginaire.generators.scenedreamer
77
+ pcg_dataset_path: ./data/terrain_cache
78
+ pcg_cache: True
79
+ scene_size: 2048
80
+
81
+ blk_feat_dim: 64
82
+
83
+ pe_lvl_feat: 4
84
+ pe_incl_orig_feat: False
85
+ pe_no_pe_feat_dim: 40
86
+ pe_lvl_raydir: 0
87
+ pe_incl_orig_raydir: False
88
+ style_dims: 128 # Set to 0 to disable style.
89
+ interm_style_dims: 256
90
+ final_feat_dim: 64
91
+
92
+ # Number of pixels removed from each edge to reduce boundary artifact of CNN
93
+ # both sides combined (8 -> 4 on left and 4 on right).
94
+ pad: 6
95
+
96
+ # ======== Sky network ========
97
+ pe_lvl_raydir_sky: 5
98
+ pe_incl_orig_raydir_sky: True
99
+
100
+ # ======== Style Encoder =========
101
+ # Comment out to disable style encoder.
102
+ style_enc:
103
+ num_filters: 64
104
+ kernel_size: 3
105
+ weight_norm_type: 'none'
106
+
107
+ stylenet_model: StyleMLP
108
+ stylenet_model_kwargs:
109
+ normalize_input: True
110
+ num_layers: 5
111
+
112
+ mlp_model: RenderMLP
113
+ mlp_model_kwargs:
114
+ use_seg: True
115
+
116
+ # ======== Ray Casting Params ========
117
+ num_blocks_early_stop: 6
118
+ num_samples: 24 # Decrease this if you run out of memory on a low-end GPU.
119
+ sample_depth: 3 # Stop the ray after certain depth
120
+ coarse_deterministic_sampling: False
121
+ sample_use_box_boundaries: False # Including voxel boundaries into the sample
122
+
123
+ # ======== Blender ========
124
+ raw_noise_std: 0.0
125
+ dists_scale: 0.25
126
+ clip_feat_map: True
127
+ # Prevent sky from leaking to the foreground.
128
+ keep_sky_out: True
129
+ keep_sky_out_avgpool: True
130
+ sky_global_avgpool: True
131
+
132
+ # ======== Label translator ========
133
+ reduced_label_set: True
134
+ use_label_smooth: True
135
+ use_label_smooth_real: True
136
+ use_label_smooth_pgt: True
137
+ label_smooth_dia: 11
138
+
139
+ # ======== Camera sampler ========
140
+ camera_sampler_type: 'traditional'
141
+ cam_res: [360, 640] # Camera resolution before cropping.
142
+ crop_size: [256, 256] # Actual crop size is crop_size+pad. It should generally match random_crop_h_w in dataloader.
143
+
144
+ # Threshold for rejecting camera poses that will result in a seg mask with low entropy.
145
+ # Generally, 0.5 min, 0.8 max.
146
+ camera_min_entropy: 0.75
147
+
148
+ # Threshold for rejecting camera poses that are too close to the objects.
149
+ camera_rej_avg_depth: 2.0
150
+
151
+ dis:
152
+ type: imaginaire.discriminators.gancraft
153
+ image_channels: 3
154
+ num_labels: 12 # Same as num_reduced_lbls.
155
+ use_label: True
156
+ num_filters: 128
157
+ fpse_kernel_size: 3
158
+ activation_norm_type: 'none'
159
+ weight_norm_type: spectral
160
+ smooth_resample: True
161
+
162
+ # Data options.
163
+ data:
164
+ type: imaginaire.datasets.paired_images
165
+ num_workers: 8
166
+ input_types:
167
+ - images:
168
+ ext: jpg
169
+ num_channels: 3
170
+ normalize: True
171
+ use_dont_care: False
172
+ - seg_maps:
173
+ ext: png
174
+ num_channels: 1
175
+ is_mask: True
176
+ normalize: False
177
+
178
+ full_data_ops: imaginaire.model_utils.label::make_one_hot, imaginaire.model_utils.label::concat_labels
179
+ use_dont_care: False
180
+ one_hot_num_classes:
181
+ seg_maps: 184
182
+ input_labels:
183
+ - seg_maps
184
+
185
+ # Which lmdb contains the ground truth image.
186
+ input_image:
187
+ - images
188
+
189
+ # Train dataset details.
190
+ train:
191
+ dataset_type: lmdb
192
+ # Input LMDBs.
193
+ roots:
194
+ - ./data/lhq_lmdb/train
195
+ # Batch size per GPU.
196
+ batch_size: 1
197
+ # Data augmentations to be performed in given order.
198
+ augmentations:
199
+ resize_smallest_side: 256
200
+ # Rotate in (-rotate, rotate) in degrees.
201
+ rotate: 0
202
+ # Scale image by factor \in [1, 1+random_scale_limit].
203
+ random_scale_limit: 0.2
204
+ # Horizontal flip?
205
+ horizontal_flip: True
206
+ # Crop size.
207
+ random_crop_h_w: 256, 256
208
+ # Validation dataset details.
209
+ val:
210
+ dataset_type: lmdb
211
+ # Input LMDBs.
212
+ roots:
213
+ - ./data/lhq_lmdb/val
214
+ # Batch size per GPU.
215
+ batch_size: 1
216
+ # Data augmentations to be performed in given order.
217
+ augmentations:
218
+ # Crop size.
219
+ resize_h_w: 256, 256
220
+
221
+ test_data:
222
+ type: imaginaire.datasets.dummy
223
+ num_workers: 0
encoding.py ADDED
@@ -0,0 +1,67 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class FreqEncoder(nn.Module):
    def __init__(self, input_dim, max_freq_log2, N_freqs,
                 log_sampling=True, include_input=True,
                 periodic_fns=(torch.sin, torch.cos)):

        super().__init__()

        self.input_dim = input_dim
        self.include_input = include_input
        self.periodic_fns = periodic_fns

        self.output_dim = 0
        if self.include_input:
            self.output_dim += self.input_dim

        self.output_dim += self.input_dim * N_freqs * len(self.periodic_fns)

        if log_sampling:
            self.freq_bands = 2. ** torch.linspace(0., max_freq_log2, N_freqs)
        else:
            self.freq_bands = torch.linspace(2. ** 0., 2. ** max_freq_log2, N_freqs)

        self.freq_bands = self.freq_bands.numpy().tolist()

    def forward(self, input, **kwargs):
        out = []
        if self.include_input:
            out.append(input)

        for i in range(len(self.freq_bands)):
            freq = self.freq_bands[i]
            for p_fn in self.periodic_fns:
                out.append(p_fn(input * freq))

        out = torch.cat(out, dim=-1)

        return out


def get_encoder(encoding, input_dim=3,
                multires=6,
                degree=4,
                num_levels=16, level_dim=2, base_resolution=16, log2_hashmap_size=19,
                desired_resolution=2048, align_corners=False,
                **kwargs):

    if encoding == 'None':
        return lambda x, **kwargs: x, input_dim

    elif encoding == 'hashgrid':
        from gridencoder import GridEncoder
        encoder = GridEncoder(input_dim=input_dim, num_levels=num_levels, level_dim=level_dim, base_resolution=base_resolution, log2_hashmap_size=log2_hashmap_size, desired_resolution=desired_resolution, gridtype='hash', align_corners=align_corners)

    elif encoding == 'tiledgrid':
        from gridencoder import GridEncoder
        encoder = GridEncoder(input_dim=input_dim, num_levels=num_levels, level_dim=level_dim, base_resolution=base_resolution, log2_hashmap_size=log2_hashmap_size, desired_resolution=desired_resolution, gridtype='tiled', align_corners=align_corners)

    elif encoding == 'varhashgrid':
        from gridencoder.grid import VarGridEncoder
        encoder = VarGridEncoder(input_dim=input_dim, num_levels=num_levels, level_dim=level_dim, base_resolution=base_resolution, log2_hashmap_size=log2_hashmap_size, desired_resolution=desired_resolution, gridtype='tiled', align_corners=align_corners, hash_entries=kwargs['hash_feat_dim'])

    else:
        raise NotImplementedError("Unknown encoding mode, choose from ['None', 'hashgrid', 'tiledgrid', 'varhashgrid']")

    return encoder, encoder.output_dim
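A small sketch of how `get_encoder` is meant to be called (shapes follow the comments in `gridencoder/grid.py` below; the `hashgrid` branch needs the compiled `gridencoder` CUDA extension and a GPU):

import torch
from encoding import get_encoder

encoder, feat_dim = get_encoder('hashgrid', input_dim=3, num_levels=16,
                                level_dim=2, desired_resolution=2048)
encoder = encoder.cuda()

xyz = torch.rand(8192, 3, device='cuda') * 2 - 1   # positions in [-1, 1]
feats = encoder(xyz, bound=1)                      # -> [8192, num_levels * level_dim]
print(feat_dim, feats.shape)                       # 32, torch.Size([8192, 32])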
environment.yaml ADDED
@@ -0,0 +1,44 @@
name: scenedreamer
channels:
  - pytorch
  - nvidia
dependencies:
  - python=3.9
  - pytorch=1.12.0
  - cudatoolkit=11.3
  - torchvision
  - pip
  - numpy
  - scipy
  - scikit-image
  - pip:
    - einops
    - noise
    - opencv-python
    - cmake
    - pynvml
    - Pillow>=8.3.2
    - tqdm==4.35.0
    - wget
    - cython
    - lmdb
    - av
    - opencv-python
    - opencv-contrib-python
    - imutils
    - imageio-ffmpeg
    - qimage2ndarray
    - albumentations
    - requests==2.25.1
    - nvidia-ml-py3==7.352.0
    - pyglet
    - timm
    - diskcache
    - boto3
    - awscli_plugin_endpoint
    - awscli
    - rsa
    - wandb
    - tensorboard
    - lpips
    - matplotlib
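This is a standard conda environment spec; it can presumably be instantiated with `conda env create -f environment.yaml` followed by `conda activate scenedreamer` (the `name:` field above).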
gridencoder/__init__.py ADDED
@@ -0,0 +1 @@
from .grid import GridEncoder
gridencoder/backend.py ADDED
@@ -0,0 +1,40 @@
import os
from torch.utils.cpp_extension import load

_src_path = os.path.dirname(os.path.abspath(__file__))

nvcc_flags = [
    '-O3', '-std=c++17',
    '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
]

if os.name == "posix":
    c_flags = ['-O3', '-std=c++17']
elif os.name == "nt":
    c_flags = ['/O2', '/std:c++17']

    # find cl.exe
    def find_cl_path():
        import glob
        for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
            paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
            if paths:
                return paths[0]

    # If cl.exe is not on path, try to find it.
    if os.system("where cl.exe >nul 2>nul") != 0:
        cl_path = find_cl_path()
        if cl_path is None:
            raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
        os.environ["PATH"] += ";" + cl_path

_backend = load(name='_grid_encoder',
                extra_cflags=c_flags,
                extra_cuda_cflags=nvcc_flags,
                sources=[os.path.join(_src_path, 'src', f) for f in [
                    'gridencoder.cu',
                    'bindings.cpp',
                ]],
                )

__all__ = ['_backend']
gridencoder/grid.py ADDED
@@ -0,0 +1,224 @@
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd

try:
    import _gridencoder as _backend
except ImportError:
    from .backend import _backend

_gridtype_to_id = {
    'hash': 0,
    'tiled': 1,
}

class _grid_encode(Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, inputs, embeddings, offsets, per_level_scale, base_resolution, calc_grad_inputs=False, gridtype=0, align_corners=False):
        # inputs: [B, D], float in [0, 1]
        # embeddings: [sO, C], float
        # offsets: [L + 1], int
        # RETURN: [B, F], float

        inputs = inputs.contiguous()

        B, D = inputs.shape  # batch size, coord dim
        L = offsets.shape[0] - 1  # level
        C = embeddings.shape[1]  # embedding dim for each level
        S = np.log2(per_level_scale)  # resolution multiplier at each level, apply log2 for later CUDA exp2f
        H = base_resolution  # base resolution

        # manually handle autocast (only use half precision embeddings, inputs must be float for enough precision)
        # if C % 2 != 0, force float, since half for atomicAdd is very slow.
        if torch.is_autocast_enabled() and C % 2 == 0:
            embeddings = embeddings.to(torch.half)

        # L first, optimize cache for cuda kernel, but needs an extra permute later
        outputs = torch.empty(L, B, C, device=inputs.device, dtype=embeddings.dtype)

        if calc_grad_inputs:
            dy_dx = torch.empty(B, L * D * C, device=inputs.device, dtype=embeddings.dtype)
        else:
            dy_dx = torch.empty(1, device=inputs.device, dtype=embeddings.dtype)  # placeholder... TODO: a better way?

        _backend.grid_encode_forward(inputs, embeddings, offsets, outputs, B, D, C, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners)

        # permute back to [B, L * C]
        outputs = outputs.permute(1, 0, 2).reshape(B, L * C)

        ctx.save_for_backward(inputs, embeddings, offsets, dy_dx)
        ctx.dims = [B, D, C, L, S, H, gridtype]
        ctx.calc_grad_inputs = calc_grad_inputs
        ctx.align_corners = align_corners

        return outputs

    @staticmethod
    #@once_differentiable
    @custom_bwd
    def backward(ctx, grad):

        inputs, embeddings, offsets, dy_dx = ctx.saved_tensors
        B, D, C, L, S, H, gridtype = ctx.dims
        calc_grad_inputs = ctx.calc_grad_inputs
        align_corners = ctx.align_corners

        # grad: [B, L * C] --> [L, B, C]
        grad = grad.view(B, L, C).permute(1, 0, 2).contiguous()

        grad_embeddings = torch.zeros_like(embeddings)

        if calc_grad_inputs:
            grad_inputs = torch.zeros_like(inputs, dtype=embeddings.dtype)
        else:
            grad_inputs = torch.zeros(1, device=inputs.device, dtype=embeddings.dtype)

        _backend.grid_encode_backward(grad, inputs, embeddings, offsets, grad_embeddings, B, D, C, L, S, H, calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners)

        if calc_grad_inputs:
            grad_inputs = grad_inputs.to(inputs.dtype)
            return grad_inputs, grad_embeddings, None, None, None, None, None, None
        else:
            return None, grad_embeddings, None, None, None, None, None, None


grid_encode = _grid_encode.apply


class GridEncoder(nn.Module):
    def __init__(self, input_dim=3, num_levels=16, level_dim=2, per_level_scale=2, base_resolution=16, log2_hashmap_size=19, desired_resolution=None, gridtype='hash', align_corners=False):
        super().__init__()

        # the finest resolution desired at the last level; if provided, it overrides per_level_scale
        if desired_resolution is not None:
            per_level_scale = np.exp2(np.log2(desired_resolution / base_resolution) / (num_levels - 1))

        self.input_dim = input_dim  # coord dims, 2 or 3
        self.num_levels = num_levels  # num levels, each level multiplies the resolution by per_level_scale
        self.level_dim = level_dim  # encode channels per level
        self.per_level_scale = per_level_scale  # multiply resolution by this scale at each level.
        self.log2_hashmap_size = log2_hashmap_size
        self.base_resolution = base_resolution
        self.output_dim = num_levels * level_dim
        self.gridtype = gridtype
        self.gridtype_id = _gridtype_to_id[gridtype]  # "tiled" or "hash"
        self.align_corners = align_corners

        # allocate parameters
        offsets = []
        offset = 0
        self.max_params = 2 ** log2_hashmap_size
        for i in range(num_levels):
            resolution = int(np.ceil(base_resolution * per_level_scale ** i))
            params_in_level = min(self.max_params, (resolution if align_corners else resolution + 1) ** input_dim)  # limit max number
            params_in_level = int(np.ceil(params_in_level / 8) * 8)  # make divisible
            offsets.append(offset)
            offset += params_in_level
        offsets.append(offset)
        offsets = torch.from_numpy(np.array(offsets, dtype=np.int32))
        self.register_buffer('offsets', offsets)

        self.n_params = offsets[-1] * level_dim

        # parameters
        self.embeddings = nn.Parameter(torch.empty(offset, level_dim))

        self.reset_parameters()

    def reset_parameters(self):
        std = 1e-4
        self.embeddings.data.uniform_(-std, std)

    def __repr__(self):
        return f"GridEncoder: input_dim={self.input_dim} num_levels={self.num_levels} level_dim={self.level_dim} resolution={self.base_resolution} -> {int(round(self.base_resolution * self.per_level_scale ** (self.num_levels - 1)))} per_level_scale={self.per_level_scale:.4f} params={tuple(self.embeddings.shape)} gridtype={self.gridtype} align_corners={self.align_corners}"

    def forward(self, inputs, bound=1):
        # inputs: [..., input_dim], normalized real world positions in [-bound, bound]
        # return: [..., num_levels * level_dim]

        inputs = (inputs + bound) / (2 * bound)  # map to [0, 1]

        #print('inputs', inputs.shape, inputs.dtype, inputs.min().item(), inputs.max().item())

        prefix_shape = list(inputs.shape[:-1])
        inputs = inputs.view(-1, self.input_dim)

        outputs = grid_encode(inputs, self.embeddings, self.offsets, self.per_level_scale, self.base_resolution, inputs.requires_grad, self.gridtype_id, self.align_corners)
        outputs = outputs.view(prefix_shape + [self.output_dim])

        #print('outputs', outputs.shape, outputs.dtype, outputs.min().item(), outputs.max().item())

        return outputs


class VarGridEncoder(nn.Module):
    def __init__(self, input_dim=3, num_levels=16, level_dim=2, per_level_scale=2, base_resolution=16, log2_hashmap_size=19, desired_resolution=None, gridtype='hash', align_corners=False, hash_entries=None):
        super().__init__()

        # the finest resolution desired at the last level; if provided, it overrides per_level_scale
        if desired_resolution is not None:
            per_level_scale = np.exp2(np.log2(desired_resolution / base_resolution) / (num_levels - 1))

        self.input_dim = input_dim  # coord dims, 2 or 3
        self.num_levels = num_levels  # num levels, each level multiplies the resolution by per_level_scale
        self.level_dim = level_dim  # encode channels per level
        self.per_level_scale = per_level_scale  # multiply resolution by this scale at each level.
        self.log2_hashmap_size = log2_hashmap_size
        self.base_resolution = base_resolution
        self.output_dim = num_levels * level_dim
        self.gridtype = gridtype
        self.gridtype_id = _gridtype_to_id[gridtype]  # "tiled" or "hash"
        self.align_corners = align_corners

        # allocate parameters
        offsets = []
        offset = 0
        self.max_params = 2 ** log2_hashmap_size
        for i in range(num_levels):
            resolution = int(np.ceil(base_resolution * per_level_scale ** i))
            params_in_level = min(self.max_params, (resolution if align_corners else resolution + 1) ** input_dim)  # limit max number
            params_in_level = int(np.ceil(params_in_level / 8) * 8)  # make divisible
            offsets.append(offset)
            offset += params_in_level
        offsets.append(offset)
        offsets = torch.from_numpy(np.array(offsets, dtype=np.int32))
        self.register_buffer('offsets', offsets)

        self.n_params = offsets[-1] * level_dim
        self.level_dim = level_dim
        self.offset = offset

        # parameters: the first `hash_entries` rows of the table are supplied externally in forward().
        self.embeddings = nn.Parameter(torch.empty(offset - hash_entries, level_dim))

        self.reset_parameters()

    def reset_parameters(self):
        std = 1e-4
        self.embeddings.data.uniform_(-std, std)

    def __repr__(self):
        return f"GridEncoder: input_dim={self.input_dim} num_levels={self.num_levels} level_dim={self.level_dim} resolution={self.base_resolution} -> {int(round(self.base_resolution * self.per_level_scale ** (self.num_levels - 1)))} per_level_scale={self.per_level_scale:.4f} params={tuple(self.embeddings.shape)} gridtype={self.gridtype} align_corners={self.align_corners}"

    def forward(self, inputs, embeddings, bound=1):
        # inputs: [..., input_dim], normalized real world positions in [-bound, bound]
        # embeddings: externally provided hash-table rows, placed in front of this module's own rows
        # return: [..., num_levels * level_dim]
        input_embeddings = torch.cat([embeddings, self.embeddings], dim=0)

        inputs = (inputs + bound) / (2 * bound)  # map to [0, 1]

        #print('inputs', inputs.shape, inputs.dtype, inputs.min().item(), inputs.max().item())

        prefix_shape = list(inputs.shape[:-1])
        inputs = inputs.view(-1, self.input_dim)

        outputs = grid_encode(inputs, input_embeddings, self.offsets, self.per_level_scale, self.base_resolution, inputs.requires_grad, self.gridtype_id, self.align_corners)
        outputs = outputs.view(prefix_shape + [self.output_dim])

        #print('outputs', outputs.shape, outputs.dtype, outputs.min().item(), outputs.max().item())

        return outputs
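`VarGridEncoder` appears to be the project-specific variant of `GridEncoder`: the first `hash_entries` rows of the hash table are not parameters of the encoder but are passed into `forward` and concatenated in front of the module's own rows. A minimal sketch under that reading (sizes are illustrative; requires the compiled CUDA extension and a GPU):

import torch
from gridencoder.grid import VarGridEncoder

hash_feat_dim = 4096  # number of externally supplied hash-table rows (illustrative)
enc = VarGridEncoder(input_dim=3, num_levels=16, level_dim=2,
                     desired_resolution=2048, gridtype='tiled',
                     hash_entries=hash_feat_dim).cuda()

# Rows produced elsewhere occupy the first part of the table.
external_rows = torch.randn(hash_feat_dim, 2, device='cuda')

xyz = torch.rand(1024, 3, device='cuda') * 2 - 1   # positions in [-1, 1]
feats = enc(xyz, external_rows, bound=1)           # -> [1024, 32]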
gridencoder/setup.py ADDED
@@ -0,0 +1,50 @@
import os
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

_src_path = os.path.dirname(os.path.abspath(__file__))

nvcc_flags = [
    '-O3', '-std=c++17',
    '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
]

if os.name == "posix":
    c_flags = ['-O3', '-std=c++17']
elif os.name == "nt":
    c_flags = ['/O2', '/std:c++17']

    # find cl.exe
    def find_cl_path():
        import glob
        for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
            paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
            if paths:
                return paths[0]

    # If cl.exe is not on path, try to find it.
    if os.system("where cl.exe >nul 2>nul") != 0:
        cl_path = find_cl_path()
        if cl_path is None:
            raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
        os.environ["PATH"] += ";" + cl_path

setup(
    name='gridencoder',  # package name, import this to use python API
    ext_modules=[
        CUDAExtension(
            name='_gridencoder',  # extension name, import this to use CUDA API
            sources=[os.path.join(_src_path, 'src', f) for f in [
                'gridencoder.cu',
                'bindings.cpp',
            ]],
            extra_compile_args={
                'cxx': c_flags,
                'nvcc': nvcc_flags,
            }
        ),
    ],
    cmdclass={
        'build_ext': BuildExtension,
    }
)
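Either build path should work: `pip install ./gridencoder` (or `python setup.py install` from inside `gridencoder/`) produces the `_gridencoder` extension that `grid.py` tries to import first, while `backend.py` above is the JIT fallback that compiles the same sources on first import.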
gridencoder/src/bindings.cpp ADDED
@@ -0,0 +1,8 @@
#include <torch/extension.h>

#include "gridencoder.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("grid_encode_forward", &grid_encode_forward, "grid_encode_forward (CUDA)");
    m.def("grid_encode_backward", &grid_encode_backward, "grid_encode_backward (CUDA)");
}
gridencoder/src/gridencoder.cu ADDED
@@ -0,0 +1,478 @@
1
+ #include <cuda.h>
2
+ #include <cuda_fp16.h>
3
+ #include <cuda_runtime.h>
4
+
5
+ #include <ATen/cuda/CUDAContext.h>
6
+ #include <torch/torch.h>
7
+
8
+ #include <algorithm>
9
+ #include <stdexcept>
10
+
11
+ #include <stdint.h>
12
+ #include <cstdio>
13
+
14
+
15
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
16
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
17
+ #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
18
+ #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor")
19
+
20
+
21
+ // just for compatibility of half precision in AT_DISPATCH_FLOATING_TYPES_AND_HALF...
22
+ static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
23
+ // requires CUDA >= 10 and ARCH >= 70
24
+ // this is very slow compared to float or __half2, and never used.
25
+ //return atomicAdd(reinterpret_cast<__half*>(address), val);
26
+ }
27
+
28
+
29
+ template <typename T>
30
+ static inline __host__ __device__ T div_round_up(T val, T divisor) {
31
+ return (val + divisor - 1) / divisor;
32
+ }
33
+
34
+
35
+ template <uint32_t D>
36
+ __device__ uint32_t fast_hash(const uint32_t pos_grid[D]) {
37
+ static_assert(D <= 7, "fast_hash can only hash up to 7 dimensions.");
38
+
39
+ // While 1 is technically not a good prime for hashing (or a prime at all), it helps memory coherence
40
+ // and is sufficient for our use case of obtaining a uniformly colliding index from high-dimensional
41
+ // coordinates.
42
+ constexpr uint32_t primes[7] = { 1, 2654435761, 805459861, 3674653429, 2097192037, 1434869437, 2165219737 };
43
+
44
+ uint32_t result = 0;
45
+ #pragma unroll
46
+ for (uint32_t i = 0; i < D; ++i) {
47
+ result ^= pos_grid[i] * primes[i];
48
+ }
49
+
50
+ return result;
51
+ }
52
+
53
+
54
+ template <uint32_t D, uint32_t C>
55
+ __device__ uint32_t get_grid_index(const uint32_t gridtype, const bool align_corners, const uint32_t ch, const uint32_t hashmap_size, const uint32_t resolution, const uint32_t pos_grid[D]) {
56
+ uint32_t stride = 1;
57
+ uint32_t index = 0;
58
+
59
+ #pragma unroll
60
+ for (uint32_t d = 0; d < D && stride <= hashmap_size; d++) {
61
+ index += pos_grid[d] * stride;
62
+ stride *= align_corners ? resolution: (resolution + 1);
63
+ }
64
+
65
+ // NOTE: for NeRF, the hash is in fact not necessary. Check https://github.com/NVlabs/instant-ngp/issues/97.
66
+ // gridtype: 0 == hash, 1 == tiled
67
+ if (gridtype == 0 && stride > hashmap_size) {
68
+ index = fast_hash<D>(pos_grid);
69
+ }
70
+
71
+ return (index % hashmap_size) * C + ch;
72
+ }
73
+
74
+
75
+ template <typename scalar_t, uint32_t D, uint32_t C>
76
+ __global__ void kernel_grid(
77
+ const float * __restrict__ inputs,
78
+ const scalar_t * __restrict__ grid,
79
+ const int * __restrict__ offsets,
80
+ scalar_t * __restrict__ outputs,
81
+ const uint32_t B, const uint32_t L, const float S, const uint32_t H,
82
+ const bool calc_grad_inputs,
83
+ scalar_t * __restrict__ dy_dx,
84
+ const uint32_t gridtype,
85
+ const bool align_corners
86
+ ) {
87
+ const uint32_t b = blockIdx.x * blockDim.x + threadIdx.x;
88
+
89
+ if (b >= B) return;
90
+
91
+ const uint32_t level = blockIdx.y;
92
+
93
+ // locate
94
+ grid += (uint32_t)offsets[level] * C;
95
+ inputs += b * D;
96
+ outputs += level * B * C + b * C;
97
+
98
+ // check input range (should be in [0, 1])
99
+ bool flag_oob = false;
100
+ #pragma unroll
101
+ for (uint32_t d = 0; d < D; d++) {
102
+ if (inputs[d] < 0 || inputs[d] > 1) {
103
+ flag_oob = true;
104
+ }
105
+ }
106
+ // if input out of bound, just set output to 0
107
+ if (flag_oob) {
108
+ #pragma unroll
109
+ for (uint32_t ch = 0; ch < C; ch++) {
110
+ outputs[ch] = 0;
111
+ }
112
+ if (calc_grad_inputs) {
113
+ dy_dx += b * D * L * C + level * D * C; // B L D C
114
+ #pragma unroll
115
+ for (uint32_t d = 0; d < D; d++) {
116
+ #pragma unroll
117
+ for (uint32_t ch = 0; ch < C; ch++) {
118
+ dy_dx[d * C + ch] = 0;
119
+ }
120
+ }
121
+ }
122
+ return;
123
+ }
124
+
125
+ const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
126
+ const float scale = exp2f(level * S) * H - 1.0f;
127
+ const uint32_t resolution = (uint32_t)ceil(scale) + 1;
128
+
129
+ // calculate coordinate
130
+ float pos[D];
131
+ uint32_t pos_grid[D];
132
+
133
+ #pragma unroll
134
+ for (uint32_t d = 0; d < D; d++) {
135
+ pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
136
+ pos_grid[d] = floorf(pos[d]);
137
+ pos[d] -= (float)pos_grid[d];
138
+ }
139
+
140
+ //printf("[b=%d, l=%d] pos=(%f, %f)+(%d, %d)\n", b, level, pos[0], pos[1], pos_grid[0], pos_grid[1]);
141
+
142
+ // interpolate
143
+ scalar_t results[C] = {0}; // temp results in register
144
+
145
+ #pragma unroll
146
+ for (uint32_t idx = 0; idx < (1 << D); idx++) {
147
+ float w = 1;
148
+ uint32_t pos_grid_local[D];
149
+
150
+ #pragma unroll
151
+ for (uint32_t d = 0; d < D; d++) {
152
+ if ((idx & (1 << d)) == 0) {
153
+ w *= 1 - pos[d];
154
+ pos_grid_local[d] = pos_grid[d];
155
+ } else {
156
+ w *= pos[d];
157
+ pos_grid_local[d] = pos_grid[d] + 1;
158
+ }
159
+ }
160
+
161
+ uint32_t index = get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
162
+
163
+ // writing to register (fast)
164
+ #pragma unroll
165
+ for (uint32_t ch = 0; ch < C; ch++) {
166
+ results[ch] += w * grid[index + ch];
167
+ }
168
+
169
+ //printf("[b=%d, l=%d] int %d, idx %d, w %f, val %f\n", b, level, idx, index, w, grid[index]);
170
+ }
171
+
172
+ // writing to global memory (slow)
173
+ #pragma unroll
174
+ for (uint32_t ch = 0; ch < C; ch++) {
175
+ outputs[ch] = results[ch];
176
+ }
177
+
178
+ // prepare dy_dx for calc_grad_inputs
179
+ // differentiable (soft) indexing: https://discuss.pytorch.org/t/differentiable-indexing/17647/9
180
+ if (calc_grad_inputs) {
181
+
182
+ dy_dx += b * D * L * C + level * D * C; // B L D C
183
+
184
+ #pragma unroll
185
+ for (uint32_t gd = 0; gd < D; gd++) {
186
+
187
+ scalar_t results_grad[C] = {0};
188
+
189
+ #pragma unroll
190
+ for (uint32_t idx = 0; idx < (1 << (D - 1)); idx++) {
191
+ float w = scale;
192
+ uint32_t pos_grid_local[D];
193
+
194
+ #pragma unroll
195
+ for (uint32_t nd = 0; nd < D - 1; nd++) {
196
+ const uint32_t d = (nd >= gd) ? (nd + 1) : nd;
197
+
198
+ if ((idx & (1 << nd)) == 0) {
199
+ w *= 1 - pos[d];
200
+ pos_grid_local[d] = pos_grid[d];
201
+ } else {
202
+ w *= pos[d];
203
+ pos_grid_local[d] = pos_grid[d] + 1;
204
+ }
205
+ }
206
+
207
+ pos_grid_local[gd] = pos_grid[gd];
208
+ uint32_t index_left = get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
209
+ pos_grid_local[gd] = pos_grid[gd] + 1;
210
+ uint32_t index_right = get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
211
+
212
+ #pragma unroll
213
+ for (uint32_t ch = 0; ch < C; ch++) {
214
+ results_grad[ch] += w * (grid[index_right + ch] - grid[index_left + ch]);
215
+ }
216
+ }
217
+
218
+ #pragma unroll
219
+ for (uint32_t ch = 0; ch < C; ch++) {
220
+ dy_dx[gd * C + ch] = results_grad[ch];
221
+ }
222
+ }
223
+ }
224
+ }
225
+
226
+
227
+ template <typename scalar_t, uint32_t D, uint32_t C, uint32_t N_C>
228
+ __global__ void kernel_grid_backward(
229
+ const scalar_t * __restrict__ grad,
230
+ const float * __restrict__ inputs,
231
+ const scalar_t * __restrict__ grid,
232
+ const int * __restrict__ offsets,
233
+ scalar_t * __restrict__ grad_grid,
234
+ const uint32_t B, const uint32_t L, const float S, const uint32_t H,
235
+ const uint32_t gridtype,
236
+ const bool align_corners
237
+ ) {
238
+ const uint32_t b = (blockIdx.x * blockDim.x + threadIdx.x) * N_C / C;
239
+ if (b >= B) return;
240
+
241
+ const uint32_t level = blockIdx.y;
242
+ const uint32_t ch = (blockIdx.x * blockDim.x + threadIdx.x) * N_C - b * C;
243
+
244
+ // locate
245
+ grad_grid += offsets[level] * C;
246
+ inputs += b * D;
247
+ grad += level * B * C + b * C + ch; // L, B, C
248
+
249
+ const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
250
+ const float scale = exp2f(level * S) * H - 1.0f;
251
+ const uint32_t resolution = (uint32_t)ceil(scale) + 1;
252
+
253
+ // check input range (should be in [0, 1])
254
+ #pragma unroll
255
+ for (uint32_t d = 0; d < D; d++) {
256
+ if (inputs[d] < 0 || inputs[d] > 1) {
257
+ return; // grad is init as 0, so we simply return.
258
+ }
259
+ }
260
+
261
+ // calculate coordinate
262
+ float pos[D];
263
+ uint32_t pos_grid[D];
264
+
265
+ #pragma unroll
266
+ for (uint32_t d = 0; d < D; d++) {
267
+ pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
268
+ pos_grid[d] = floorf(pos[d]);
269
+ pos[d] -= (float)pos_grid[d];
270
+ }
271
+
272
+ scalar_t grad_cur[N_C] = {0}; // fetch to register
273
+ #pragma unroll
274
+ for (uint32_t c = 0; c < N_C; c++) {
275
+ grad_cur[c] = grad[c];
276
+ }
277
+
278
+ // interpolate
279
+ #pragma unroll
280
+ for (uint32_t idx = 0; idx < (1 << D); idx++) {
281
+ float w = 1;
282
+ uint32_t pos_grid_local[D];
283
+
284
+ #pragma unroll
285
+ for (uint32_t d = 0; d < D; d++) {
286
+ if ((idx & (1 << d)) == 0) {
287
+ w *= 1 - pos[d];
288
+ pos_grid_local[d] = pos_grid[d];
289
+ } else {
290
+ w *= pos[d];
291
+ pos_grid_local[d] = pos_grid[d] + 1;
292
+ }
293
+ }
294
+
295
+ uint32_t index = get_grid_index<D, C>(gridtype, align_corners, ch, hashmap_size, resolution, pos_grid_local);
296
+
297
+ // atomicAdd for __half is slow (especially for large values), so we use __half2 if N_C % 2 == 0
298
+ // TODO: use float which is better than __half, if N_C % 2 != 0
299
+ if (std::is_same<scalar_t, at::Half>::value && N_C % 2 == 0) {
300
+ #pragma unroll
301
+ for (uint32_t c = 0; c < N_C; c += 2) {
302
+ // process two __half at once (by interpreting as a __half2)
303
+ __half2 v = {(__half)(w * grad_cur[c]), (__half)(w * grad_cur[c + 1])};
304
+ atomicAdd((__half2*)&grad_grid[index + c], v);
305
+ }
306
+ // float, or __half when N_C % 2 != 0 (which means C == 1)
307
+ } else {
308
+ #pragma unroll
309
+ for (uint32_t c = 0; c < N_C; c++) {
310
+ atomicAdd(&grad_grid[index + c], w * grad_cur[c]);
311
+ }
312
+ }
313
+ }
314
+ }
315
+
316
+
317
+ template <typename scalar_t, uint32_t D, uint32_t C>
318
+ __global__ void kernel_input_backward(
319
+ const scalar_t * __restrict__ grad,
320
+ const scalar_t * __restrict__ dy_dx,
321
+ scalar_t * __restrict__ grad_inputs,
322
+ uint32_t B, uint32_t L
323
+ ) {
324
+ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x;
325
+ if (t >= B * D) return;
326
+
327
+ const uint32_t b = t / D;
328
+ const uint32_t d = t - b * D;
329
+
330
+ dy_dx += b * L * D * C;
331
+
332
+ scalar_t result = 0;
333
+
334
+ # pragma unroll
335
+ for (int l = 0; l < L; l++) {
336
+ # pragma unroll
337
+ for (int ch = 0; ch < C; ch++) {
338
+ result += grad[l * B * C + b * C + ch] * dy_dx[l * D * C + d * C + ch];
339
+ }
340
+ }
341
+
342
+ grad_inputs[t] = result;
343
+ }
344
+
345
+
346
+ template <typename scalar_t, uint32_t D>
347
+ void kernel_grid_wrapper(const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *outputs, const uint32_t B, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, scalar_t *dy_dx, const uint32_t gridtype, const bool align_corners) {
348
+ static constexpr uint32_t N_THREAD = 512;
349
+ const dim3 blocks_hashgrid = { div_round_up(B, N_THREAD), L, 1 };
350
+ switch (C) {
351
+ case 1: kernel_grid<scalar_t, D, 1><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
352
+ case 2: kernel_grid<scalar_t, D, 2><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
353
+ case 4: kernel_grid<scalar_t, D, 4><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
354
+ case 8: kernel_grid<scalar_t, D, 8><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
355
+ default: throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
356
+ }
357
+ }
358
+
359
+ // inputs: [B, D], float, in [0, 1]
360
+ // embeddings: [sO, C], float
361
+ // offsets: [L + 1], uint32_t
362
+ // outputs: [L, B, C], float (L first, so only one level of hashmap needs to fit into cache at a time.)
363
+ // H: base resolution
364
+ // dy_dx: [B, L * D * C]
365
+ template <typename scalar_t>
366
+ void grid_encode_forward_cuda(const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *outputs, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, scalar_t *dy_dx, const uint32_t gridtype, const bool align_corners) {
367
+ switch (D) {
368
+ case 2: kernel_grid_wrapper<scalar_t, 2>(inputs, embeddings, offsets, outputs, B, C, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
369
+ case 3: kernel_grid_wrapper<scalar_t, 3>(inputs, embeddings, offsets, outputs, B, C, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
370
+ case 4: kernel_grid_wrapper<scalar_t, 4>(inputs, embeddings, offsets, outputs, B, C, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
371
+ case 5: kernel_grid_wrapper<scalar_t, 5>(inputs, embeddings, offsets, outputs, B, C, L, S, H, calc_grad_inputs, dy_dx, gridtype, align_corners); break;
372
+ default: throw std::runtime_error{"GridEncoding: D must be 2, 3, 4, or 5."};
373
+ }
374
+
375
+ }
376
+
377
+ template <typename scalar_t, uint32_t D>
378
+ void kernel_grid_backward_wrapper(const scalar_t *grad, const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *grad_embeddings, const uint32_t B, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, scalar_t *dy_dx, scalar_t *grad_inputs, const uint32_t gridtype, const bool align_corners) {
379
+ static constexpr uint32_t N_THREAD = 256;
380
+ const uint32_t N_C = std::min(2u, C); // n_features_per_thread
381
+ const dim3 blocks_hashgrid = { div_round_up(B * C / N_C, N_THREAD), L, 1 };
382
+ switch (C) {
383
+ case 1:
384
+ kernel_grid_backward<scalar_t, D, 1, 1><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
385
+ if (calc_grad_inputs) kernel_input_backward<scalar_t, D, 1><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
386
+ break;
387
+ case 2:
388
+ kernel_grid_backward<scalar_t, D, 2, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
389
+ if (calc_grad_inputs) kernel_input_backward<scalar_t, D, 2><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
390
+ break;
391
+ case 4:
392
+ kernel_grid_backward<scalar_t, D, 4, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
393
+ if (calc_grad_inputs) kernel_input_backward<scalar_t, D, 4><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
394
+ break;
395
+ case 8:
396
+ kernel_grid_backward<scalar_t, D, 8, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
397
+ if (calc_grad_inputs) kernel_input_backward<scalar_t, D, 8><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
398
+ break;
399
+ default: throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
400
+ }
401
+ }
402
+
403
+
404
+ // grad: [L, B, C], float
405
+ // inputs: [B, D], float, in [0, 1]
406
+ // embeddings: [sO, C], float
407
+ // offsets: [L + 1], uint32_t
408
+ // grad_embeddings: [sO, C]
409
+ // H: base resolution
410
+ template <typename scalar_t>
411
+ void grid_encode_backward_cuda(const scalar_t *grad, const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *grad_embeddings, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, scalar_t *dy_dx, scalar_t *grad_inputs, const uint32_t gridtype, const bool align_corners) {
412
+ switch (D) {
413
+ case 2: kernel_grid_backward_wrapper<scalar_t, 2>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners); break;
414
+ case 3: kernel_grid_backward_wrapper<scalar_t, 3>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners); break;
415
+ case 4: kernel_grid_backward_wrapper<scalar_t, 4>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners); break;
416
+ case 5: kernel_grid_backward_wrapper<scalar_t, 5>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners); break;
417
+ default: throw std::runtime_error{"GridEncoding: D must be 2, 3, 4, or 5."};
418
+ }
419
+ }
420
+
421
+
422
+
423
+ void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, at::Tensor dy_dx, const uint32_t gridtype, const bool align_corners) {
424
+ CHECK_CUDA(inputs);
425
+ CHECK_CUDA(embeddings);
426
+ CHECK_CUDA(offsets);
427
+ CHECK_CUDA(outputs);
428
+ CHECK_CUDA(dy_dx);
429
+
430
+ CHECK_CONTIGUOUS(inputs);
431
+ CHECK_CONTIGUOUS(embeddings);
432
+ CHECK_CONTIGUOUS(offsets);
433
+ CHECK_CONTIGUOUS(outputs);
434
+ CHECK_CONTIGUOUS(dy_dx);
435
+
436
+ CHECK_IS_FLOATING(inputs);
437
+ CHECK_IS_FLOATING(embeddings);
438
+ CHECK_IS_INT(offsets);
439
+ CHECK_IS_FLOATING(outputs);
440
+ CHECK_IS_FLOATING(dy_dx);
441
+
442
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
443
+ embeddings.scalar_type(), "grid_encode_forward", ([&] {
444
+ grid_encode_forward_cuda<scalar_t>(inputs.data_ptr<float>(), embeddings.data_ptr<scalar_t>(), offsets.data_ptr<int>(), outputs.data_ptr<scalar_t>(), B, D, C, L, S, H, calc_grad_inputs, dy_dx.data_ptr<scalar_t>(), gridtype, align_corners);
445
+ }));
446
+ }
447
+
448
+ void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor grad_embeddings, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, const at::Tensor dy_dx, at::Tensor grad_inputs, const uint32_t gridtype, const bool align_corners) {
449
+ CHECK_CUDA(grad);
450
+ CHECK_CUDA(inputs);
451
+ CHECK_CUDA(embeddings);
452
+ CHECK_CUDA(offsets);
453
+ CHECK_CUDA(grad_embeddings);
454
+ CHECK_CUDA(dy_dx);
455
+ CHECK_CUDA(grad_inputs);
456
+
457
+ CHECK_CONTIGUOUS(grad);
458
+ CHECK_CONTIGUOUS(inputs);
459
+ CHECK_CONTIGUOUS(embeddings);
460
+ CHECK_CONTIGUOUS(offsets);
461
+ CHECK_CONTIGUOUS(grad_embeddings);
462
+ CHECK_CONTIGUOUS(dy_dx);
463
+ CHECK_CONTIGUOUS(grad_inputs);
464
+
465
+ CHECK_IS_FLOATING(grad);
466
+ CHECK_IS_FLOATING(inputs);
467
+ CHECK_IS_FLOATING(embeddings);
468
+ CHECK_IS_INT(offsets);
469
+ CHECK_IS_FLOATING(grad_embeddings);
470
+ CHECK_IS_FLOATING(dy_dx);
471
+ CHECK_IS_FLOATING(grad_inputs);
472
+
473
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
474
+ grad.scalar_type(), "grid_encode_backward", ([&] {
475
+ grid_encode_backward_cuda<scalar_t>(grad.data_ptr<scalar_t>(), inputs.data_ptr<float>(), embeddings.data_ptr<scalar_t>(), offsets.data_ptr<int>(), grad_embeddings.data_ptr<scalar_t>(), B, D, C, L, S, H, calc_grad_inputs, dy_dx.data_ptr<scalar_t>(), grad_inputs.data_ptr<scalar_t>(), gridtype, align_corners);
476
+ }));
477
+
478
+ }
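
For orientation, the two entry points defined above (`grid_encode_forward` / `grid_encode_backward`) are meant to be driven from Python through the compiled extension. Below is a minimal sketch of such an autograd wrapper, assuming the extension is importable as `_gridencoder` and that tensor shapes follow the comments in this file; it is an illustration, not the repository's actual Python wrapper.

```python
# Minimal sketch of a Python-side autograd wrapper around the CUDA entry points
# above. Assumptions: the compiled extension is importable as `_gridencoder`,
# inputs are [B, D] floats in [0, 1], embeddings are [sO, C], offsets is [L + 1]
# int32, and outputs come back as [L, B, C] (then reshaped to [B, L * C]).
import math
import torch
import _gridencoder as _backend  # assumed module name for the compiled extension


class GridEncodeSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, embeddings, offsets, per_level_scale, base_resolution,
                calc_grad_inputs=False, gridtype=0, align_corners=False):
        B, D = inputs.shape
        L = offsets.shape[0] - 1
        C = embeddings.shape[1]
        S = math.log2(per_level_scale)  # the kernels expect log2 of the per-level scale
        outputs = torch.empty(L, B, C, device=inputs.device, dtype=embeddings.dtype)
        dy_dx = torch.empty(B, L * D * C, device=inputs.device, dtype=embeddings.dtype)
        _backend.grid_encode_forward(inputs, embeddings, offsets, outputs, B, D, C, L, S,
                                     base_resolution, calc_grad_inputs, dy_dx,
                                     gridtype, align_corners)
        ctx.save_for_backward(inputs, embeddings, offsets, dy_dx)
        ctx.dims = (B, D, C, L, S, base_resolution, calc_grad_inputs, gridtype, align_corners)
        return outputs.permute(1, 0, 2).reshape(B, L * C)

    @staticmethod
    def backward(ctx, grad):
        inputs, embeddings, offsets, dy_dx = ctx.saved_tensors
        B, D, C, L, S, H, calc_grad_inputs, gridtype, align_corners = ctx.dims
        grad = grad.view(B, L, C).permute(1, 0, 2).contiguous()  # back to [L, B, C]
        grad_embeddings = torch.zeros_like(embeddings)
        grad_inputs = torch.zeros(B, D, device=inputs.device, dtype=embeddings.dtype)
        _backend.grid_encode_backward(grad, inputs, embeddings, offsets, grad_embeddings,
                                      B, D, C, L, S, H, calc_grad_inputs, dy_dx, grad_inputs,
                                      gridtype, align_corners)
        grad_inputs = grad_inputs.to(inputs.dtype) if calc_grad_inputs else None
        return grad_inputs, grad_embeddings, None, None, None, None, None, None
```

The layout choice mirrored here is the one stated in the comments above: the forward kernel writes outputs level-first ([L, B, C]) so only one hash level needs to stay in cache at a time, and the wrapper transposes back to [B, L * C] for the rest of the network.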
gridencoder/src/gridencoder.h ADDED
@@ -0,0 +1,15 @@
+ #ifndef _HASH_ENCODE_H
+ #define _HASH_ENCODE_H
+
+ #include <stdint.h>
+ #include <torch/torch.h>
+
+ // inputs: [B, D], float, in [0, 1]
+ // embeddings: [sO, C], float
+ // offsets: [L + 1], uint32_t
+ // outputs: [B, L * C], float
+ // H: base resolution
+ void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, at::Tensor dy_dx, const uint32_t gridtype, const bool align_corners);
+ void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor grad_embeddings, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const bool calc_grad_inputs, const at::Tensor dy_dx, at::Tensor grad_inputs, const uint32_t gridtype, const bool align_corners);
+
+ #endif
imaginaire/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # This work is made available under the Nvidia Source Code License-NC.
+ # To view a copy of this license, check out LICENSE.md
imaginaire/config.py ADDED
@@ -0,0 +1,238 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ """Config utilities for yml file."""
6
+
7
+ import collections
8
+ import functools
9
+ import os
10
+ import re
11
+
12
+ import yaml
13
+ from imaginaire.utils.distributed import master_only_print as print
14
+
15
+ DEBUG = False
16
+ USE_JIT = False
17
+
18
+
19
+ class AttrDict(dict):
20
+ """Dict as attribute trick."""
21
+
22
+ def __init__(self, *args, **kwargs):
23
+ super(AttrDict, self).__init__(*args, **kwargs)
24
+ self.__dict__ = self
25
+ for key, value in self.__dict__.items():
26
+ if isinstance(value, dict):
27
+ self.__dict__[key] = AttrDict(value)
28
+ elif isinstance(value, (list, tuple)):
29
+ if isinstance(value[0], dict):
30
+ self.__dict__[key] = [AttrDict(item) for item in value]
31
+ else:
32
+ self.__dict__[key] = value
33
+
34
+ def yaml(self):
35
+ """Convert object to yaml dict and return."""
36
+ yaml_dict = {}
37
+ for key, value in self.__dict__.items():
38
+ if isinstance(value, AttrDict):
39
+ yaml_dict[key] = value.yaml()
40
+ elif isinstance(value, list):
41
+ if isinstance(value[0], AttrDict):
42
+ new_l = []
43
+ for item in value:
44
+ new_l.append(item.yaml())
45
+ yaml_dict[key] = new_l
46
+ else:
47
+ yaml_dict[key] = value
48
+ else:
49
+ yaml_dict[key] = value
50
+ return yaml_dict
51
+
52
+ def __repr__(self):
53
+ """Print all variables."""
54
+ ret_str = []
55
+ for key, value in self.__dict__.items():
56
+ if isinstance(value, AttrDict):
57
+ ret_str.append('{}:'.format(key))
58
+ child_ret_str = value.__repr__().split('\n')
59
+ for item in child_ret_str:
60
+ ret_str.append(' ' + item)
61
+ elif isinstance(value, list):
62
+ if isinstance(value[0], AttrDict):
63
+ ret_str.append('{}:'.format(key))
64
+ for item in value:
65
+ # Treat as AttrDict above.
66
+ child_ret_str = item.__repr__().split('\n')
67
+ for item in child_ret_str:
68
+ ret_str.append(' ' + item)
69
+ else:
70
+ ret_str.append('{}: {}'.format(key, value))
71
+ else:
72
+ ret_str.append('{}: {}'.format(key, value))
73
+ return '\n'.join(ret_str)
74
+
75
+
76
+ class Config(AttrDict):
77
+ r"""Configuration class. This should include every human specifiable
78
+ hyperparameter values for your training."""
79
+
80
+ def __init__(self, filename=None, verbose=False):
81
+ super(Config, self).__init__()
82
+ self.source_filename = filename
83
+ # Set default parameters.
84
+ # Logging.
85
+ large_number = 1000000000
86
+ self.snapshot_save_iter = large_number
87
+ self.snapshot_save_epoch = large_number
88
+ self.metrics_iter = None
89
+ self.metrics_epoch = None
90
+ self.snapshot_save_start_iter = 0
91
+ self.snapshot_save_start_epoch = 0
92
+ self.image_save_iter = large_number
93
+ self.image_display_iter = large_number
94
+ self.max_epoch = large_number
95
+ self.max_iter = large_number
96
+ self.logging_iter = 100
97
+ self.speed_benchmark = False
98
+
99
+ # Trainer.
100
+ self.trainer = AttrDict(
101
+ model_average_config=AttrDict(enabled=False,
102
+ beta=0.9999,
103
+ start_iteration=1000,
104
+ num_batch_norm_estimation_iterations=30,
105
+ remove_sn=True),
106
+ # model_average=False,
107
+ # model_average_beta=0.9999,
108
+ # model_average_start_iteration=1000,
109
+ # model_average_batch_norm_estimation_iteration=30,
110
+ # model_average_remove_sn=True,
111
+ image_to_tensorboard=False,
112
+ hparam_to_tensorboard=False,
113
+ distributed_data_parallel='pytorch',
114
+ distributed_data_parallel_params=AttrDict(
115
+ find_unused_parameters=False),
116
+ delay_allreduce=True,
117
+ gan_relativistic=False,
118
+ gen_step=1,
119
+ dis_step=1,
120
+ gan_decay_k=1.,
121
+ gan_min_k=1.,
122
+ gan_separate_topk=False,
123
+ aug_policy='',
124
+ channels_last=False,
125
+ strict_resume=True,
126
+ amp_gp=False,
127
+ amp_config=AttrDict(init_scale=65536.0,
128
+ growth_factor=2.0,
129
+ backoff_factor=0.5,
130
+ growth_interval=2000,
131
+ enabled=False))
132
+
133
+ # Networks.
134
+ self.gen = AttrDict(type='imaginaire.generators.dummy')
135
+ self.dis = AttrDict(type='imaginaire.discriminators.dummy')
136
+
137
+ # Optimizers.
138
+ self.gen_opt = AttrDict(type='adam',
139
+ fused_opt=False,
140
+ lr=0.0001,
141
+ adam_beta1=0.0,
142
+ adam_beta2=0.999,
143
+ eps=1e-8,
144
+ lr_policy=AttrDict(iteration_mode=False,
145
+ type='step',
146
+ step_size=large_number,
147
+ gamma=1))
148
+ self.dis_opt = AttrDict(type='adam',
149
+ fused_opt=False,
150
+ lr=0.0001,
151
+ adam_beta1=0.0,
152
+ adam_beta2=0.999,
153
+ eps=1e-8,
154
+ lr_policy=AttrDict(iteration_mode=False,
155
+ type='step',
156
+ step_size=large_number,
157
+ gamma=1))
158
+ # Data.
159
+ self.data = AttrDict(name='dummy',
160
+ type='imaginaire.datasets.images',
161
+ num_workers=0)
162
+ self.test_data = AttrDict(name='dummy',
163
+ type='imaginaire.datasets.images',
164
+ num_workers=0,
165
+ test=AttrDict(is_lmdb=False,
166
+ roots='',
167
+ batch_size=1))
168
+
169
+
170
+ # Cudnn.
171
+ self.cudnn = AttrDict(deterministic=False,
172
+ benchmark=True)
173
+
174
+ # Others.
175
+ self.pretrained_weight = ''
176
+ self.inference_args = AttrDict()
177
+
178
+ # Update with given configurations.
179
+ assert os.path.exists(filename), 'File {} not exist.'.format(filename)
180
+ loader = yaml.SafeLoader
181
+ loader.add_implicit_resolver(
182
+ u'tag:yaml.org,2002:float',
183
+ re.compile(u'''^(?:
184
+ [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
185
+ |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
186
+ |\\.[0-9_]+(?:[eE][-+][0-9]+)?
187
+ |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
188
+ |[-+]?\\.(?:inf|Inf|INF)
189
+ |\\.(?:nan|NaN|NAN))$''', re.X),
190
+ list(u'-+0123456789.'))
191
+ try:
192
+ with open(filename, 'r') as f:
193
+ cfg_dict = yaml.load(f, Loader=loader)
194
+ except EnvironmentError:
195
+ print('Please check the file with name of "{}"'.format(filename))
196
+ recursive_update(self, cfg_dict)
197
+
198
+ # Put common opts in both gen and dis.
199
+ if 'common' in cfg_dict:
200
+ self.common = AttrDict(**cfg_dict['common'])
201
+ self.gen.common = self.common
202
+ self.dis.common = self.common
203
+
204
+ if verbose:
205
+ print(' imaginaire config '.center(80, '-'))
206
+ print(self.__repr__())
207
+ print(''.center(80, '-'))
208
+
209
+
210
+ def rsetattr(obj, attr, val):
211
+ """Recursively find object and set value"""
212
+ pre, _, post = attr.rpartition('.')
213
+ return setattr(rgetattr(obj, pre) if pre else obj, post, val)
214
+
215
+
216
+ def rgetattr(obj, attr, *args):
217
+ """Recursively find object and return value"""
218
+
219
+ def _getattr(obj, attr):
220
+ r"""Get attribute."""
221
+ return getattr(obj, attr, *args)
222
+
223
+ return functools.reduce(_getattr, [obj] + attr.split('.'))
224
+
225
+
226
+ def recursive_update(d, u):
227
+ """Recursively update AttrDict d with AttrDict u"""
228
+ for key, value in u.items():
229
+ if isinstance(value, collections.abc.Mapping):
230
+ d.__dict__[key] = recursive_update(d.get(key, AttrDict({})), value)
231
+ elif isinstance(value, (list, tuple)):
232
+ if isinstance(value[0], dict):
233
+ d.__dict__[key] = [AttrDict(item) for item in value]
234
+ else:
235
+ d.__dict__[key] = value
236
+ else:
237
+ d.__dict__[key] = value
238
+ return d
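
Since `Config` turns nested YAML sections into chained attribute access via `AttrDict`, and `rsetattr`/`rgetattr` address nested fields with dotted strings, a small usage example may help. The YAML path below is an illustrative assumption, not a guaranteed file.

```python
# Hypothetical usage of the Config / AttrDict helpers above.
# The YAML path is an illustrative assumption.
from imaginaire.config import Config, rgetattr, rsetattr

cfg = Config('configs/scenedreamer_train.yaml', verbose=False)

# Nested YAML sections become chained attribute access.
print(cfg.gen.type)                # generator module path from the YAML (or the dummy default)
print(cfg.gen_opt.lr_policy.type)  # defaults set in Config.__init__ unless overridden

# Dotted-path helpers are convenient for command-line overrides.
rsetattr(cfg, 'gen_opt.lr', 2e-4)
assert rgetattr(cfg, 'gen_opt.lr') == 2e-4
```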
imaginaire/discriminators/__init__.py ADDED
File without changes
imaginaire/discriminators/gancraft.py ADDED
@@ -0,0 +1,278 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import functools
10
+ from imaginaire.layers import Conv2dBlock
11
+
12
+ from imaginaire.utils.data import get_paired_input_label_channel_number, get_paired_input_image_channel_number
13
+ from imaginaire.utils.distributed import master_only_print as print
14
+
15
+
16
+ class Discriminator(nn.Module):
17
+ r"""Multi-resolution patch discriminator. Based on FPSE discriminator but with N+1 labels.
18
+
19
+ Args:
20
+ dis_cfg (obj): Discriminator definition part of the yaml config file.
21
+ data_cfg (obj): Data definition part of the yaml config file.
22
+ """
23
+
24
+ def __init__(self, dis_cfg, data_cfg):
25
+ super(Discriminator, self).__init__()
26
+ # We assume the first datum is the ground truth image.
27
+ image_channels = get_paired_input_image_channel_number(data_cfg)
28
+ # Calculate number of channels in the input label.
29
+ num_labels = get_paired_input_label_channel_number(data_cfg)
30
+
31
+ self.use_label = getattr(dis_cfg, 'use_label', True)
32
+ # Override number of input channels
33
+ if hasattr(dis_cfg, 'image_channels'):
34
+ image_channels = dis_cfg.image_channels
35
+ if hasattr(dis_cfg, 'num_labels'):
36
+ num_labels = dis_cfg.num_labels
37
+ else:
38
+ # We assume the first datum is the ground truth image.
39
+ image_channels = get_paired_input_image_channel_number(data_cfg)
40
+ # Calculate number of channels in the input label.
41
+ num_labels = get_paired_input_label_channel_number(data_cfg)
42
+
43
+ if not self.use_label:
44
+ num_labels = 2 # ignore + true
45
+
46
+ # Build the discriminator.
47
+ num_filters = getattr(dis_cfg, 'num_filters', 128)
48
+ weight_norm_type = getattr(dis_cfg, 'weight_norm_type', 'spectral')
49
+
50
+ fpse_kernel_size = getattr(dis_cfg, 'fpse_kernel_size', 3)
51
+ fpse_activation_norm_type = getattr(dis_cfg,
52
+ 'fpse_activation_norm_type',
53
+ 'none')
54
+ do_multiscale = getattr(dis_cfg, 'do_multiscale', False)
55
+ smooth_resample = getattr(dis_cfg, 'smooth_resample', False)
56
+ no_label_except_largest_scale = getattr(dis_cfg, 'no_label_except_largest_scale', False)
57
+
58
+ self.fpse_discriminator = FPSEDiscriminator(
59
+ image_channels,
60
+ num_labels,
61
+ num_filters,
62
+ fpse_kernel_size,
63
+ weight_norm_type,
64
+ fpse_activation_norm_type,
65
+ do_multiscale,
66
+ smooth_resample,
67
+ no_label_except_largest_scale)
68
+
69
+ def _single_forward(self, input_label, input_image, weights):
70
+ output_list, features_list = self.fpse_discriminator(input_image, input_label, weights)
71
+ return output_list, [features_list]
72
+
73
+ def forward(self, data, net_G_output, weights=None, incl_real=False, incl_pseudo_real=False):
74
+ r"""GANcraft discriminator forward.
75
+
76
+ Args:
77
+ data (dict):
78
+ - data (N x C1 x H x W tensor) : Ground truth images.
79
+ - label (N x C2 x H x W tensor) : Semantic representations.
80
+ - z (N x style_dims tensor): Gaussian random noise.
81
+ net_G_output (dict):
82
+ - fake_images (N x C1 x H x W tensor) : Fake images.
83
+ Returns:
84
+ output_x (dict):
85
+ - real_outputs (list): list of output tensors produced by
86
+ individual patch discriminators for real images.
87
+ - real_features (list): list of lists of features produced by
88
+ individual patch discriminators for real images.
89
+ - fake_outputs (list): list of output tensors produced by
90
+ individual patch discriminators for fake images.
91
+ - fake_features (list): list of lists of features produced by
92
+ individual patch discriminators for fake images.
93
+ """
94
+ output_x = dict()
95
+
96
+ # Fake.
97
+ fake_images = net_G_output['fake_images']
98
+ if self.use_label:
99
+ fake_labels = data['fake_masks']
100
+ else:
101
+ fake_labels = torch.zeros([fake_images.size(0), 2, fake_images.size(
102
+ 2), fake_images.size(3)], device=fake_images.device, dtype=fake_images.dtype)
103
+ fake_labels[:, 1, :, :] = 1
104
+ output_x['fake_outputs'], output_x['fake_features'] = \
105
+ self._single_forward(fake_labels, fake_images, None)
106
+
107
+ # Real.
108
+ if incl_real:
109
+ real_images = data['images']
110
+ if self.use_label:
111
+ real_labels = data['real_masks']
112
+ else:
113
+ real_labels = torch.zeros([real_images.size(0), 2, real_images.size(
114
+ 2), real_images.size(3)], device=real_images.device, dtype=real_images.dtype)
115
+ real_labels[:, 1, :, :] = 1
116
+ output_x['real_outputs'], output_x['real_features'] = \
117
+ self._single_forward(real_labels, real_images, None)
118
+
119
+ # pseudo-Real.
120
+ if incl_pseudo_real:
121
+ preal_images = data['pseudo_real_img']
122
+ preal_labels = data['fake_masks']
123
+ if not self.use_label:
124
+ preal_labels = torch.zeros([preal_images.size(0), 2, preal_images.size(
125
+ 2), preal_images.size(3)], device=preal_images.device, dtype=preal_images.dtype)
126
+ preal_labels[:, 1, :, :] = 1
127
+ output_x['pseudo_real_outputs'], output_x['pseudo_real_features'] = \
128
+ self._single_forward(preal_labels, preal_images, None)
129
+
130
+ return output_x
131
+
132
+
133
+ class FPSEDiscriminator(nn.Module):
134
+ def __init__(self,
135
+ num_input_channels,
136
+ num_labels,
137
+ num_filters,
138
+ kernel_size,
139
+ weight_norm_type,
140
+ activation_norm_type,
141
+ do_multiscale,
142
+ smooth_resample,
143
+ no_label_except_largest_scale):
144
+ super().__init__()
145
+
146
+ self.do_multiscale = do_multiscale
147
+ self.no_label_except_largest_scale = no_label_except_largest_scale
148
+
149
+ padding = int(np.ceil((kernel_size - 1.0) / 2))
150
+ nonlinearity = 'leakyrelu'
151
+ stride1_conv2d_block = \
152
+ functools.partial(Conv2dBlock,
153
+ kernel_size=kernel_size,
154
+ stride=1,
155
+ padding=padding,
156
+ weight_norm_type=weight_norm_type,
157
+ activation_norm_type=activation_norm_type,
158
+ nonlinearity=nonlinearity,
159
+ # inplace_nonlinearity=True,
160
+ order='CNA')
161
+ down_conv2d_block = \
162
+ functools.partial(Conv2dBlock,
163
+ kernel_size=kernel_size,
164
+ stride=2,
165
+ padding=padding,
166
+ weight_norm_type=weight_norm_type,
167
+ activation_norm_type=activation_norm_type,
168
+ nonlinearity=nonlinearity,
169
+ # inplace_nonlinearity=True,
170
+ order='CNA')
171
+ latent_conv2d_block = \
172
+ functools.partial(Conv2dBlock,
173
+ kernel_size=1,
174
+ stride=1,
175
+ weight_norm_type=weight_norm_type,
176
+ activation_norm_type=activation_norm_type,
177
+ nonlinearity=nonlinearity,
178
+ # inplace_nonlinearity=True,
179
+ order='CNA')
180
+ # bottom-up pathway
181
+ self.enc1 = down_conv2d_block(num_input_channels, num_filters) # 3
182
+ self.enc2 = down_conv2d_block(1 * num_filters, 2 * num_filters) # 7
183
+ self.enc3 = down_conv2d_block(2 * num_filters, 4 * num_filters) # 15
184
+ self.enc4 = down_conv2d_block(4 * num_filters, 8 * num_filters) # 31
185
+ self.enc5 = down_conv2d_block(8 * num_filters, 8 * num_filters) # 63
186
+
187
+ # top-down pathway
188
+ # self.lat1 = latent_conv2d_block(num_filters, 2 * num_filters) # Zekun
189
+ self.lat2 = latent_conv2d_block(2 * num_filters, 4 * num_filters)
190
+ self.lat3 = latent_conv2d_block(4 * num_filters, 4 * num_filters)
191
+ self.lat4 = latent_conv2d_block(8 * num_filters, 4 * num_filters)
192
+ self.lat5 = latent_conv2d_block(8 * num_filters, 4 * num_filters)
193
+
194
+ # upsampling
195
+ self.upsample2x = nn.Upsample(scale_factor=2, mode='bilinear',
196
+ align_corners=False)
197
+
198
+ # final layers
199
+ self.final2 = stride1_conv2d_block(4 * num_filters, 2 * num_filters)
200
+ self.output = Conv2dBlock(num_filters * 2, num_labels+1, kernel_size=1)
201
+
202
+ if self.do_multiscale:
203
+ self.final3 = stride1_conv2d_block(4 * num_filters, 2 * num_filters)
204
+ self.final4 = stride1_conv2d_block(4 * num_filters, 2 * num_filters)
205
+ if self.no_label_except_largest_scale:
206
+ self.output3 = Conv2dBlock(num_filters * 2, 2, kernel_size=1)
207
+ self.output4 = Conv2dBlock(num_filters * 2, 2, kernel_size=1)
208
+ else:
209
+ self.output3 = Conv2dBlock(num_filters * 2, num_labels+1, kernel_size=1)
210
+ self.output4 = Conv2dBlock(num_filters * 2, num_labels+1, kernel_size=1)
211
+
212
+ self.interpolator = functools.partial(F.interpolate, mode='nearest')
213
+ if smooth_resample:
214
+ self.interpolator = self.smooth_interp
215
+
216
+ @staticmethod
217
+ def smooth_interp(x, size):
218
+ r"""Smooth interpolation of segmentation maps.
219
+
220
+ Args:
221
+ x (4D tensor): Segmentation maps.
222
+ size(2D list): Target size (H, W).
223
+ """
224
+ x = F.interpolate(x, size=size, mode='area')
225
+ onehot_idx = torch.argmax(x, dim=-3, keepdims=True)
226
+ x.fill_(0.0)
227
+ x.scatter_(1, onehot_idx, 1.0)
228
+ return x
229
+
230
+ # Weights: [N C]
231
+ def forward(self, images, segmaps, weights=None):
232
+ # Assume images 256x256
233
+ # bottom-up pathway
234
+ feat11 = self.enc1(images) # 128
235
+ feat12 = self.enc2(feat11) # 64
236
+ feat13 = self.enc3(feat12) # 32
237
+ feat14 = self.enc4(feat13) # 16
238
+ feat15 = self.enc5(feat14) # 8
239
+ # top-down pathway and lateral connections
240
+ feat25 = self.lat5(feat15) # 8
241
+ feat24 = self.upsample2x(feat25) + self.lat4(feat14) # 16
242
+ feat23 = self.upsample2x(feat24) + self.lat3(feat13) # 32
243
+ feat22 = self.upsample2x(feat23) + self.lat2(feat12) # 64
244
+
245
+ # final prediction layers
246
+ feat32 = self.final2(feat22)
247
+
248
+ results = []
249
+ label_map = self.interpolator(segmaps, size=feat32.size()[2:])
250
+ pred2 = self.output(feat32) # N, num_labels+1, H//4, W//4
251
+
252
+ features = [feat11, feat12, feat13, feat14, feat15, feat25, feat24, feat23, feat22]
253
+ if weights is not None:
254
+ label_map = label_map * weights[..., None, None]
255
+ results.append({'pred': pred2, 'label': label_map})
256
+
257
+ if self.do_multiscale:
258
+ feat33 = self.final3(feat23)
259
+ pred3 = self.output3(feat33)
260
+
261
+ feat34 = self.final4(feat24)
262
+ pred4 = self.output4(feat34)
263
+
264
+ if self.no_label_except_largest_scale:
265
+ label_map3 = torch.ones([pred3.size(0), 1, pred3.size(2), pred3.size(3)], device=pred3.device)
266
+ label_map4 = torch.ones([pred4.size(0), 1, pred4.size(2), pred4.size(3)], device=pred4.device)
267
+ else:
268
+ label_map3 = self.interpolator(segmaps, size=pred3.size()[2:])
269
+ label_map4 = self.interpolator(segmaps, size=pred4.size()[2:])
270
+
271
+ if weights is not None:
272
+ label_map3 = label_map3 * weights[..., None, None]
273
+ label_map4 = label_map4 * weights[..., None, None]
274
+
275
+ results.append({'pred': pred3, 'label': label_map3})
276
+ results.append({'pred': pred4, 'label': label_map4})
277
+
278
+ return results, features
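
Each entry of `results` pairs a (num_labels + 1)-channel prediction with the label map resampled to the same resolution, which is what makes an N+1-label objective (per-pixel semantic class plus a dedicated "fake" class) possible. The sketch below shows one plausible way such pairs could be consumed; it is an illustration under stated assumptions, not necessarily the loss used by this repository.

```python
# Hedged sketch: one way the per-scale {'pred', 'label'} pairs returned by
# FPSEDiscriminator.forward could feed an N+1-label hinge objective.
# Assumption: channel 0 of `pred` is the dedicated "fake" class and channels
# 1..num_labels line up with the one-hot `label` map. Illustration only.
import torch.nn.functional as F


def n_plus_one_hinge_dis_loss(real_results, fake_results):
    loss = 0.0
    for real, fake in zip(real_results, fake_results):
        real_pred, real_label = real['pred'], real['label']   # [N, K+1, h, w], [N, K, h, w]
        fake_pred = fake['pred']
        # Real pixels should score highly on their own semantic class...
        real_score = (real_pred[:, 1:] * real_label).sum(dim=1)
        loss = loss + F.relu(1.0 - real_score).mean()
        # ...while fake pixels should be pushed toward the dedicated fake class.
        loss = loss + F.relu(1.0 - fake_pred[:, 0]).mean()
    return loss
```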
imaginaire/generators/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # This work is made available under the Nvidia Source Code License-NC.
+ # To view a copy of this license, check out LICENSE.md
imaginaire/generators/gancraft_base.py ADDED
@@ -0,0 +1,603 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import functools
6
+ import re
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from imaginaire.layers import Conv2dBlock, LinearBlock
14
+ from imaginaire.model_utils.layers import AffineMod, ModLinear
15
+ import imaginaire.model_utils.gancraft.mc_utils as mc_utils
16
+ import imaginaire.model_utils.gancraft.voxlib as voxlib
17
+ from imaginaire.utils.distributed import master_only_print as print
18
+
19
+
20
+ class RenderMLP(nn.Module):
21
+ r""" MLP with affine modulation."""
22
+
23
+ def __init__(self, in_channels, style_dim, viewdir_dim, mask_dim=680,
24
+ out_channels_s=1, out_channels_c=3, hidden_channels=256,
25
+ use_seg=True):
26
+ super(RenderMLP, self).__init__()
27
+
28
+ self.use_seg = use_seg
29
+ if self.use_seg:
30
+ self.fc_m_a = nn.Linear(mask_dim, hidden_channels, bias=False)
31
+
32
+ self.fc_viewdir = None
33
+ if viewdir_dim > 0:
34
+ self.fc_viewdir = nn.Linear(viewdir_dim, hidden_channels, bias=False)
35
+
36
+ self.fc_1 = nn.Linear(in_channels, hidden_channels)
37
+
38
+ self.fc_2 = ModLinear(hidden_channels, hidden_channels, style_dim, bias=False, mod_bias=True, output_mode=True)
39
+ self.fc_3 = ModLinear(hidden_channels, hidden_channels, style_dim, bias=False, mod_bias=True, output_mode=True)
40
+ self.fc_4 = ModLinear(hidden_channels, hidden_channels, style_dim, bias=False, mod_bias=True, output_mode=True)
41
+
42
+ self.fc_sigma = nn.Linear(hidden_channels, out_channels_s)
43
+
44
+ if viewdir_dim > 0:
45
+ self.fc_5 = nn.Linear(hidden_channels, hidden_channels, bias=False)
46
+ self.mod_5 = AffineMod(hidden_channels, style_dim, mod_bias=True)
47
+ else:
48
+ self.fc_5 = ModLinear(hidden_channels, hidden_channels, style_dim,
49
+ bias=False, mod_bias=True, output_mode=True)
50
+ self.fc_6 = ModLinear(hidden_channels, hidden_channels, style_dim, bias=False, mod_bias=True, output_mode=True)
51
+ self.fc_out_c = nn.Linear(hidden_channels, out_channels_c)
52
+
53
+ self.act = nn.LeakyReLU(negative_slope=0.2)
54
+
55
+ def forward(self, x, raydir, z, m):
56
+ r""" Forward network
57
+
58
+ Args:
59
+ x (N x H x W x M x in_channels tensor): Projected features.
60
+ raydir (N x H x W x 1 x viewdir_dim tensor): Ray directions.
61
+ z (N x style_dim tensor): Style codes.
62
+ m (N x H x W x M x mask_dim tensor): One-hot segmentation maps.
63
+ """
64
+ b, h, w, n, _ = x.size()
65
+ z = z[:, None, None, None, :]
66
+
67
+ f = self.fc_1(x)
68
+ if self.use_seg:
69
+ f = f + self.fc_m_a(m)
70
+ # Common MLP
71
+ f = self.act(f)
72
+ f = self.act(self.fc_2(f, z))
73
+ f = self.act(self.fc_3(f, z))
74
+ f = self.act(self.fc_4(f, z))
75
+
76
+ # Sigma MLP
77
+ sigma = self.fc_sigma(f)
78
+
79
+ # Color MLP
80
+ if self.fc_viewdir is not None:
81
+ f = self.fc_5(f)
82
+ f = f + self.fc_viewdir(raydir)
83
+ f = self.act(self.mod_5(f, z))
84
+ else:
85
+ f = self.act(self.fc_5(f, z))
86
+ f = self.act(self.fc_6(f, z))
87
+ c = self.fc_out_c(f)
88
+ return sigma, c
89
+
90
+
91
+ class StyleMLP(nn.Module):
92
+ r"""MLP converting style code to intermediate style representation."""
93
+
94
+ def __init__(self, style_dim, out_dim, hidden_channels=256, leaky_relu=True, num_layers=5, normalize_input=True,
95
+ output_act=True):
96
+ super(StyleMLP, self).__init__()
97
+
98
+ self.normalize_input = normalize_input
99
+ self.output_act = output_act
100
+ fc_layers = []
101
+ fc_layers.append(nn.Linear(style_dim, hidden_channels, bias=True))
102
+ for i in range(num_layers-1):
103
+ fc_layers.append(nn.Linear(hidden_channels, hidden_channels, bias=True))
104
+ self.fc_layers = nn.ModuleList(fc_layers)
105
+
106
+ self.fc_out = nn.Linear(hidden_channels, out_dim, bias=True)
107
+
108
+ if leaky_relu:
109
+ self.act = nn.LeakyReLU(negative_slope=0.2, inplace=True)
110
+ else:
111
+ self.act = functools.partial(F.relu, inplace=True)
112
+
113
+ def forward(self, z):
114
+ r""" Forward network
115
+
116
+ Args:
117
+ z (N x style_dim tensor): Style codes.
118
+ """
119
+ if self.normalize_input:
120
+ z = F.normalize(z, p=2, dim=-1)
121
+ for fc_layer in self.fc_layers:
122
+ z = self.act(fc_layer(z))
123
+ z = self.fc_out(z)
124
+ if self.output_act:
125
+ z = self.act(z)
126
+ return z
127
+
128
+
129
+ class SKYMLP(nn.Module):
130
+ r"""MLP converting ray directions to sky features."""
131
+
132
+ def __init__(self, in_channels, style_dim, out_channels_c=3,
133
+ hidden_channels=256, leaky_relu=True):
134
+ super(SKYMLP, self).__init__()
135
+ self.fc_z_a = nn.Linear(style_dim, hidden_channels, bias=False)
136
+
137
+ self.fc1 = nn.Linear(in_channels, hidden_channels)
138
+ self.fc2 = nn.Linear(hidden_channels, hidden_channels)
139
+ self.fc3 = nn.Linear(hidden_channels, hidden_channels)
140
+ self.fc4 = nn.Linear(hidden_channels, hidden_channels)
141
+ self.fc5 = nn.Linear(hidden_channels, hidden_channels)
142
+
143
+ self.fc_out_c = nn.Linear(hidden_channels, out_channels_c)
144
+
145
+ if leaky_relu:
146
+ self.act = nn.LeakyReLU(negative_slope=0.2, inplace=True)
147
+ else:
148
+ self.act = functools.partial(F.relu, inplace=True)
149
+
150
+ def forward(self, x, z):
151
+ r"""Forward network
152
+
153
+ Args:
154
+ x (... x in_channels tensor): Ray direction embeddings.
155
+ z (... x style_dim tensor): Style codes.
156
+ """
157
+
158
+ z = self.fc_z_a(z)
159
+ while z.dim() < x.dim():
160
+ z = z.unsqueeze(1)
161
+
162
+ y = self.act(self.fc1(x) + z)
163
+ y = self.act(self.fc2(y))
164
+ y = self.act(self.fc3(y))
165
+ y = self.act(self.fc4(y))
166
+ y = self.act(self.fc5(y))
167
+ c = self.fc_out_c(y)
168
+
169
+ return c
170
+
171
+
172
+ class RenderCNN(nn.Module):
173
+ r"""CNN converting intermediate feature map to final image."""
174
+
175
+ def __init__(self, in_channels, style_dim, hidden_channels=256,
176
+ leaky_relu=True):
177
+ super(RenderCNN, self).__init__()
178
+ self.fc_z_cond = nn.Linear(style_dim, 2 * 2 * hidden_channels)
179
+
180
+ self.conv1 = nn.Conv2d(in_channels, hidden_channels, 1, stride=1, padding=0)
181
+ self.conv2a = nn.Conv2d(hidden_channels, hidden_channels, 3, stride=1, padding=1)
182
+ self.conv2b = nn.Conv2d(hidden_channels, hidden_channels, 3, stride=1, padding=1, bias=False)
183
+
184
+ self.conv3a = nn.Conv2d(hidden_channels, hidden_channels, 3, stride=1, padding=1)
185
+ self.conv3b = nn.Conv2d(hidden_channels, hidden_channels, 3, stride=1, padding=1, bias=False)
186
+
187
+ self.conv4a = nn.Conv2d(hidden_channels, hidden_channels, 1, stride=1, padding=0)
188
+ self.conv4b = nn.Conv2d(hidden_channels, hidden_channels, 1, stride=1, padding=0)
189
+
190
+ self.conv4 = nn.Conv2d(hidden_channels, 3, 1, stride=1, padding=0)
191
+
192
+ if leaky_relu:
193
+ self.act = nn.LeakyReLU(negative_slope=0.2, inplace=True)
194
+ else:
195
+ self.act = functools.partial(F.relu, inplace=True)
196
+
197
+ def modulate(self, x, w, b):
198
+ w = w[..., None, None]
199
+ b = b[..., None, None]
200
+ return x * (w+1) + b
201
+
202
+ def forward(self, x, z):
203
+ r"""Forward network.
204
+
205
+ Args:
206
+ x (N x in_channels x H x W tensor): Intermediate feature map
207
+ z (N x style_dim tensor): Style codes.
208
+ """
209
+ z = self.fc_z_cond(z)
210
+ adapt = torch.chunk(z, 2 * 2, dim=-1)
211
+
212
+ y = self.act(self.conv1(x))
213
+
214
+ y = y + self.conv2b(self.act(self.conv2a(y)))
215
+ y = self.act(self.modulate(y, adapt[0], adapt[1]))
216
+
217
+ y = y + self.conv3b(self.act(self.conv3a(y)))
218
+ y = self.act(self.modulate(y, adapt[2], adapt[3]))
219
+
220
+ y = y + self.conv4b(self.act(self.conv4a(y)))
221
+ y = self.act(y)
222
+
223
+ y = self.conv4(y)
224
+
225
+ return y
226
+
227
+
228
+ class StyleEncoder(nn.Module):
229
+ r"""Style Encoder constructor.
230
+
231
+ Args:
232
+ style_enc_cfg (obj): Style encoder definition file.
233
+ """
234
+
235
+ def __init__(self, style_enc_cfg):
236
+ super(StyleEncoder, self).__init__()
237
+ input_image_channels = style_enc_cfg.input_image_channels
238
+ num_filters = style_enc_cfg.num_filters
239
+ kernel_size = style_enc_cfg.kernel_size
240
+ padding = int(np.ceil((kernel_size - 1.0) / 2))
241
+ style_dims = style_enc_cfg.style_dims
242
+ weight_norm_type = style_enc_cfg.weight_norm_type
243
+ self.no_vae = getattr(style_enc_cfg, 'no_vae', False)
244
+ activation_norm_type = 'none'
245
+ nonlinearity = 'leakyrelu'
246
+ base_conv2d_block = \
247
+ functools.partial(Conv2dBlock,
248
+ kernel_size=kernel_size,
249
+ stride=2,
250
+ padding=padding,
251
+ weight_norm_type=weight_norm_type,
252
+ activation_norm_type=activation_norm_type,
253
+ # inplace_nonlinearity=True,
254
+ nonlinearity=nonlinearity)
255
+ self.layer1 = base_conv2d_block(input_image_channels, num_filters)
256
+ self.layer2 = base_conv2d_block(num_filters * 1, num_filters * 2)
257
+ self.layer3 = base_conv2d_block(num_filters * 2, num_filters * 4)
258
+ self.layer4 = base_conv2d_block(num_filters * 4, num_filters * 8)
259
+ self.layer5 = base_conv2d_block(num_filters * 8, num_filters * 8)
260
+ self.layer6 = base_conv2d_block(num_filters * 8, num_filters * 8)
261
+ self.fc_mu = LinearBlock(num_filters * 8 * 4 * 4, style_dims)
262
+ if not self.no_vae:
263
+ self.fc_var = LinearBlock(num_filters * 8 * 4 * 4, style_dims)
264
+
265
+ def forward(self, input_x):
266
+ r"""SPADE Style Encoder forward.
267
+
268
+ Args:
269
+ input_x (N x 3 x H x W tensor): input images.
270
+ Returns:
271
+ mu (N x C tensor): Mean vectors.
272
+ logvar (N x C tensor): Log-variance vectors.
273
+ z (N x C tensor): Style code vectors.
274
+ """
275
+ if input_x.size(2) != 256 or input_x.size(3) != 256:
276
+ input_x = F.interpolate(input_x, size=(256, 256), mode='bilinear')
277
+ x = self.layer1(input_x)
278
+ x = self.layer2(x)
279
+ x = self.layer3(x)
280
+ x = self.layer4(x)
281
+ x = self.layer5(x)
282
+ x = self.layer6(x)
283
+ x = x.view(x.size(0), -1)
284
+ mu = self.fc_mu(x)
285
+ if not self.no_vae:
286
+ logvar = self.fc_var(x)
287
+ std = torch.exp(0.5 * logvar)
288
+ eps = torch.randn_like(std)
289
+ z = eps.mul(std) + mu
290
+ else:
291
+ z = mu
292
+ logvar = torch.zeros_like(mu)
293
+ return mu, logvar, z
294
+
295
+
296
+ class Base3DGenerator(nn.Module):
297
+ r"""Minecraft 3D generator constructor.
298
+
299
+ Args:
300
+ gen_cfg (obj): Generator definition part of the yaml config file.
301
+ data_cfg (obj): Data definition part of the yaml config file.
302
+ """
303
+
304
+ def __init__(self, gen_cfg, data_cfg):
305
+ super(Base3DGenerator, self).__init__()
306
+ print('Base3DGenerator initialization.')
307
+
308
+ # ---------------------- Main Network ------------------------
309
+ # Exclude some of the features from positional encoding
310
+ self.pe_no_pe_feat_dim = getattr(gen_cfg, 'pe_no_pe_feat_dim', 0)
311
+
312
+ # blk_feat passes through PE
313
+ input_dim = (gen_cfg.blk_feat_dim-self.pe_no_pe_feat_dim)*(gen_cfg.pe_lvl_feat*2) + self.pe_no_pe_feat_dim
314
+ if (gen_cfg.pe_incl_orig_feat):
315
+ input_dim += (gen_cfg.blk_feat_dim-self.pe_no_pe_feat_dim)
316
+ print('[Base3DGenerator] Expected input dimensions: ', input_dim)
317
+ self.input_dim = input_dim
318
+
319
+ self.mlp_model_kwargs = gen_cfg.mlp_model_kwargs
320
+ self.pe_lvl_localcoords = getattr(gen_cfg, 'pe_lvl_localcoords', 0)
321
+ if self.pe_lvl_localcoords > 0:
322
+ self.mlp_model_kwargs['poscode_dim'] = self.pe_lvl_localcoords * 2 * 3
323
+
324
+ # Set pe_lvl_raydir=0 and pe_incl_orig_raydir=False to disable view direction input
325
+ input_dim_viewdir = 3*(gen_cfg.pe_lvl_raydir*2)
326
+ if (gen_cfg.pe_incl_orig_raydir):
327
+ input_dim_viewdir += 3
328
+ print('[Base3DGenerator] Expected viewdir input dimensions: ', input_dim_viewdir)
329
+ self.input_dim_viewdir = input_dim_viewdir
330
+
331
+ self.pe_params = [gen_cfg.pe_lvl_feat, gen_cfg.pe_incl_orig_feat,
332
+ gen_cfg.pe_lvl_raydir, gen_cfg.pe_incl_orig_raydir]
333
+
334
+ # Style input dimension
335
+ style_dims = gen_cfg.style_dims
336
+ self.style_dims = style_dims
337
+ interm_style_dims = getattr(gen_cfg, 'interm_style_dims', style_dims)
338
+ self.interm_style_dims = interm_style_dims
339
+ # ---------------------- Style MLP --------------------------
340
+ self.style_net = globals()[gen_cfg.stylenet_model](
341
+ style_dims, interm_style_dims, **gen_cfg.stylenet_model_kwargs)
342
+
343
+ # number of output channels for MLP (before blending)
344
+ final_feat_dim = getattr(gen_cfg, 'final_feat_dim', 16)
345
+ self.final_feat_dim = final_feat_dim
346
+
347
+ # ----------------------- Sky Network -------------------------
348
+ sky_input_dim_base = 3
349
+ # Dedicated sky network input dimensions
350
+ sky_input_dim = sky_input_dim_base*(gen_cfg.pe_lvl_raydir_sky*2)
351
+ if (gen_cfg.pe_incl_orig_raydir_sky):
352
+ sky_input_dim += sky_input_dim_base
353
+ print('[Base3DGenerator] Expected sky input dimensions: ', sky_input_dim)
354
+ self.pe_params_sky = [gen_cfg.pe_lvl_raydir_sky, gen_cfg.pe_incl_orig_raydir_sky]
355
+ self.sky_net = SKYMLP(sky_input_dim, style_dim=interm_style_dims, out_channels_c=final_feat_dim)
356
+
357
+ # ----------------------- Style Encoder -------------------------
358
+ style_enc_cfg = getattr(gen_cfg, 'style_enc', None)
359
+ setattr(style_enc_cfg, 'input_image_channels', 3)
360
+ setattr(style_enc_cfg, 'style_dims', gen_cfg.style_dims)
361
+ self.style_encoder = StyleEncoder(style_enc_cfg)
362
+
363
+ # ---------------------- Ray Caster -------------------------
364
+ self.num_blocks_early_stop = gen_cfg.num_blocks_early_stop
365
+ self.num_samples = gen_cfg.num_samples
366
+ self.sample_depth = gen_cfg.sample_depth
367
+ self.coarse_deterministic_sampling = getattr(gen_cfg, 'coarse_deterministic_sampling', True)
368
+ self.sample_use_box_boundaries = getattr(gen_cfg, 'sample_use_box_boundaries', True)
369
+
370
+ # ---------------------- Blender -------------------------
371
+ self.raw_noise_std = getattr(gen_cfg, 'raw_noise_std', 0.0)
372
+ self.dists_scale = getattr(gen_cfg, 'dists_scale', 0.25)
373
+ self.clip_feat_map = getattr(gen_cfg, 'clip_feat_map', True)
374
+ self.keep_sky_out = getattr(gen_cfg, 'keep_sky_out', False)
375
+ self.keep_sky_out_avgpool = getattr(gen_cfg, 'keep_sky_out_avgpool', False)
376
+ keep_sky_out_learnbg = getattr(gen_cfg, 'keep_sky_out_learnbg', False)
377
+ self.sky_global_avgpool = getattr(gen_cfg, 'sky_global_avgpool', False)
378
+ if self.keep_sky_out:
379
+ self.sky_replace_color = None
380
+ if keep_sky_out_learnbg:
381
+ sky_replace_color = torch.zeros([final_feat_dim])
382
+ sky_replace_color.requires_grad = True
383
+ self.sky_replace_color = torch.nn.Parameter(sky_replace_color)
384
+ # ---------------------- render_cnn -------------------------
385
+ self.denoiser = RenderCNN(final_feat_dim, style_dim=interm_style_dims)
386
+ self.pad = gen_cfg.pad
387
+
388
+ def get_param_groups(self, cfg_opt):
389
+ print('[Generator] get_param_groups')
390
+
391
+ if hasattr(cfg_opt, 'ignore_parameters'):
392
+ print('[Generator::get_param_groups] [x]: ignored.')
393
+ optimize_parameters = []
394
+ for k, x in self.named_parameters():
395
+ match = False
396
+ for m in cfg_opt.ignore_parameters:
397
+ if re.match(m, k) is not None:
398
+ match = True
399
+ print(' [x]', k)
400
+ break
401
+ if match is False:
402
+ print(' [v]', k)
403
+ optimize_parameters.append(x)
404
+ else:
405
+ optimize_parameters = self.parameters()
406
+
407
+ param_groups = []
408
+ param_groups.append({'params': optimize_parameters})
409
+
410
+ if hasattr(cfg_opt, 'param_groups'):
411
+ optimized_param_names = []
412
+ all_param_names = [k for k, v in self.named_parameters()]
413
+ param_groups = []
414
+ for k, v in cfg_opt.param_groups.items():
415
+ print('[Generator::get_param_groups] Adding param group from config:', k, v)
416
+ params = getattr(self, k)
417
+ named_parameters = [k]
418
+ if issubclass(type(params), nn.Module):
419
+ named_parameters = [k+'.'+pname for pname, _ in params.named_parameters()]
420
+ params = params.parameters()
421
+ param_groups.append({'params': params, **v})
422
+ optimized_param_names.extend(named_parameters)
423
+
424
+ print('[Generator::get_param_groups] UNOPTIMIZED PARAMETERS:\n ',
425
+ set(all_param_names) - set(optimized_param_names))
426
+
427
+ return param_groups
428
+
429
+ def _forward_perpix_sub(self, blk_feats, worldcoord2, raydirs_in, z, mc_masks_onehot=None):
430
+ r"""Forwarding the MLP.
431
+
432
+ Args:
433
+ blk_feats (K x C1 tensor): Sparse block features.
434
+ worldcoord2 (N x H x W x L x 3 tensor): 3D world coordinates of sampled points. L is number of samples; N is batch size, always 1.
435
+ raydirs_in (N x H x W x 1 x C2 tensor or None): ray direction embeddings.
436
+ z (N x C3 tensor): Intermediate style vectors.
437
+ mc_masks_onehot (N x H x W x L x C4): One-hot segmentation maps.
438
+ Returns:
439
+ net_out_s (N x H x W x L x 1 tensor): Opacities.
440
+ net_out_c (N x H x W x L x C5 tensor): Color embeddings.
441
+ """
442
+ proj_feature = voxlib.sparse_trilinear_interp_worldcoord(
443
+ blk_feats, self.voxel.corner_t, worldcoord2, ign_zero=True)
444
+
445
+ render_net_extra_kwargs = {}
446
+ if self.pe_lvl_localcoords > 0:
447
+ local_coords = torch.remainder(worldcoord2, 1.0) * 2.0
448
+ # Scale to [0, 2], as the positional encoding function doesn't have internal x2
449
+ local_coords[torch.isnan(local_coords)] = 0.0
450
+ local_coords = local_coords.contiguous()
451
+ poscode = voxlib.positional_encoding(local_coords, self.pe_lvl_localcoords, -1, False)
452
+ render_net_extra_kwargs['poscode'] = poscode
453
+
454
+ if self.pe_params[0] == 0 and self.pe_params[1] is True: # no PE shortcut, saves ~400MB
455
+ feature_in = proj_feature
456
+ else:
457
+ if self.pe_no_pe_feat_dim > 0:
458
+ feature_in = voxlib.positional_encoding(
459
+ proj_feature[..., :-self.pe_no_pe_feat_dim].contiguous(), self.pe_params[0], -1, self.pe_params[1])
460
+ feature_in = torch.cat([feature_in, proj_feature[..., -self.pe_no_pe_feat_dim:]], dim=-1)
461
+ else:
462
+ feature_in = voxlib.positional_encoding(
463
+ proj_feature.contiguous(), self.pe_params[0], -1, self.pe_params[1])
464
+
465
+ net_out_s, net_out_c = self.render_net(feature_in, raydirs_in, z, mc_masks_onehot, **render_net_extra_kwargs)
466
+
467
+ if self.raw_noise_std > 0.:
468
+ noise = torch.randn_like(net_out_s) * self.raw_noise_std
469
+ net_out_s = net_out_s + noise
470
+
471
+ return net_out_s, net_out_c
472
+
473
+ def _forward_perpix(self, blk_feats, voxel_id, depth2, raydirs, cam_ori_t, z):
474
+ r"""Sample points along rays, forwarding the per-point MLP and aggregate pixel features
475
+
476
+ Args:
477
+ blk_feats (K x C1 tensor): Sparse block features.
478
+ voxel_id (N x H x W x M x 1 tensor): Voxel ids from ray-voxel intersection test. M: num intersected voxels, why always 6?
479
+ depth2 (N x 2 x H x W x M x 1 tensor): Depths of entrance and exit points for each ray-voxel intersection.
480
+ raydirs (N x H x W x 1 x 3 tensor): The direction of each ray.
481
+ cam_ori_t (N x 3 tensor): Camera origins.
482
+ z (N x C3 tensor): Intermediate style vectors.
483
+ """
484
+ # Generate sky_mask; PE transform on ray direction.
485
+ with torch.no_grad():
486
+ raydirs_in = raydirs.expand(-1, -1, -1, 1, -1).contiguous()
487
+ if self.pe_params[2] == 0 and self.pe_params[3] is True:
488
+ raydirs_in = raydirs_in
489
+ elif self.pe_params[2] == 0 and self.pe_params[3] is False: # Not using raydir at all
490
+ raydirs_in = None
491
+ else:
492
+ raydirs_in = voxlib.positional_encoding(raydirs_in, self.pe_params[2], -1, self.pe_params[3])
493
+
494
+ # sky_mask: when True, ray finally hits sky
495
+ sky_mask = voxel_id[:, :, :, [-1], :] == 0
496
+ # sky_only_mask: when True, ray hits nothing but sky
497
+ sky_only_mask = voxel_id[:, :, :, [0], :] == 0
498
+
499
+ with torch.no_grad():
500
+ # Random sample points along the ray
501
+ num_samples = self.num_samples + 1
502
+ if self.sample_use_box_boundaries:
503
+ num_samples = self.num_samples - self.num_blocks_early_stop
504
+
505
+ # 10 samples per ray + 4 intersections - 2
506
+ rand_depth, new_dists, new_idx = mc_utils.sample_depth_batched(
507
+ depth2, num_samples, deterministic=self.coarse_deterministic_sampling,
508
+ use_box_boundaries=self.sample_use_box_boundaries, sample_depth=self.sample_depth)
509
+
510
+ worldcoord2 = raydirs * rand_depth + cam_ori_t[:, None, None, None, :]
511
+
512
+ # Generate per-sample segmentation label
513
+ voxel_id_reduced = self.label_trans.mc2reduced(voxel_id, ign2dirt=True)
514
+ mc_masks = torch.gather(voxel_id_reduced, -2, new_idx) # B 256 256 N 1
515
+ mc_masks = mc_masks.long()
516
+ mc_masks_onehot = torch.zeros([mc_masks.size(0), mc_masks.size(1), mc_masks.size(
517
+ 2), mc_masks.size(3), self.num_reduced_labels], dtype=torch.float, device=voxel_id.device)
518
+ # mc_masks_onehot: [B H W Nlayer 680]
519
+ mc_masks_onehot.scatter_(-1, mc_masks, 1.0)
520
+
521
+ net_out_s, net_out_c = self._forward_perpix_sub(blk_feats, worldcoord2, raydirs_in, z, mc_masks_onehot)
522
+
523
+ # Handle sky
524
+ sky_raydirs_in = raydirs.expand(-1, -1, -1, 1, -1).contiguous()
525
+ sky_raydirs_in = voxlib.positional_encoding(sky_raydirs_in, self.pe_params_sky[0], -1, self.pe_params_sky[1])
526
+ skynet_out_c = self.sky_net(sky_raydirs_in, z)
527
+
528
+ # Blending
529
+ weights = mc_utils.volum_rendering_relu(net_out_s, new_dists * self.dists_scale, dim=-2)
530
+
531
+ # If a ray exclusively hits the sky (no intersection with the voxels), set its weight to zero.
532
+ weights = weights * torch.logical_not(sky_only_mask).float()
533
+ total_weights_raw = torch.sum(weights, dim=-2, keepdim=True) # 256 256 1 1
534
+ total_weights = total_weights_raw
535
+
536
+ is_gnd = worldcoord2[..., [0]] <= 1.0 # Y X Z, [256, 256, 4, 3], nan < 1.0 == False
537
+ is_gnd = is_gnd.any(dim=-2, keepdim=True)
538
+ nosky_mask = torch.logical_or(torch.logical_not(sky_mask), is_gnd)
539
+ nosky_mask = nosky_mask.float()
540
+
541
+ # Avoid sky leakage
542
+ sky_weight = 1.0-total_weights
543
+ if self.keep_sky_out:
544
+ # keep_sky_out_avgpool overrides sky_replace_color
545
+ if self.sky_replace_color is None or self.keep_sky_out_avgpool:
546
+ if self.keep_sky_out_avgpool:
547
+ if hasattr(self, 'sky_avg'):
548
+ sky_avg = self.sky_avg
549
+ else:
550
+ if self.sky_global_avgpool:
551
+ sky_avg = torch.mean(skynet_out_c, dim=[1, 2], keepdim=True)
552
+ else:
553
+ skynet_out_c_nchw = skynet_out_c.permute(0, 4, 1, 2, 3).squeeze(-1).contiguous()
554
+ sky_avg = F.avg_pool2d(skynet_out_c_nchw, 31, stride=1, padding=15, count_include_pad=False)
555
+ sky_avg = sky_avg.permute(0, 2, 3, 1).unsqueeze(-2).contiguous()
556
+ # print(sky_avg.shape)
557
+ skynet_out_c = skynet_out_c * (1.0-nosky_mask) + sky_avg*(nosky_mask)
558
+ else:
559
+ sky_weight = sky_weight * (1.0-nosky_mask)
560
+ else:
561
+ skynet_out_c = skynet_out_c * (1.0-nosky_mask) + self.sky_replace_color*(nosky_mask)
562
+
563
+ if self.clip_feat_map is True: # intermediate feature before blending & CNN
564
+ rgbs = torch.clamp(net_out_c, -1, 1) + 1
565
+ rgbs_sky = torch.clamp(skynet_out_c, -1, 1) + 1
566
+ net_out = torch.sum(weights*rgbs, dim=-2, keepdim=True) + sky_weight * \
567
+ rgbs_sky # 576, 768, 4, 3 -> 576, 768, 3
568
+ net_out = net_out.squeeze(-2)
569
+ net_out = net_out - 1
570
+ elif self.clip_feat_map is False:
571
+ rgbs = net_out_c
572
+ rgbs_sky = skynet_out_c
573
+ net_out = torch.sum(weights*rgbs, dim=-2, keepdim=True) + sky_weight * \
574
+ rgbs_sky # 576, 768, 4, 3 -> 576, 768, 3
575
+ net_out = net_out.squeeze(-2)
576
+ elif self.clip_feat_map == 'tanh':
577
+ rgbs = torch.tanh(net_out_c)
578
+ rgbs_sky = torch.tanh(skynet_out_c)
579
+ net_out = torch.sum(weights*rgbs, dim=-2, keepdim=True) + sky_weight * \
580
+ rgbs_sky # 576, 768, 4, 3 -> 576, 768, 3
581
+ net_out = net_out.squeeze(-2)
582
+ else:
583
+ raise NotImplementedError
584
+
585
+ return net_out, new_dists, weights, total_weights_raw, rand_depth, net_out_s, net_out_c, skynet_out_c, \
586
+ nosky_mask, sky_mask, sky_only_mask, new_idx
587
+
588
+ def _forward_global(self, net_out, z):
589
+ r"""Forward the CNN
590
+
591
+ Args:
592
+ net_out (N x C5 x H x W tensor): Intermediate feature maps.
593
+ z (N x C3 tensor): Intermediate style vectors.
594
+
595
+ Returns:
596
+ fake_images (N x 3 x H x W tensor): Output image.
597
+ fake_images_raw (N x 3 x H x W tensor): Output image before TanH.
598
+ """
599
+ fake_images = net_out.permute(0, 3, 1, 2).contiguous()
600
+ fake_images_raw = self.denoiser(fake_images, z)
601
+ fake_images = torch.tanh(fake_images_raw)
602
+
603
+ return fake_images, fake_images_raw
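For reference, the blending above relies on mc_utils.volum_rendering_relu to turn per-sample densities into compositing weights. A minimal, self-contained sketch of that kind of weight computation (an assumption about its behavior, not the repository's implementation):

import torch

def volume_rendering_weights(sigma, dists, dim=-2):
    # Non-negative densities -> per-sample alphas: alpha = 1 - exp(-relu(sigma) * dist).
    alpha = 1.0 - torch.exp(-torch.relu(sigma) * dists)
    # Transmittance: product of (1 - alpha) over all earlier samples along the ray.
    ones = torch.ones_like(alpha.narrow(dim, 0, 1))
    trans = torch.cumprod(torch.cat([ones, 1.0 - alpha + 1e-10], dim=dim), dim=dim)
    trans = trans.narrow(dim, 0, alpha.size(dim))
    return alpha * trans  # weights along `dim`; their sum is the accumulated opacity

weights = volume_rendering_weights(torch.randn(1, 4, 4, 10, 1), torch.rand(1, 4, 4, 10, 1))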
imaginaire/generators/scenedreamer.py ADDED
@@ -0,0 +1,851 @@
1
+ # Using Hashgrid as backbone representation
2
+
3
+ import os
4
+ import cv2
5
+ import imageio
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ import imaginaire.model_utils.gancraft.camctl as camctl
12
+ import imaginaire.model_utils.gancraft.mc_utils as mc_utils
13
+ import imaginaire.model_utils.gancraft.voxlib as voxlib
14
+ from imaginaire.model_utils.pcg_gen import PCGVoxelGenerator, PCGCache
15
+ from imaginaire.utils.distributed import master_only_print as print
16
+ from imaginaire.generators.gancraft_base import Base3DGenerator
17
+ from encoding import get_encoder
18
+
19
+ from imaginaire.model_utils.layers import LightningMLP, ConditionalHashGrid
20
+
21
+ class Generator(Base3DGenerator):
22
+ r"""SceneDreamer generator constructor.
23
+
24
+ Args:
25
+ gen_cfg (obj): Generator definition part of the yaml config file.
26
+ data_cfg (obj): Data definition part of the yaml config file.
27
+ """
28
+
29
+ def __init__(self, gen_cfg, data_cfg):
30
+ super(Generator, self).__init__(gen_cfg, data_cfg)
31
+ print('SceneDreamer[Hash] on ALL Scenes generator initialization.')
32
+
33
+ # here should be a list of height maps and semantic maps
34
+ if gen_cfg.pcg_cache:
35
+ print('[Generator] Loading PCG dataset: ', gen_cfg.pcg_dataset_path)
36
+ self.voxel = PCGCache(gen_cfg.pcg_dataset_path)
37
+ print('[Generator] Loaded PCG dataset.')
38
+ else:
39
+ self.voxel = PCGVoxelGenerator(gen_cfg.scene_size)
40
+ self.blk_feats = None
41
+ # Minecraft -> SPADE label translator.
42
+ self.label_trans = mc_utils.MCLabelTranslator()
43
+ self.num_reduced_labels = self.label_trans.get_num_reduced_lbls()
44
+ self.reduced_label_set = getattr(gen_cfg, 'reduced_label_set', False)
45
+ self.use_label_smooth = getattr(gen_cfg, 'use_label_smooth', False)
46
+ self.use_label_smooth_real = getattr(gen_cfg, 'use_label_smooth_real', self.use_label_smooth)
47
+ self.use_label_smooth_pgt = getattr(gen_cfg, 'use_label_smooth_pgt', False)
48
+ self.label_smooth_dia = getattr(gen_cfg, 'label_smooth_dia', 11)
49
+
50
+ # Load MLP model.
51
+ self.hash_encoder, self.hash_in_dim = get_encoder(encoding='hashgrid', input_dim=5, desired_resolution=2048 * 1, level_dim=8)
52
+ self.render_net = LightningMLP(self.hash_in_dim, viewdir_dim=self.input_dim_viewdir, style_dim=self.interm_style_dims, mask_dim=self.num_reduced_labels, out_channels_s=1, out_channels_c=self.final_feat_dim, **self.mlp_model_kwargs)
53
+ print(self.hash_encoder)
54
+ self.world_encoder = ConditionalHashGrid()
55
+
56
+ # Camera sampler.
57
+ self.camera_sampler_type = getattr(gen_cfg, 'camera_sampler_type', "random")
58
+ assert self.camera_sampler_type in ['random', 'traditional']
59
+ self.camera_min_entropy = getattr(gen_cfg, 'camera_min_entropy', -1)
60
+ self.camera_rej_avg_depth = getattr(gen_cfg, 'camera_rej_avg_depth', -1)
61
+ self.cam_res = gen_cfg.cam_res
62
+ self.crop_size = gen_cfg.crop_size
63
+
64
+ print('Done with the SceneDreamer initialization.')
65
+
66
+ def custom_init(self):
67
+ r"""Weight initialization."""
68
+
69
+ def init_func(m):
70
+ if hasattr(m, 'weight'):
71
+ try:
72
+ nn.init.kaiming_normal_(m.weight.data, a=0.2, nonlinearity='leaky_relu')
73
+ except Exception:
74
+ print(m)
75
+ m.weight.data *= 0.5
76
+ if hasattr(m, 'bias') and m.bias is not None:
77
+ m.bias.data.fill_(0.0)
78
+ self.apply(init_func)
79
+
80
+ def _get_batch(self, batch_size, device):
81
+ r"""Sample camera poses and perform ray-voxel intersection.
82
+
83
+ Args:
84
+ batch_size (int): Expected batch size of the current batch
85
+ device (torch.device): Device on which the tensors should be stored
86
+ """
87
+ with torch.no_grad():
88
+ self.voxel.sample_world(device)
89
+ voxel_id_batch = []
90
+ depth2_batch = []
91
+ raydirs_batch = []
92
+ cam_ori_t_batch = []
93
+ for b in range(batch_size):
94
+ while True: # Rejection sampling.
95
+ # Sample camera pose.
96
+ if self.camera_sampler_type == 'random':
97
+ cam_res = self.cam_res
98
+ cam_ori_t, cam_dir_t, cam_up_t = camctl.rand_camera_pose_thridperson2(self.voxel)
99
+ # ~24mm fov horizontal.
100
+ cam_f = 0.5/np.tan(np.deg2rad(73/2) * (np.random.rand(1)*0.5+0.5)) * (cam_res[1]-1)
101
+ cam_c = [(cam_res[0]-1)/2, (cam_res[1]-1)/2]
102
+ cam_res_crop = [self.crop_size[0] + self.pad, self.crop_size[1] + self.pad]
103
+ cam_c = mc_utils.rand_crop(cam_c, cam_res, cam_res_crop)
104
+ elif self.camera_sampler_type == 'traditional':
105
+ cam_res = self.cam_res
106
+ cam_c = [(cam_res[0]-1)/2, (cam_res[1]-1)/2]
107
+ dice = torch.rand(1).item()
108
+ if dice > 0.5:
109
+ cam_ori_t, cam_dir_t, cam_up_t, cam_f = \
110
+ camctl.rand_camera_pose_tour(self.voxel)
111
+ cam_f = cam_f * (cam_res[1]-1)
112
+ else:
113
+ cam_ori_t, cam_dir_t, cam_up_t = \
114
+ camctl.rand_camera_pose_thridperson2(self.voxel)
115
+ # ~24mm fov horizontal.
116
+ cam_f = 0.5 / np.tan(np.deg2rad(73/2) * (np.random.rand(1)*0.5+0.5)) * (cam_res[1]-1)
117
+
118
+ cam_res_crop = [self.crop_size[0] + self.pad, self.crop_size[1] + self.pad]
119
+ cam_c = mc_utils.rand_crop(cam_c, cam_res, cam_res_crop)
120
+ else:
121
+ raise NotImplementedError(
122
+ 'Unknown self.camera_sampler_type: {}'.format(self.camera_sampler_type))
123
+
124
+ # Run ray-voxel intersection test
125
+ voxel_id, depth2, raydirs = voxlib.ray_voxel_intersection_perspective(
126
+ self.voxel.voxel_t, cam_ori_t, cam_dir_t, cam_up_t, cam_f, cam_c, cam_res_crop,
127
+ self.num_blocks_early_stop)
128
+
129
+ if self.camera_rej_avg_depth > 0:
130
+ depth_map = depth2[0, :, :, 0, :]
131
+ avg_depth = torch.mean(depth_map[~torch.isnan(depth_map)])
132
+ if avg_depth < self.camera_rej_avg_depth:
133
+ continue
134
+
135
+ # Reject low entropy.
136
+ if self.camera_min_entropy > 0:
137
+ # Check entropy.
138
+ maskcnt = torch.bincount(
139
+ torch.flatten(voxel_id[:, :, 0, 0]), weights=None, minlength=680).float() / \
140
+ (voxel_id.size(0)*voxel_id.size(1))
141
+ maskentropy = -torch.sum(maskcnt * torch.log(maskcnt+1e-10))
142
+ if maskentropy < self.camera_min_entropy:
143
+ continue
144
+ break
145
+
146
+ voxel_id_batch.append(voxel_id)
147
+ depth2_batch.append(depth2)
148
+ raydirs_batch.append(raydirs)
149
+ cam_ori_t_batch.append(cam_ori_t)
150
+ voxel_id = torch.stack(voxel_id_batch, dim=0)
151
+ depth2 = torch.stack(depth2_batch, dim=0)
152
+ raydirs = torch.stack(raydirs_batch, dim=0)
153
+ cam_ori_t = torch.stack(cam_ori_t_batch, dim=0).to(device)
154
+ cam_poses = None
155
+ return voxel_id, depth2, raydirs, cam_ori_t, cam_poses
156
+
157
+
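# Illustrative sketch (not part of the file above): the rejection sampling in _get_batch keeps
# drawing camera poses until the rendered view passes a minimum average-depth check and a
# minimum label-entropy check. Thresholds and shapes below are assumptions for illustration.
import torch

def label_entropy(label_map, num_classes=680):
    counts = torch.bincount(label_map.flatten(), minlength=num_classes).float()
    p = counts / label_map.numel()
    return -torch.sum(p * torch.log(p + 1e-10))

def accept_view(depth_map, label_map, min_avg_depth=2.0, min_entropy=0.75):
    avg_depth = torch.mean(depth_map[~torch.isnan(depth_map)])
    return bool(avg_depth >= min_avg_depth) and bool(label_entropy(label_map) >= min_entropy)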
158
+ def get_pseudo_gt(self, pseudo_gen, voxel_id, z=None, style_img=None, resize_512=True, deterministic=False):
159
+ r"""Evaluating img2img network to obtain pseudo-ground truth images.
160
+
161
+ Args:
162
+ pseudo_gen (callable): Function converting mask to image using img2img network.
163
+ voxel_id (N x img_dims[0] x img_dims[1] x max_samples x 1 tensor): IDs of intersected voxels along
164
+ each ray.
165
+ z (N x C tensor): Optional style code passed to pseudo_gen.
166
+ style_img (N x 3 x H x W tensor): Optional style image passed to pseudo_gen.
167
+ resize_512 (bool): If True, evaluate pseudo_gen at 512x512 regardless of input resolution.
168
+ deterministic (bool): If True, disable stochastic label mapping.
169
+ """
170
+ with torch.no_grad():
171
+ mc_mask = voxel_id[:, :, :, 0, :].permute(0, 3, 1, 2).long().contiguous()
172
+ coco_mask = self.label_trans.mc2coco(mc_mask) - 1
173
+ coco_mask[coco_mask < 0] = 183
174
+
175
+ if not deterministic:
176
+ # Stochastic mapping
177
+ dice = torch.rand(1).item()
178
+ if dice > 0.5 and dice < 0.9:
179
+ coco_mask[coco_mask == self.label_trans.gglbl2ggid('sky')] = self.label_trans.gglbl2ggid('clouds')
180
+ elif dice >= 0.9:
181
+ coco_mask[coco_mask == self.label_trans.gglbl2ggid('sky')] = self.label_trans.gglbl2ggid('fog')
182
+ dice = torch.rand(1).item()
183
+ if dice > 0.33 and dice < 0.66:
184
+ coco_mask[coco_mask == self.label_trans.gglbl2ggid('water')] = self.label_trans.gglbl2ggid('sea')
185
+ elif dice >= 0.66:
186
+ coco_mask[coco_mask == self.label_trans.gglbl2ggid('water')] = self.label_trans.gglbl2ggid('river')
187
+
188
+ fake_masks = torch.zeros([coco_mask.size(0), 185, coco_mask.size(2), coco_mask.size(3)],
189
+ dtype=torch.half, device=voxel_id.device)
190
+ fake_masks.scatter_(1, coco_mask, 1.0)
191
+
192
+ if self.use_label_smooth_pgt:
193
+ fake_masks = mc_utils.segmask_smooth(fake_masks, kernel_size=self.label_smooth_dia)
194
+ if self.pad > 0:
195
+ fake_masks = fake_masks[:, :, self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
196
+
197
+ # Generate pseudo GT using GauGAN.
198
+ if resize_512:
199
+ fake_masks_512 = F.interpolate(fake_masks, size=[512, 512], mode='nearest')
200
+ else:
201
+ fake_masks_512 = fake_masks
202
+ pseudo_real_img = pseudo_gen(fake_masks_512, z=z, style_img=style_img)
203
+
204
+ # NaN/Inf guard. NaN can occur on Volta GPUs.
205
+ nan_mask = torch.isnan(pseudo_real_img)
206
+ inf_mask = torch.isinf(pseudo_real_img)
207
+ pseudo_real_img[nan_mask | inf_mask] = 0.0
208
+ if resize_512:
209
+ pseudo_real_img = F.interpolate(
210
+ pseudo_real_img, size=[fake_masks.size(2), fake_masks.size(3)], mode='area')
211
+ pseudo_real_img = torch.clamp(pseudo_real_img, -1, 1)
212
+
213
+ return pseudo_real_img, fake_masks
214
+
215
+
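# Illustrative sketch (not part of the file above): the one-hot masks in get_pseudo_gt and
# sample_camera are built with scatter_ along the channel dimension, e.g.:
import torch

labels = torch.randint(0, 185, (1, 1, 4, 4))            # N x 1 x H x W integer labels
onehot = torch.zeros(1, 185, 4, 4, dtype=torch.float)   # N x C x H x W
onehot.scatter_(1, labels, 1.0)                          # one 1.0 per pixel, at its class channel
assert torch.equal(onehot.argmax(dim=1, keepdim=True), labels)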
216
+ def sample_camera(self, data, pseudo_gen):
217
+ r"""Sample camera randomly and precompute everything used by both Gen and Dis.
218
+
219
+ Args:
220
+ data (dict):
221
+ images (N x 3 x H x W tensor) : Real images
222
+ label (N x C2 x H x W tensor) : Segmentation map
223
+ pseudo_gen (callable): Function converting mask to image using img2img network.
224
+ Returns:
225
+ ret (dict):
226
+ voxel_id (N x H x W x max_samples x 1 tensor): IDs of intersected voxels along each ray.
227
+ depth2 (N x 2 x H x W x max_samples x 1 tensor): Depths of entrance and exit points for each ray-voxel
228
+ intersection.
229
+ raydirs (N x H x W x 1 x 3 tensor): The direction of each ray.
230
+ cam_ori_t (N x 3 tensor): Camera origins.
231
+ pseudo_real_img (N x 3 x H x W tensor): Pseudo-ground truth image.
232
+ real_masks (N x C3 x H x W tensor): One-hot segmentation map for real images, with translated labels.
233
+ fake_masks (N x C3 x H x W tensor): One-hot segmentation map for sampled camera views.
234
+ """
235
+ device = torch.device('cuda')
236
+ batch_size = data['images'].size(0)
237
+ # ================ Assemble a batch ==================
238
+ # Requires: voxel_id, depth2, raydirs, cam_ori_t.
239
+ voxel_id, depth2, raydirs, cam_ori_t, _ = self._get_batch(batch_size, device)
240
+ ret = {'voxel_id': voxel_id, 'depth2': depth2, 'raydirs': raydirs, 'cam_ori_t': cam_ori_t}
241
+
242
+ if pseudo_gen is not None:
243
+ pseudo_real_img, _ = self.get_pseudo_gt(pseudo_gen, voxel_id)
244
+ ret['pseudo_real_img'] = pseudo_real_img.float()
245
+
246
+ # =============== Mask translation ================
247
+ real_masks = data['label']
248
+ if self.reduced_label_set:
249
+ # Translate fake mask (directly from mcid).
250
+ # convert unrecognized labels to 'dirt'.
251
+ # N C H W [1 1 80 80]
252
+ reduce_fake_mask = self.label_trans.mc2reduced(
253
+ voxel_id[:, :, :, 0, :].permute(0, 3, 1, 2).long().contiguous()
254
+ , ign2dirt=True)
255
+ reduce_fake_mask_onehot = torch.zeros([
256
+ reduce_fake_mask.size(0), self.num_reduced_labels, reduce_fake_mask.size(2), reduce_fake_mask.size(3)],
257
+ dtype=torch.float, device=device)
258
+ reduce_fake_mask_onehot.scatter_(1, reduce_fake_mask, 1.0)
259
+ fake_masks = reduce_fake_mask_onehot
260
+ if self.pad != 0:
261
+ fake_masks = fake_masks[:, :, self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
262
+
263
+ # Translate real mask (data['label']), which is onehot.
264
+ real_masks_idx = torch.argmax(real_masks, dim=1, keepdim=True)
265
+ real_masks_idx[real_masks_idx > 182] = 182
266
+
267
+ reduced_real_mask = self.label_trans.coco2reduced(real_masks_idx)
268
+ reduced_real_mask_onehot = torch.zeros([
269
+ reduced_real_mask.size(0), self.num_reduced_labels, reduced_real_mask.size(2),
270
+ reduced_real_mask.size(3)], dtype=torch.float, device=device)
271
+ reduced_real_mask_onehot.scatter_(1, reduced_real_mask, 1.0)
272
+ real_masks = reduced_real_mask_onehot
273
+
274
+ # Mask smoothing.
275
+ if self.use_label_smooth:
276
+ fake_masks = mc_utils.segmask_smooth(fake_masks, kernel_size=self.label_smooth_dia)
277
+ if self.use_label_smooth_real:
278
+ real_masks = mc_utils.segmask_smooth(real_masks, kernel_size=self.label_smooth_dia)
279
+
280
+ ret['real_masks'] = real_masks
281
+ ret['fake_masks'] = fake_masks
282
+
283
+ return ret
284
+
285
+ def _forward_perpix_sub(self, blk_feats, worldcoord2, raydirs_in, z, mc_masks_onehot=None, global_enc=None):
286
+ r"""Per-pixel rendering forwarding
287
+
288
+ Args:
289
+ blk_feats: Deprecated
290
+ worldcoord2 (N x H x W x L x 3 tensor): 3D world coordinates of sampled points. L is number of samples; N is batch size, always 1.
291
+ raydirs_in (N x H x W x 1 x C2 tensor or None): ray direction embeddings.
292
+ z (N x C3 tensor): Intermediate style vectors.
293
+ mc_masks_onehot (N x H x W x L x C4): One-hot segmentation maps.
294
+ Returns:
295
+ net_out_s (N x H x W x L x 1 tensor): Opacities.
296
+ net_out_c (N x H x W x L x C5 tensor): Color embeddings.
297
+ """
298
+ _x, _y, _z = self.voxel.voxel_t.shape
299
+ delimeter = torch.Tensor([_x, _y, _z]).to(worldcoord2)
300
+ normalized_coord = worldcoord2 / delimeter * 2 - 1
301
+ global_enc = global_enc[:, None, None, None, :].repeat(1, normalized_coord.shape[1], normalized_coord.shape[2], normalized_coord.shape[3], 1)
302
+ normalized_coord = torch.cat([normalized_coord, global_enc], dim=-1)
303
+ feature_in = self.hash_encoder(normalized_coord)
304
+
305
+ net_out_s, net_out_c = self.render_net(feature_in, raydirs_in, z, mc_masks_onehot)
306
+
307
+ if self.raw_noise_std > 0.:
308
+ noise = torch.randn_like(net_out_s) * self.raw_noise_std
309
+ net_out_s = net_out_s + noise
310
+
311
+ return net_out_s, net_out_c
312
+
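# Illustrative sketch (not part of the file above): _forward_perpix_sub maps world coordinates
# into [-1, 1] using the voxel grid extents and broadcasts the per-scene code to every sample
# before hash encoding. Shapes and the 2-D code dimension are assumptions for illustration.
import torch

worldcoord = torch.rand(1, 8, 8, 10, 3) * 1024.0    # N x H x W x L x 3, in voxel units
extents = torch.tensor([1024.0, 1024.0, 1024.0])    # voxel grid size (Y, X, Z)
normalized = worldcoord / extents * 2 - 1            # roughly in [-1, 1]

global_code = torch.rand(1, 2)                       # per-scene conditioning vector
global_code = global_code[:, None, None, None, :].expand(-1, 8, 8, 10, -1)
hash_input = torch.cat([normalized, global_code], dim=-1)   # last dim = 3 + 2 = 5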
313
+ def _forward_perpix(self, blk_feats, voxel_id, depth2, raydirs, cam_ori_t, z, global_enc):
314
+ r"""Sample points along rays, forwarding the per-point MLP and aggregate pixel features
315
+
316
+ Args:
317
+ blk_feats (K x C1 tensor): Deprecated
318
+ voxel_id (N x H x W x M x 1 tensor): Voxel ids from the ray-voxel intersection test. M is the number of intersected voxels per ray (num_blocks_early_stop).
319
+ depth2 (N x 2 x H x W x M x 1 tensor): Depths of entrance and exit points for each ray-voxel intersection.
320
+ raydirs (N x H x W x 1 x 3 tensor): The direction of each ray.
321
+ cam_ori_t (N x 3 tensor): Camera origins.
322
+ z (N x C3 tensor): Intermediate style vectors.
323
+ """
324
+ # Generate sky_mask; PE transform on ray direction.
325
+ with torch.no_grad():
326
+ raydirs_in = raydirs.expand(-1, -1, -1, 1, -1).contiguous()
327
+ if self.pe_params[2] == 0 and self.pe_params[3] is True:
328
+ raydirs_in = raydirs_in
329
+ elif self.pe_params[2] == 0 and self.pe_params[3] is False: # Not using raydir at all
330
+ raydirs_in = None
331
+ else:
332
+ raydirs_in = voxlib.positional_encoding(raydirs_in, self.pe_params[2], -1, self.pe_params[3])
333
+
334
+ # sky_mask: when True, ray finally hits sky
335
+ sky_mask = voxel_id[:, :, :, [-1], :] == 0
336
+ # sky_only_mask: when True, ray hits nothing but sky
337
+ sky_only_mask = voxel_id[:, :, :, [0], :] == 0
338
+
339
+ with torch.no_grad():
340
+ # Random sample points along the ray
341
+ num_samples = self.num_samples + 1
342
+ if self.sample_use_box_boundaries:
343
+ num_samples = self.num_samples - self.num_blocks_early_stop
344
+
345
+ # 10 samples per ray + 4 intersections - 2
346
+ rand_depth, new_dists, new_idx = mc_utils.sample_depth_batched(
347
+ depth2, num_samples, deterministic=self.coarse_deterministic_sampling,
348
+ use_box_boundaries=self.sample_use_box_boundaries, sample_depth=self.sample_depth)
349
+
350
+ nan_mask = torch.isnan(rand_depth)
351
+ inf_mask = torch.isinf(rand_depth)
352
+ rand_depth[nan_mask | inf_mask] = 0.0
353
+
354
+ worldcoord2 = raydirs * rand_depth + cam_ori_t[:, None, None, None, :]
355
+
356
+ # Generate per-sample segmentation label
357
+ voxel_id_reduced = self.label_trans.mc2reduced(voxel_id, ign2dirt=True)
358
+ mc_masks = torch.gather(voxel_id_reduced, -2, new_idx) # B 256 256 N 1
359
+ mc_masks = mc_masks.long()
360
+ mc_masks_onehot = torch.zeros([mc_masks.size(0), mc_masks.size(1), mc_masks.size(
361
+ 2), mc_masks.size(3), self.num_reduced_labels], dtype=torch.float, device=voxel_id.device)
362
+ # mc_masks_onehot: [B H W Nlayer 680]
363
+ mc_masks_onehot.scatter_(-1, mc_masks, 1.0)
364
+
365
+ net_out_s, net_out_c = self._forward_perpix_sub(blk_feats, worldcoord2, raydirs_in, z, mc_masks_onehot, global_enc)
366
+
367
+ # Handle sky
368
+ sky_raydirs_in = raydirs.expand(-1, -1, -1, 1, -1).contiguous()
369
+ sky_raydirs_in = voxlib.positional_encoding(sky_raydirs_in, self.pe_params_sky[0], -1, self.pe_params_sky[1])
370
+ skynet_out_c = self.sky_net(sky_raydirs_in, z)
371
+
372
+ # Blending
373
+ weights = mc_utils.volum_rendering_relu(net_out_s, new_dists * self.dists_scale, dim=-2)
374
+
375
+ # If a ray exclusively hits the sky (no intersection with the voxels), set its weight to zero.
376
+ weights = weights * torch.logical_not(sky_only_mask).float()
377
+ total_weights_raw = torch.sum(weights, dim=-2, keepdim=True) # 256 256 1 1
378
+ total_weights = total_weights_raw
379
+
380
+ is_gnd = worldcoord2[..., [0]] <= 1.0 # Y X Z, [256, 256, 4, 3], nan < 1.0 == False
381
+ is_gnd = is_gnd.any(dim=-2, keepdim=True)
382
+ nosky_mask = torch.logical_or(torch.logical_not(sky_mask), is_gnd)
383
+ nosky_mask = nosky_mask.float()
384
+
385
+ # Avoid sky leakage
386
+ sky_weight = 1.0-total_weights
387
+ if self.keep_sky_out:
388
+ # keep_sky_out_avgpool overrides sky_replace_color
389
+ if self.sky_replace_color is None or self.keep_sky_out_avgpool:
390
+ if self.keep_sky_out_avgpool:
391
+ if hasattr(self, 'sky_avg'):
392
+ sky_avg = self.sky_avg
393
+ else:
394
+ if self.sky_global_avgpool:
395
+ sky_avg = torch.mean(skynet_out_c, dim=[1, 2], keepdim=True)
396
+ else:
397
+ skynet_out_c_nchw = skynet_out_c.permute(0, 4, 1, 2, 3).squeeze(-1).contiguous()
398
+ sky_avg = F.avg_pool2d(skynet_out_c_nchw, 31, stride=1, padding=15, count_include_pad=False)
399
+ sky_avg = sky_avg.permute(0, 2, 3, 1).unsqueeze(-2).contiguous()
400
+ # print(sky_avg.shape)
401
+ skynet_out_c = skynet_out_c * (1.0-nosky_mask) + sky_avg*(nosky_mask)
402
+ else:
403
+ sky_weight = sky_weight * (1.0-nosky_mask)
404
+ else:
405
+ skynet_out_c = skynet_out_c * (1.0-nosky_mask) + self.sky_replace_color*(nosky_mask)
406
+
407
+ if self.clip_feat_map is True: # intermediate feature before blending & CNN
408
+ rgbs = torch.clamp(net_out_c, -1, 1) + 1
409
+ rgbs_sky = torch.clamp(skynet_out_c, -1, 1) + 1
410
+ net_out = torch.sum(weights*rgbs, dim=-2, keepdim=True) + sky_weight * \
411
+ rgbs_sky # 576, 768, 4, 3 -> 576, 768, 3
412
+ net_out = net_out.squeeze(-2)
413
+ net_out = net_out - 1
414
+ elif self.clip_feat_map is False:
415
+ rgbs = net_out_c
416
+ rgbs_sky = skynet_out_c
417
+ net_out = torch.sum(weights*rgbs, dim=-2, keepdim=True) + sky_weight * \
418
+ rgbs_sky # 576, 768, 4, 3 -> 576, 768, 3
419
+ net_out = net_out.squeeze(-2)
420
+ elif self.clip_feat_map == 'tanh':
421
+ rgbs = torch.tanh(net_out_c)
422
+ rgbs_sky = torch.tanh(skynet_out_c)
423
+ net_out = torch.sum(weights*rgbs, dim=-2, keepdim=True) + sky_weight * \
424
+ rgbs_sky # 576, 768, 4, 3 -> 576, 768, 3
425
+ net_out = net_out.squeeze(-2)
426
+ else:
427
+ raise NotImplementedError
428
+
429
+ return net_out, new_dists, weights, total_weights_raw, rand_depth, net_out_s, net_out_c, skynet_out_c, \
430
+ nosky_mask, sky_mask, sky_only_mask, new_idx
431
+
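# Illustrative sketch (not part of the file above): the blending at the end of _forward_perpix
# composites the per-sample colors with the volume-rendering weights and fills the remaining
# transmittance with the sky color.
import torch

weights = torch.rand(1, 4, 4, 10, 1)      # N x H x W x L x 1 volume-rendering weights
rgbs = torch.rand(1, 4, 4, 10, 3)         # per-sample color features
sky_rgb = torch.rand(1, 4, 4, 1, 3)       # per-ray sky color

opacity = weights.sum(dim=-2, keepdim=True)
pixel = (weights * rgbs).sum(dim=-2, keepdim=True) + (1.0 - opacity) * sky_rgb
pixel = pixel.squeeze(-2)                 # N x H x W x 3 blended feature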
432
+ def forward(self, data, random_style=False):
433
+ r"""SceneDreamer forward.
434
+ """
435
+ device = torch.device('cuda')
436
+ batch_size = data['images'].size(0)
437
+ # Requires: voxel_id, depth2, raydirs, cam_ori_t.
438
+ voxel_id, depth2, raydirs, cam_ori_t = data['voxel_id'], data['depth2'], data['raydirs'], data['cam_ori_t']
439
+ if 'pseudo_real_img' in data:
440
+ pseudo_real_img = data['pseudo_real_img']
441
+
442
+ global_enc = self.world_encoder(self.voxel.current_height_map, self.voxel.current_semantic_map)
443
+
444
+ z, mu, logvar = None, None, None
445
+ if random_style:
446
+ if self.style_dims > 0:
447
+ z = torch.randn(batch_size, self.style_dims, dtype=torch.float32, device=device)
448
+ else:
449
+ if self.style_encoder is None:
450
+ # ================ Get Style Code =================
451
+ if self.style_dims > 0:
452
+ z = torch.randn(batch_size, self.style_dims, dtype=torch.float32, device=device)
453
+ else:
454
+ mu, logvar, z = self.style_encoder(pseudo_real_img)
455
+
456
+ # ================ Network Forward ================
457
+ # Forward StyleNet
458
+ if self.style_net is not None:
459
+ z = self.style_net(z)
460
+
461
+ # Forward per-pixel net.
462
+ net_out, new_dists, weights, total_weights_raw, rand_depth, net_out_s, net_out_c, skynet_out_c, nosky_mask, \
463
+ sky_mask, sky_only_mask, new_idx = self._forward_perpix(
464
+ self.blk_feats, voxel_id, depth2, raydirs, cam_ori_t, z, global_enc)
465
+
466
+ # Forward global net.
467
+ fake_images, fake_images_raw = self._forward_global(net_out, z)
468
+ if self.pad != 0:
469
+ fake_images = fake_images[:, :, self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
470
+
471
+ # =============== Arrange Return Values ================
472
+ output = {}
473
+ output['fake_images'] = fake_images
474
+ output['mu'] = mu
475
+ output['logvar'] = logvar
476
+ return output
477
+
478
+
479
+ def inference_givenstyle(self, style,
480
+ output_dir,
481
+ camera_mode,
482
+ style_img_path=None,
483
+ seed=1,
484
+ pad=30,
485
+ num_samples=40,
486
+ num_blocks_early_stop=6,
487
+ sample_depth=3,
488
+ tile_size=128,
489
+ resolution_hw=[540, 960],
490
+ cam_ang=72,
491
+ cam_maxstep=10):
492
+ r"""Compute result images according to the provided camera trajectory and save the results in the specified
493
+ folder. The full image is evaluated in multiple tiles to save memory.
494
+
495
+ Args:
496
+ output_dir (str): Where the results should be stored.
497
+ camera_mode (int): Which camera trajectory to use.
498
+ style_img_path (str): Path to the style-conditioning image.
499
+ seed (int): Random seed (controls style when style_img_path is not specified).
500
+ pad (int): Pixels to remove from the image tiles before stitching. Should be equal to or larger than the
501
+ receptive field of the CNN to avoid border artifacts.
502
+ num_samples (int): Number of samples per ray (different from training).
503
+ num_blocks_early_stop (int): Max number of intersected boxes per ray before stopping
504
+ (different from training).
505
+ sample_depth (float): Max distance traveled through boxes before stopping (different from training).
506
+ tile_size (int): Max size of a tile in pixels.
507
+ resolution_hw (list [H, W]): Resolution of the output image.
508
+ cam_ang (float): Horizontal FOV of the camera (may be adjusted by the camera controller).
509
+ cam_maxstep (int): Number of frames sampled from the camera trajectory.
510
+ """
511
+
512
+ def write_img(path, img, rgb_input=False):
513
+ img = ((img*0.5+0.5)*255).detach().cpu().numpy().astype(np.uint8)
514
+ img = img[0].transpose(1, 2, 0)
515
+ if rgb_input:
516
+ img = img[..., [2, 1, 0]]
517
+ cv2.imwrite(path, img, [cv2.IMWRITE_PNG_COMPRESSION, 4])
518
+ return img[..., ::-1]
519
+
520
+ def read_img(path):
521
+ img = cv2.imread(path).astype(np.float32)[..., [2, 1, 0]].transpose(2, 0, 1) / 255
522
+ img = img * 2 - 1
523
+ return torch.from_numpy(img)
524
+
525
+ print('Saving to', output_dir)
526
+
527
+ # Use provided random seed.
528
+ device = torch.device('cuda')
529
+
530
+ global_enc = self.world_encoder(self.voxel.current_height_map, self.voxel.current_semantic_map)
531
+
532
+ biome_colors = torch.Tensor([
533
+ [255, 255, 178],
534
+ [184, 200, 98],
535
+ [188, 161, 53],
536
+ [190, 255, 242],
537
+ [106, 144, 38],
538
+ [33, 77, 41],
539
+ [86, 179, 106],
540
+ [34, 61, 53],
541
+ [35, 114, 94],
542
+ [0, 0, 255],
543
+ [0, 255, 0],
544
+ ]).to(device) / 255 * 2 - 1
545
+ semantic_map = torch.argmax(self.voxel.current_semantic_map, dim=1)
546
+
547
+ self.pad = pad
548
+ self.num_samples = num_samples
549
+ self.num_blocks_early_stop = num_blocks_early_stop
550
+ self.sample_depth = sample_depth
551
+
552
+ self.coarse_deterministic_sampling = True
553
+ self.crop_size = resolution_hw
554
+ self.cam_res = [self.crop_size[0]+self.pad, self.crop_size[1]+self.pad]
555
+ self.use_label_smooth_pgt = False
556
+
557
+ # Make output dirs.
558
+ output_dir = os.path.join(output_dir, 'rgb_render')
559
+ os.makedirs(output_dir, exist_ok=True)
560
+ fout = imageio.get_writer(output_dir + '.mp4', fps=10)
561
+
562
+ write_img(os.path.join(output_dir, 'semantic_map.png'), biome_colors[semantic_map].permute(0, 3, 1, 2), rgb_input=True)
563
+ write_img(os.path.join(output_dir, 'height_map.png'), self.voxel.current_height_map)
564
+ np.save(os.path.join(output_dir, 'style.npy'), style.detach().cpu().numpy())
565
+ evalcamctl = camctl.EvalCameraController(
566
+ self.voxel, maxstep=cam_maxstep, pattern=camera_mode, cam_ang=cam_ang,
567
+ smooth_decay_multiplier=150/cam_maxstep)
568
+
569
+ # Get output style.
570
+ z = self.style_net(style)
571
+
572
+ # Generate required output images.
573
+ for id, (cam_ori_t, cam_dir_t, cam_up_t, cam_f) in enumerate(evalcamctl):
574
+ print('Rendering frame', id)
575
+ cam_f = cam_f * (self.crop_size[1]-1) # So that the view does not depend on the padding
576
+ cam_c = [(self.cam_res[0]-1)/2, (self.cam_res[1]-1)/2]
577
+
578
+ voxel_id, depth2, raydirs = voxlib.ray_voxel_intersection_perspective(
579
+ self.voxel.voxel_t, cam_ori_t, cam_dir_t, cam_up_t, cam_f, cam_c, self.cam_res,
580
+ self.num_blocks_early_stop)
581
+
582
+ voxel_id = voxel_id.unsqueeze(0)
583
+ depth2 = depth2.unsqueeze(0)
584
+ raydirs = raydirs.unsqueeze(0)
585
+ cam_ori_t = cam_ori_t.unsqueeze(0).to(device)
586
+
587
+ voxel_id_all = voxel_id
588
+ depth2_all = depth2
589
+ raydirs_all = raydirs
590
+
591
+ # Evaluate sky in advance to get a consistent sky in the semi-transparent region.
592
+ if self.sky_global_avgpool:
593
+ sky_raydirs_in = raydirs.expand(-1, -1, -1, 1, -1).contiguous()
594
+ sky_raydirs_in = voxlib.positional_encoding(
595
+ sky_raydirs_in, self.pe_params_sky[0], -1, self.pe_params_sky[1])
596
+ skynet_out_c = self.sky_net(sky_raydirs_in, z)
597
+ sky_avg = torch.mean(skynet_out_c, dim=[1, 2], keepdim=True)
598
+ self.sky_avg = sky_avg
599
+
600
+ num_strips_h = (self.cam_res[0]-self.pad+tile_size-1)//tile_size
601
+ num_strips_w = (self.cam_res[1]-self.pad+tile_size-1)//tile_size
602
+
603
+ fake_images_chunks_v = []
604
+ # For each horizontal strip.
605
+ for strip_id_h in range(num_strips_h):
606
+ strip_begin_h = strip_id_h * tile_size
607
+ strip_end_h = np.minimum(strip_id_h * tile_size + tile_size + self.pad, self.cam_res[0])
608
+ # For each vertical strip.
609
+ fake_images_chunks_h = []
610
+ for strip_id_w in range(num_strips_w):
611
+ strip_begin_w = strip_id_w * tile_size
612
+ strip_end_w = np.minimum(strip_id_w * tile_size + tile_size + self.pad, self.cam_res[1])
613
+
614
+ voxel_id = voxel_id_all[:, strip_begin_h:strip_end_h, strip_begin_w:strip_end_w, :, :]
615
+ depth2 = depth2_all[:, :, strip_begin_h:strip_end_h, strip_begin_w:strip_end_w, :, :]
616
+ raydirs = raydirs_all[:, strip_begin_h:strip_end_h, strip_begin_w:strip_end_w, :, :]
617
+
618
+ net_out, new_dists, weights, total_weights_raw, rand_depth, net_out_s, net_out_c, skynet_out_c, \
619
+ nosky_mask, sky_mask, sky_only_mask, new_idx = self._forward_perpix(
620
+ self.blk_feats, voxel_id, depth2, raydirs, cam_ori_t, z, global_enc)
621
+ fake_images, _ = self._forward_global(net_out, z)
622
+
623
+ if self.pad != 0:
624
+ fake_images = fake_images[:, :, self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
625
+ fake_images_chunks_h.append(fake_images)
626
+ fake_images_h = torch.cat(fake_images_chunks_h, dim=-1)
627
+ fake_images_chunks_v.append(fake_images_h)
628
+ fake_images = torch.cat(fake_images_chunks_v, dim=-2)
629
+ rgb = write_img(os.path.join(output_dir,
630
+ '{:05d}.png'.format(id)), fake_images, rgb_input=True)
631
+ fout.append_data(rgb)
632
+ fout.close()
633
+
634
+
635
+
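# Illustrative sketch (not part of the file above): the tiled evaluation in inference_givenstyle
# splits the padded camera resolution into overlapping tiles and crops pad//2 from each border
# before stitching. Values are illustrative.
pad, tile_size = 30, 128
cam_res = [540 + pad, 960 + pad]                                  # padded [H, W]
num_strips_h = (cam_res[0] - pad + tile_size - 1) // tile_size    # ceil((H - pad) / tile_size)
num_strips_w = (cam_res[1] - pad + tile_size - 1) // tile_size
for i in range(num_strips_h):
    begin_h = i * tile_size
    end_h = min(begin_h + tile_size + pad, cam_res[0])            # each tile keeps `pad` extra rows
    # ... render rows [begin_h, end_h), crop pad//2 on each side, then concatenate.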
636
+ def inference_givenstyle_depth(self, style,
637
+ output_dir,
638
+ camera_mode,
639
+ style_img_path=None,
640
+ seed=1,
641
+ pad=30,
642
+ num_samples=40,
643
+ num_blocks_early_stop=6,
644
+ sample_depth=3,
645
+ tile_size=128,
646
+ resolution_hw=[540, 960],
647
+ cam_ang=72,
648
+ cam_maxstep=10):
649
+ r"""Compute result images according to the provided camera trajectory and save the results in the specified
650
+ folder. The full image is evaluated in multiple tiles to save memory.
651
+
652
+ Args:
653
+ output_dir (str): Where the results should be stored.
654
+ camera_mode (int): Which camera trajectory to use.
655
+ style_img_path (str): Path to the style-conditioning image.
656
+ seed (int): Random seed (controls style when style_img_path is not specified).
657
+ pad (int): Pixels to remove from the image tiles before stitching. Should be equal to or larger than the
658
+ receptive field of the CNN to avoid border artifacts.
659
+ num_samples (int): Number of samples per ray (different from training).
660
+ num_blocks_early_stop (int): Max number of intersected boxes per ray before stopping
661
+ (different from training).
662
+ sample_depth (float): Max distance traveled through boxes before stopping (different from training).
663
+ tile_size (int): Max size of a tile in pixels.
664
+ resolution_hw (list [H, W]): Resolution of the output image.
665
+ cam_ang (float): Horizontal FOV of the camera (may be adjusted by the camera controller).
666
+ cam_maxstep (int): Number of frames sampled from the camera trajectory.
667
+ """
668
+
669
+ def write_img(path, img, rgb_input=False):
670
+ img = ((img*0.5+0.5)*255).detach().cpu().numpy().astype(np.uint8)
671
+ img = img[0].transpose(1, 2, 0)
672
+ if rgb_input:
673
+ img = img[..., [2, 1, 0]]
674
+ cv2.imwrite(path, img, [cv2.IMWRITE_PNG_COMPRESSION, 4])
675
+ return img[..., ::-1]
676
+
677
+ def read_img(path):
678
+ img = cv2.imread(path).astype(np.float32)[..., [2, 1, 0]].transpose(2, 0, 1) / 255
679
+ img = img * 2 - 1
680
+ return torch.from_numpy(img)
681
+
682
+ print('Saving to', output_dir)
683
+
684
+ # Use provided random seed.
685
+ device = torch.device('cuda')
686
+
687
+ global_enc = self.world_encoder(self.voxel.current_height_map, self.voxel.current_semantic_map)
688
+
689
+ biome_colors = torch.Tensor([
690
+ [255, 255, 178],
691
+ [184, 200, 98],
692
+ [188, 161, 53],
693
+ [190, 255, 242],
694
+ [106, 144, 38],
695
+ [33, 77, 41],
696
+ [86, 179, 106],
697
+ [34, 61, 53],
698
+ [35, 114, 94],
699
+ [0, 0, 255],
700
+ [0, 255, 0],
701
+ ]) / 255 * 2 - 1
702
+ print(self.voxel.current_height_map[0].shape)
703
+ semantic_map = torch.argmax(self.voxel.current_semantic_map, dim=1)
704
+ print(torch.unique(semantic_map, return_counts=True))
705
+ print(semantic_map.min())
706
+
707
+ self.pad = pad
708
+ self.num_samples = num_samples
709
+ self.num_blocks_early_stop = num_blocks_early_stop
710
+ self.sample_depth = sample_depth
711
+
712
+ self.coarse_deterministic_sampling = True
713
+ self.crop_size = resolution_hw
714
+ self.cam_res = [self.crop_size[0]+self.pad, self.crop_size[1]+self.pad]
715
+ self.use_label_smooth_pgt = False
716
+
717
+ # Make output dirs.
718
+ gancraft_outputs_dir = os.path.join(output_dir, 'gancraft_outputs')
719
+ os.makedirs(gancraft_outputs_dir, exist_ok=True)
720
+ gancraft_depth_outputs_dir = os.path.join(output_dir, 'depth')
721
+ os.makedirs(gancraft_depth_outputs_dir, exist_ok=True)
722
+ vis_masks_dir = os.path.join(output_dir, 'vis_masks')
723
+ os.makedirs(vis_masks_dir, exist_ok=True)
724
+ fout = imageio.get_writer(gancraft_outputs_dir + '.mp4', fps=10)
725
+ fout_cat = imageio.get_writer(gancraft_outputs_dir + '-vis_masks.mp4', fps=10)
726
+
727
+ write_img(os.path.join(output_dir, 'semantic_map.png'), biome_colors[semantic_map].permute(0, 3, 1, 2), rgb_input=True)
728
+ write_img(os.path.join(output_dir, 'heightmap.png'), self.voxel.current_height_map)
729
+
730
+ evalcamctl = camctl.EvalCameraController(
731
+ self.voxel, maxstep=cam_maxstep, pattern=camera_mode, cam_ang=cam_ang,
732
+ smooth_decay_multiplier=150/cam_maxstep)
733
+
734
+ # import pickle
735
+ # with open(os.path.join(output_dir,'camera.pkl'), 'wb') as f:
736
+ # pickle.dump(evalcamctl, f)
737
+
738
+ # Get output style.
739
+ z = self.style_net(style)
740
+
741
+ # Generate required output images.
742
+ for id, (cam_ori_t, cam_dir_t, cam_up_t, cam_f) in enumerate(evalcamctl):
743
+ # print('Rendering frame', id)
744
+ cam_f = cam_f * (self.crop_size[1]-1) # So that the view does not depend on the padding
745
+ cam_c = [(self.cam_res[0]-1)/2, (self.cam_res[1]-1)/2]
746
+
747
+ voxel_id, depth2, raydirs = voxlib.ray_voxel_intersection_perspective(
748
+ self.voxel.voxel_t, cam_ori_t, cam_dir_t, cam_up_t, cam_f, cam_c, self.cam_res,
749
+ self.num_blocks_early_stop)
750
+
751
+ voxel_id = voxel_id.unsqueeze(0)
752
+ depth2 = depth2.unsqueeze(0)
753
+ raydirs = raydirs.unsqueeze(0)
754
+ cam_ori_t = cam_ori_t.unsqueeze(0).to(device)
755
+
756
+ # Save 3D voxel rendering.
757
+ mc_rgb = self.label_trans.mc_color(voxel_id[0, :, :, 0, 0].cpu().numpy())
758
+ # Diffused shading, co-located light.
759
+ first_intersection_depth = depth2[:, 0, :, :, 0, None, :] # [1, 542, 542, 1, 1].
760
+ first_intersection_point = raydirs * first_intersection_depth + cam_ori_t[:, None, None, None, :]
761
+ fip_local_coords = torch.remainder(first_intersection_point, 1.0)
762
+ fip_wall_proximity = torch.minimum(fip_local_coords, 1.0-fip_local_coords)
763
+ fip_wall_orientation = torch.argmin(fip_wall_proximity, dim=-1, keepdim=False)
764
+ # 0: [1,0,0]; 1: [0,1,0]; 2: [0,0,1]
765
+ lut = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=torch.float32,
766
+ device=fip_wall_orientation.device)
767
+ fip_normal = lut[fip_wall_orientation] # [1, 542, 542, 1, 3]
768
+ diffuse_shade = torch.abs(torch.sum(fip_normal * raydirs, dim=-1))
769
+
770
+ mc_rgb = (mc_rgb.astype(np.float32) / 255) ** 2.2 # use an explicit dtype; np.float is removed in newer NumPy
771
+ mc_rgb = mc_rgb * diffuse_shade[0, :, :, :].cpu().numpy()
772
+ mc_rgb = (mc_rgb ** (1/2.2)) * 255
773
+ mc_rgb = mc_rgb.astype(np.uint8)
774
+ if self.pad > 0:
775
+ mc_rgb = mc_rgb[self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
776
+ cv2.imwrite(os.path.join(vis_masks_dir, '{:05d}.png'.format(id)), mc_rgb, [cv2.IMWRITE_PNG_COMPRESSION, 4])
777
+
778
+ # Tiled eval of GANcraft.
779
+ voxel_id_all = voxel_id
780
+ depth2_all = depth2
781
+ raydirs_all = raydirs
782
+
783
+ # Evaluate sky in advance to get a consistent sky in the semi-transparent region.
784
+ if self.sky_global_avgpool:
785
+ sky_raydirs_in = raydirs.expand(-1, -1, -1, 1, -1).contiguous()
786
+ sky_raydirs_in = voxlib.positional_encoding(
787
+ sky_raydirs_in, self.pe_params_sky[0], -1, self.pe_params_sky[1])
788
+ skynet_out_c = self.sky_net(sky_raydirs_in, z)
789
+ sky_avg = torch.mean(skynet_out_c, dim=[1, 2], keepdim=True)
790
+ self.sky_avg = sky_avg
791
+
792
+ num_strips_h = (self.cam_res[0]-self.pad+tile_size-1)//tile_size
793
+ num_strips_w = (self.cam_res[1]-self.pad+tile_size-1)//tile_size
794
+
795
+ fake_images_chunks_v = []
796
+ fake_depth_chunks_v = []
797
+ # For each horizontal strip.
798
+ for strip_id_h in range(num_strips_h):
799
+ strip_begin_h = strip_id_h * tile_size
800
+ strip_end_h = np.minimum(strip_id_h * tile_size + tile_size + self.pad, self.cam_res[0])
801
+ # For each vertical strip.
802
+ fake_images_chunks_h = []
803
+ fake_depth_chunks_h = []
804
+ for strip_id_w in range(num_strips_w):
805
+ strip_begin_w = strip_id_w * tile_size
806
+ strip_end_w = np.minimum(strip_id_w * tile_size + tile_size + self.pad, self.cam_res[1])
807
+
808
+ voxel_id = voxel_id_all[:, strip_begin_h:strip_end_h, strip_begin_w:strip_end_w, :, :]
809
+ depth2 = depth2_all[:, :, strip_begin_h:strip_end_h, strip_begin_w:strip_end_w, :, :]
810
+ raydirs = raydirs_all[:, strip_begin_h:strip_end_h, strip_begin_w:strip_end_w, :, :]
811
+
812
+ net_out, new_dists, weights, total_weights_raw, rand_depth, net_out_s, net_out_c, skynet_out_c, \
813
+ nosky_mask, sky_mask, sky_only_mask, new_idx = self._forward_perpix(
814
+ self.blk_feats, voxel_id, depth2, raydirs, cam_ori_t, z, global_enc)
815
+ fake_images, _ = self._forward_global(net_out, z)
816
+ depth_map = torch.sum(weights * rand_depth, -2)
817
+ # disp_map = 1. / torch.max(1e-10 * torch.ones_like(depth_map).to(depth_map), depth_map / torch.sum(weights, -2))
818
+ # depth_map = torch.clip(depth_map, 0, 100.)
819
+ # disp_map = 1. / (depth_map.permute(0, 3, 1, 2))
820
+ disp_map = depth_map.permute(0, 3, 1, 2)
821
+ if self.pad != 0:
822
+ fake_images = fake_images[:, :, self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
823
+ disp_map = disp_map[:, :, self.pad//2:-self.pad//2, self.pad//2:-self.pad//2]
824
+ fake_images_chunks_h.append(fake_images)
825
+ fake_depth_chunks_h.append(disp_map)
826
+ fake_images_h = torch.cat(fake_images_chunks_h, dim=-1)
827
+ fake_depth_h = torch.cat(fake_depth_chunks_h, dim=-1)
828
+ fake_images_chunks_v.append(fake_images_h)
829
+ fake_depth_chunks_v.append(fake_depth_h)
830
+ fake_images = torch.cat(fake_images_chunks_v, dim=-2)
831
+ fake_depth = torch.cat(fake_depth_chunks_v, dim=-2)
832
+ # fake_depth = ((fake_depth - fake_depth.mean()) / fake_depth.std() + 1) / 2
833
+ # fake_depth = torch.clip(1./ (fake_depth + 1e-4), 0., 1.)
834
+ # fake_depth = ((fake_depth - fake_depth.mean()) / fake_depth.std() + 1) / 2
835
+ mmask = fake_depth > 0
836
+ tmp = fake_depth[mmask]
837
+ # tmp = 1. / (tmp + 1e-4)
838
+ tmp = (tmp - tmp.min()) / (tmp.max() - tmp.min())
839
+ # tmp = ((tmp - tmp.mean()) / tmp.std() + 1) / 2.
840
+ fake_depth[~mmask] = 1
841
+ fake_depth[mmask] = tmp
842
+ # fake_depth = (fake_depth - fake_depth.min()) / (fake_depth.max() - fake_depth.min())
843
+
844
+ cv2.imwrite(os.path.join(gancraft_depth_outputs_dir, '{:05d}.png'.format(id)), fake_depth[0].permute(1, 2, 0).detach().cpu().numpy() * 255)
845
+ rgb = write_img(os.path.join(gancraft_outputs_dir,
846
+ '{:05d}.png'.format(id)), fake_images, rgb_input=True)
847
+ fout.append_data(rgb)
848
+ fout_cat.append_data(np.concatenate((mc_rgb[..., ::-1], rgb), axis=1))
849
+ fout.close()
850
+ fout_cat.close()
851
+
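As a side note on the depth visualization in inference_givenstyle_depth above: valid depths are min-max normalized and sky pixels are painted with the far value. A minimal sketch of that normalization (illustrative only):

import torch

depth = torch.rand(1, 1, 4, 4)
depth[0, 0, 0, 0] = 0.0                       # pretend this pixel only hit the sky
valid = depth > 0
vals = depth[valid]
depth[valid] = (vals - vals.min()) / (vals.max() - vals.min() + 1e-8)
depth[~valid] = 1.0                           # sky rendered as the maximum (white) value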
imaginaire/generators/spade.py ADDED
@@ -0,0 +1,571 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import functools
6
+ import math
7
+ import types
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from torch.nn import Upsample as NearestUpsample
14
+
15
+ from imaginaire.layers import Conv2dBlock, LinearBlock, Res2dBlock
16
+ from imaginaire.utils.data import (get_crop_h_w,
17
+ get_paired_input_image_channel_number,
18
+ get_paired_input_label_channel_number)
19
+ from imaginaire.utils.distributed import master_only_print as print
20
+
21
+
22
+ class Generator(nn.Module):
23
+ r"""SPADE generator constructor.
24
+
25
+ Args:
26
+ gen_cfg (obj): Generator definition part of the yaml config file.
27
+ data_cfg (obj): Data definition part of the yaml config file.
28
+ """
29
+
30
+ def __init__(self, gen_cfg, data_cfg):
31
+ super(Generator, self).__init__()
32
+ print('SPADE generator initialization.')
33
+ # We assume the first datum is the ground truth image.
34
+ image_channels = getattr(gen_cfg, 'image_channels', None)
35
+ if image_channels is None:
36
+ image_channels = get_paired_input_image_channel_number(data_cfg)
37
+ num_labels = getattr(gen_cfg, 'num_labels', None)
38
+ if num_labels is None:
39
+ # Calculate number of channels in the input label when not specified.
40
+ num_labels = get_paired_input_label_channel_number(data_cfg)
41
+ crop_h, crop_w = get_crop_h_w(data_cfg.train.augmentations)
42
+ # Build the generator
43
+ out_image_small_side_size = crop_w if crop_w < crop_h else crop_h
44
+ num_filters = getattr(gen_cfg, 'num_filters', 128)
45
+ kernel_size = getattr(gen_cfg, 'kernel_size', 3)
46
+ weight_norm_type = getattr(gen_cfg, 'weight_norm_type', 'spectral')
47
+
48
+ cond_dims = 0
49
+ # Check whether we use the style code.
50
+ style_dims = getattr(gen_cfg, 'style_dims', None)
51
+ self.style_dims = style_dims
52
+ if style_dims is not None:
53
+ print('\tStyle code dimensions: %d' % style_dims)
54
+ cond_dims += style_dims
55
+ self.use_style = True
56
+ else:
57
+ self.use_style = False
58
+ # Check whether we use the attribute code.
59
+ if hasattr(gen_cfg, 'attribute_dims'):
60
+ self.use_attribute = True
61
+ self.attribute_dims = gen_cfg.attribute_dims
62
+ cond_dims += gen_cfg.attribute_dims
63
+ else:
64
+ self.use_attribute = False
65
+
66
+ if not self.use_style and not self.use_attribute:
67
+ self.use_style_encoder = False
68
+ else:
69
+ self.use_style_encoder = True
70
+ print('\tBase filter number: %d' % num_filters)
71
+ print('\tConvolution kernel size: %d' % kernel_size)
72
+ print('\tWeight norm type: %s' % weight_norm_type)
73
+ skip_activation_norm = \
74
+ getattr(gen_cfg, 'skip_activation_norm', True)
75
+ activation_norm_params = getattr(gen_cfg, 'activation_norm_params', None)
76
+ if activation_norm_params is None:
77
+ activation_norm_params = types.SimpleNamespace()
78
+ if not hasattr(activation_norm_params, 'num_filters'):
79
+ setattr(activation_norm_params, 'num_filters', 128)
80
+ if not hasattr(activation_norm_params, 'kernel_size'):
81
+ setattr(activation_norm_params, 'kernel_size', 3)
82
+ if not hasattr(activation_norm_params, 'activation_norm_type'):
83
+ setattr(activation_norm_params, 'activation_norm_type', 'sync_batch')
84
+ if not hasattr(activation_norm_params, 'separate_projection'):
85
+ setattr(activation_norm_params, 'separate_projection', False)
86
+ if not hasattr(activation_norm_params, 'activation_norm_params'):
87
+ activation_norm_params.activation_norm_params = types.SimpleNamespace()
88
+ activation_norm_params.activation_norm_params.affine = True
89
+ setattr(activation_norm_params, 'cond_dims', num_labels)
90
+ if not hasattr(activation_norm_params, 'weight_norm_type'):
91
+ setattr(activation_norm_params, 'weight_norm_type', weight_norm_type)
92
+ global_adaptive_norm_type = getattr(gen_cfg, 'global_adaptive_norm_type', 'sync_batch')
93
+ use_posenc_in_input_layer = getattr(gen_cfg, 'use_posenc_in_input_layer', True)
94
+ output_multiplier = getattr(gen_cfg, 'output_multiplier', 1.0)
95
+ print(activation_norm_params)
96
+ self.spade_generator = SPADEGenerator(num_labels,
97
+ out_image_small_side_size,
98
+ image_channels,
99
+ num_filters,
100
+ kernel_size,
101
+ cond_dims,
102
+ activation_norm_params,
103
+ weight_norm_type,
104
+ global_adaptive_norm_type,
105
+ skip_activation_norm,
106
+ use_posenc_in_input_layer,
107
+ self.use_style_encoder,
108
+ output_multiplier)
109
+ if self.use_style:
110
+ # Build the encoder.
111
+ style_enc_cfg = getattr(gen_cfg, 'style_enc', None)
112
+ if style_enc_cfg is None:
113
+ style_enc_cfg = types.SimpleNamespace()
114
+ if not hasattr(style_enc_cfg, 'num_filters'):
115
+ setattr(style_enc_cfg, 'num_filters', 128)
116
+ if not hasattr(style_enc_cfg, 'kernel_size'):
117
+ setattr(style_enc_cfg, 'kernel_size', 3)
118
+ if not hasattr(style_enc_cfg, 'weight_norm_type'):
119
+ setattr(style_enc_cfg, 'weight_norm_type', weight_norm_type)
120
+ setattr(style_enc_cfg, 'input_image_channels', image_channels)
121
+ setattr(style_enc_cfg, 'style_dims', style_dims)
122
+ self.style_encoder = StyleEncoder(style_enc_cfg)
123
+
124
+ self.z = None
125
+ print('Done with the SPADE generator initialization.')
126
+
127
+ def forward(self, data, random_style=False):
128
+ r"""SPADE Generator forward.
129
+
130
+ Args:
131
+ data (dict):
132
+ - images (N x C1 x H x W tensor) : Ground truth images
133
+ - label (N x C2 x H x W tensor) : Semantic representations
134
+ - z (N x style_dims tensor): Gaussian random noise
135
+ - random_style (bool): Whether to sample a random style vector.
136
+ Returns:
137
+ (dict):
138
+ - fake_images (N x 3 x H x W tensor): fake images
139
+ - mu (N x C1 tensor): mean vectors
140
+ - logvar (N x C1 tensor): log-variance vectors
141
+ """
142
+ if self.use_style_encoder:
143
+ if random_style:
144
+ bs = data['label'].size(0)
145
+ z = torch.randn(
146
+ bs, self.style_dims, dtype=torch.float32).cuda()
147
+ if (data['label'].dtype ==
148
+ data['label'].dtype == torch.float16):
149
+ z = z.half()
150
+ mu = None
151
+ logvar = None
152
+ else:
153
+ mu, logvar, z = self.style_encoder(data['images'])
154
+ if self.use_attribute:
155
+ data['z'] = torch.cat((z, data['attributes'].squeeze(1)), dim=1)
156
+ else:
157
+ data['z'] = z
158
+ output = self.spade_generator(data)
159
+ if self.use_style_encoder:
160
+ output['mu'] = mu
161
+ output['logvar'] = logvar
162
+ return output
163
+
164
+ def inference(self,
165
+ data,
166
+ random_style=False,
167
+ use_fixed_random_style=False,
168
+ keep_original_size=False):
169
+ r"""Compute results images for a batch of input data and save the
170
+ results in the specified folder.
171
+
172
+ Args:
173
+ data (dict):
174
+ - images (N x C1 x H x W tensor) : Ground truth images
175
+ - label (N x C2 x H x W tensor) : Semantic representations
176
+ - z (N x style_dims tensor): Gaussian random noise
177
+ random_style (bool): Whether to sample a random style vector.
178
+ use_fixed_random_style (bool): Sample random style once and use it
179
+ for all the remaining inference.
180
+ keep_original_size (bool): Keep original size of the input.
181
+ Returns:
182
+ (dict):
183
+ - fake_images (N x 3 x H x W tensor): fake images
184
+ - mu (N x C1 tensor): mean vectors
185
+ - logvar (N x C1 tensor): log-variance vectors
186
+ """
187
+ self.eval()
188
+ self.spade_generator.eval()
189
+
190
+ if self.use_style_encoder:
191
+ if random_style and self.use_style_encoder:
192
+ if self.z is None or not use_fixed_random_style:
193
+ bs = data['label'].size(0)
194
+ z = torch.randn(
195
+ bs, self.style_dims, dtype=torch.float32).to('cuda')
196
+ if (data['label'].dtype ==
197
+ data['label'].dtype ==
198
+ torch.float16):
199
+ z = z.half()
200
+ self.z = z
201
+ else:
202
+ z = self.z
203
+ else:
204
+ mu, logvar, z = self.style_encoder(data['images'])
205
+ data['z'] = z
206
+
207
+ output = self.spade_generator(data)
208
+ output_images = output['fake_images']
209
+
210
+ if keep_original_size:
211
+ height = data['original_h_w'][0][0]
212
+ width = data['original_h_w'][0][1]
213
+ output_images = torch.nn.functional.interpolate(
214
+ output_images, size=[height, width])
215
+
216
+ for key in data['key'].keys():
217
+ if 'segmaps' in key or 'seg_maps' in key:
218
+ file_names = data['key'][key][0]
219
+ break
220
+ for key in data['key'].keys():
221
+ if 'edgemaps' in key or 'edge_maps' in key:
222
+ file_names = data['key'][key][0]
223
+ break
224
+
225
+ return output_images, file_names
226
+
227
+
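# Illustrative sketch (not part of the file above): when random_style is set, the SPADE wrapper
# draws a Gaussian style code and matches it to the label precision before feeding the generator.
import torch

labels = torch.zeros(2, 184, 256, 256)                     # N x C x H x W semantic maps
style_dims = 256
z = torch.randn(labels.size(0), style_dims, dtype=torch.float32)
if labels.dtype == torch.float16:
    z = z.half()                                            # keep the style code in fp16 as well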
228
+ class SPADEGenerator(nn.Module):
229
+ r"""SPADE Image Generator constructor.
230
+
231
+ Args:
232
+ num_labels (int): Number of different labels.
233
+ out_image_small_side_size (int): min(width, height)
234
+ image_channels (int): Num. of channels of the output image.
235
+ num_filters (int): Base filter numbers.
236
+ kernel_size (int): Convolution kernel size.
237
+ style_dims (int): Dimensions of the style code.
238
+ activation_norm_params (obj): Spatially adaptive normalization param.
239
+ weight_norm_type (str): Type of weight normalization.
240
+ ``'none'``, ``'spectral'``, or ``'weight'``.
241
+ global_adaptive_norm_type (str): Type of normalization in SPADE.
242
+ skip_activation_norm (bool): If ``True``, applies activation norm to the
243
+ shortcut connection in residual blocks.
244
+ use_style_encoder (bool): Whether to use global adaptive norm
245
+ like conditional batch norm or adaptive instance norm.
246
+ output_multiplier (float): A positive number multiplied to the output
247
+ """
248
+
249
+ def __init__(self,
250
+ num_labels,
251
+ out_image_small_side_size,
252
+ image_channels,
253
+ num_filters,
254
+ kernel_size,
255
+ style_dims,
256
+ activation_norm_params,
257
+ weight_norm_type,
258
+ global_adaptive_norm_type,
259
+ skip_activation_norm,
260
+ use_posenc_in_input_layer,
261
+ use_style_encoder,
262
+ output_multiplier):
263
+ super(SPADEGenerator, self).__init__()
264
+ self.output_multiplier = output_multiplier
265
+ self.use_style_encoder = use_style_encoder
266
+ self.use_posenc_in_input_layer = use_posenc_in_input_layer
267
+ self.out_image_small_side_size = out_image_small_side_size
268
+ self.num_filters = num_filters
269
+ padding = int(np.ceil((kernel_size - 1.0) / 2))
270
+ nonlinearity = 'leakyrelu'
271
+ activation_norm_type = 'spatially_adaptive'
272
+ base_res2d_block = \
273
+ functools.partial(Res2dBlock,
274
+ kernel_size=kernel_size,
275
+ padding=padding,
276
+ bias=[True, True, False],
277
+ weight_norm_type=weight_norm_type,
278
+ activation_norm_type=activation_norm_type,
279
+ activation_norm_params=activation_norm_params,
280
+ skip_activation_norm=skip_activation_norm,
281
+ nonlinearity=nonlinearity,
282
+ order='NACNAC')
283
+ if self.use_style_encoder:
284
+ self.fc_0 = LinearBlock(style_dims, 2 * style_dims,
285
+ weight_norm_type=weight_norm_type,
286
+ nonlinearity='relu',
287
+ order='CAN')
288
+ self.fc_1 = LinearBlock(2 * style_dims, 2 * style_dims,
289
+ weight_norm_type=weight_norm_type,
290
+ nonlinearity='relu',
291
+ order='CAN')
292
+
293
+ adaptive_norm_params = types.SimpleNamespace()
294
+ if not hasattr(adaptive_norm_params, 'cond_dims'):
295
+ setattr(adaptive_norm_params, 'cond_dims', 2 * style_dims)
296
+ if not hasattr(adaptive_norm_params, 'activation_norm_type'):
297
+ setattr(adaptive_norm_params, 'activation_norm_type', global_adaptive_norm_type)
298
+ if not hasattr(adaptive_norm_params, 'weight_norm_type'):
299
+ setattr(adaptive_norm_params, 'weight_norm_type', activation_norm_params.weight_norm_type)
300
+ if not hasattr(adaptive_norm_params, 'separate_projection'):
301
+ setattr(adaptive_norm_params, 'separate_projection', activation_norm_params.separate_projection)
302
+ adaptive_norm_params.activation_norm_params = types.SimpleNamespace()
303
+ setattr(adaptive_norm_params.activation_norm_params, 'affine',
304
+ activation_norm_params.activation_norm_params.affine)
305
+ base_cbn2d_block = \
306
+ functools.partial(Conv2dBlock,
307
+ kernel_size=kernel_size,
308
+ stride=1,
309
+ padding=padding,
310
+ bias=True,
311
+ weight_norm_type=weight_norm_type,
312
+ activation_norm_type='adaptive',
313
+ activation_norm_params=adaptive_norm_params,
314
+ nonlinearity=nonlinearity,
315
+ order='NAC')
316
+ else:
317
+ base_conv2d_block = \
318
+ functools.partial(Conv2dBlock,
319
+ kernel_size=kernel_size,
320
+ stride=1,
321
+ padding=padding,
322
+ bias=True,
323
+ weight_norm_type=weight_norm_type,
324
+ nonlinearity=nonlinearity,
325
+ order='NAC')
326
+ in_num_labels = num_labels
327
+ in_num_labels += 2 if self.use_posenc_in_input_layer else 0
328
+ self.head_0 = Conv2dBlock(in_num_labels, 8 * num_filters,
329
+ kernel_size=kernel_size, stride=1,
330
+ padding=padding,
331
+ weight_norm_type=weight_norm_type,
332
+ activation_norm_type='none',
333
+ nonlinearity=nonlinearity)
334
+ if self.use_style_encoder:
335
+ self.cbn_head_0 = base_cbn2d_block(
336
+ 8 * num_filters, 16 * num_filters)
337
+ else:
338
+ self.conv_head_0 = base_conv2d_block(
339
+ 8 * num_filters, 16 * num_filters)
340
+ self.head_1 = base_res2d_block(16 * num_filters, 16 * num_filters)
341
+ self.head_2 = base_res2d_block(16 * num_filters, 16 * num_filters)
342
+
343
+ self.up_0a = base_res2d_block(16 * num_filters, 8 * num_filters)
344
+ if self.use_style_encoder:
345
+ self.cbn_up_0a = base_cbn2d_block(
346
+ 8 * num_filters, 8 * num_filters)
347
+ else:
348
+ self.conv_up_0a = base_conv2d_block(
349
+ 8 * num_filters, 8 * num_filters)
350
+ self.up_0b = base_res2d_block(8 * num_filters, 8 * num_filters)
351
+
352
+ self.up_1a = base_res2d_block(8 * num_filters, 4 * num_filters)
353
+ if self.use_style_encoder:
354
+ self.cbn_up_1a = base_cbn2d_block(
355
+ 4 * num_filters, 4 * num_filters)
356
+ else:
357
+ self.conv_up_1a = base_conv2d_block(
358
+ 4 * num_filters, 4 * num_filters)
359
+ self.up_1b = base_res2d_block(4 * num_filters, 4 * num_filters)
360
+ self.up_2a = base_res2d_block(4 * num_filters, 4 * num_filters)
361
+ if self.use_style_encoder:
362
+ self.cbn_up_2a = base_cbn2d_block(
363
+ 4 * num_filters, 4 * num_filters)
364
+ else:
365
+ self.conv_up_2a = base_conv2d_block(
366
+ 4 * num_filters, 4 * num_filters)
367
+ self.up_2b = base_res2d_block(4 * num_filters, 2 * num_filters)
368
+ self.conv_img256 = Conv2dBlock(2 * num_filters, image_channels,
369
+ 5, stride=1, padding=2,
370
+ weight_norm_type=weight_norm_type,
371
+ activation_norm_type='none',
372
+ nonlinearity=nonlinearity,
373
+ order='ANC')
374
+ self.base = 16
375
+ if self.out_image_small_side_size == 512:
376
+ self.up_3a = base_res2d_block(2 * num_filters, 1 * num_filters)
377
+ self.up_3b = base_res2d_block(1 * num_filters, 1 * num_filters)
378
+ self.conv_img512 = Conv2dBlock(1 * num_filters, image_channels,
379
+ 5, stride=1, padding=2,
380
+ weight_norm_type=weight_norm_type,
381
+ activation_norm_type='none',
382
+ nonlinearity=nonlinearity,
383
+ order='ANC')
384
+ self.base = 32
385
+ if self.out_image_small_side_size == 1024:
386
+ self.up_3a = base_res2d_block(2 * num_filters, 1 * num_filters)
387
+ self.up_3b = base_res2d_block(1 * num_filters, 1 * num_filters)
388
+ self.conv_img512 = Conv2dBlock(1 * num_filters, image_channels,
389
+ 5, stride=1, padding=2,
390
+ weight_norm_type=weight_norm_type,
391
+ activation_norm_type='none',
392
+ nonlinearity=nonlinearity,
393
+ order='ANC')
394
+ self.up_4a = base_res2d_block(num_filters, num_filters // 2)
395
+ self.up_4b = base_res2d_block(num_filters // 2, num_filters // 2)
396
+ self.conv_img1024 = Conv2dBlock(num_filters // 2, image_channels,
397
+ 5, stride=1, padding=2,
398
+ weight_norm_type=weight_norm_type,
399
+ activation_norm_type='none',
400
+ nonlinearity=nonlinearity,
401
+ order='ANC')
402
+ self.nearest_upsample4x = NearestUpsample(scale_factor=4, mode='nearest')
403
+ self.base = 64
404
+ if self.out_image_small_side_size != 256 and self.out_image_small_side_size != 512 \
405
+ and self.out_image_small_side_size != 1024:
406
+ raise ValueError('Generation image size (%d, %d) not supported' %
407
+ (self.out_image_small_side_size,
408
+ self.out_image_small_side_size))
409
+ self.nearest_upsample2x = NearestUpsample(scale_factor=2, mode='nearest')
410
+
411
+ xv, yv = torch.meshgrid(
412
+ [torch.arange(-1, 1.1, 2. / 15), torch.arange(-1, 1.1, 2. / 15)])
413
+ self.xy = torch.cat((xv.unsqueeze(0), yv.unsqueeze(0)), 0).unsqueeze(0)
414
+ self.xy = self.xy.cuda()
415
+
416
+ def forward(self, data):
417
+ r"""SPADE Generator forward.
418
+
419
+ Args:
420
+ data (dict):
421
+ - data (N x C1 x H x W tensor) : Ground truth images.
422
+ - label (N x C2 x H x W tensor) : Semantic representations.
423
+ - z (N x style_dims tensor): Gaussian random noise.
424
+ Returns:
425
+ output (dict):
426
+ - fake_images (N x 3 x H x W tensor): Fake images.
427
+ """
428
+ seg = data['label']
429
+
430
+ if self.use_style_encoder:
431
+ z = data['z']
432
+ z = self.fc_0(z)
433
+ z = self.fc_1(z)
434
+
435
+ # The code below makes sure that the input to the head is always 16x16.
436
+ sy = math.floor(seg.size()[2] * 1.0 / self.base)
437
+ sx = math.floor(seg.size()[3] * 1.0 / self.base)
438
+
439
+ in_seg = F.interpolate(seg, size=[sy, sx], mode='nearest')
440
+ if self.use_posenc_in_input_layer:
441
+ in_xy = F.interpolate(self.xy, size=[sy, sx], mode='bicubic')
442
+ in_seg_xy = torch.cat(
443
+ (in_seg, in_xy.expand(in_seg.size()[0], 2, sy, sx)), 1)
444
+ else:
445
+ in_seg_xy = in_seg
446
+ # 16x16
447
+ x = self.head_0(in_seg_xy)
448
+ if self.use_style_encoder:
449
+ x = self.cbn_head_0(x, z)
450
+ else:
451
+ x = self.conv_head_0(x)
452
+ x = self.head_1(x, seg)
453
+ x = self.head_2(x, seg)
454
+ x = self.nearest_upsample2x(x)
455
+ # 32x32
456
+ x = self.up_0a(x, seg)
457
+ if self.use_style_encoder:
458
+ x = self.cbn_up_0a(x, z)
459
+ else:
460
+ x = self.conv_up_0a(x)
461
+ x = self.up_0b(x, seg)
462
+ x = self.nearest_upsample2x(x)
463
+ # 64x64
464
+ x = self.up_1a(x, seg)
465
+ if self.use_style_encoder:
466
+ x = self.cbn_up_1a(x, z)
467
+ else:
468
+ x = self.conv_up_1a(x)
469
+ x = self.up_1b(x, seg)
470
+ x = self.nearest_upsample2x(x)
471
+ # 128x128
472
+ x = self.up_2a(x, seg)
473
+ if self.use_style_encoder:
474
+ x = self.cbn_up_2a(x, z)
475
+ else:
476
+ x = self.conv_up_2a(x)
477
+ x = self.up_2b(x, seg)
478
+ x = self.nearest_upsample2x(x)
479
+ # 256x256
480
+ if self.out_image_small_side_size == 256:
481
+ x256 = self.conv_img256(x)
482
+ x = torch.tanh(self.output_multiplier * x256)
483
+ # 512x512
484
+ elif self.out_image_small_side_size == 512:
485
+ x256 = self.conv_img256(x)
486
+ x256 = self.nearest_upsample2x(x256)
487
+ x = self.up_3a(x, seg)
488
+ x = self.up_3b(x, seg)
489
+ x = self.nearest_upsample2x(x)
490
+ x512 = self.conv_img512(x)
491
+ x = torch.tanh(self.output_multiplier * (x256 + x512))
492
+ # 1024x1024
493
+ elif self.out_image_small_side_size == 1024:
494
+ x256 = self.conv_img256(x)
495
+ x256 = self.nearest_upsample4x(x256)
496
+ x = self.up_3a(x, seg)
497
+ x = self.up_3b(x, seg)
498
+ x = self.nearest_upsample2x(x)
499
+ x512 = self.conv_img512(x)
500
+ x512 = self.nearest_upsample2x(x512)
501
+ x = self.up_4a(x, seg)
502
+ x = self.up_4b(x, seg)
503
+ x = self.nearest_upsample2x(x)
504
+ x1024 = self.conv_img1024(x)
505
+ x = torch.tanh(self.output_multiplier * (x256 + x512 + x1024))
506
+ output = dict()
507
+ output['fake_images'] = x
508
+ return output
509
+
510
+
511
+ class StyleEncoder(nn.Module):
512
+ r"""Style Encode constructor.
513
+
514
+ Args:
515
+ style_enc_cfg (obj): Style encoder configuration object.
516
+ """
517
+
518
+ def __init__(self, style_enc_cfg):
519
+ super(StyleEncoder, self).__init__()
520
+ input_image_channels = style_enc_cfg.input_image_channels
521
+ num_filters = style_enc_cfg.num_filters
522
+ kernel_size = style_enc_cfg.kernel_size
523
+ padding = int(np.ceil((kernel_size - 1.0) / 2))
524
+ style_dims = style_enc_cfg.style_dims
525
+ weight_norm_type = style_enc_cfg.weight_norm_type
526
+ activation_norm_type = 'none'
527
+ nonlinearity = 'leakyrelu'
528
+ base_conv2d_block = \
529
+ functools.partial(Conv2dBlock,
530
+ kernel_size=kernel_size,
531
+ stride=2,
532
+ padding=padding,
533
+ weight_norm_type=weight_norm_type,
534
+ activation_norm_type=activation_norm_type,
535
+ # inplace_nonlinearity=True,
536
+ nonlinearity=nonlinearity)
537
+ self.layer1 = base_conv2d_block(input_image_channels, num_filters)
538
+ self.layer2 = base_conv2d_block(num_filters * 1, num_filters * 2)
539
+ self.layer3 = base_conv2d_block(num_filters * 2, num_filters * 4)
540
+ self.layer4 = base_conv2d_block(num_filters * 4, num_filters * 8)
541
+ self.layer5 = base_conv2d_block(num_filters * 8, num_filters * 8)
542
+ self.layer6 = base_conv2d_block(num_filters * 8, num_filters * 8)
543
+ self.fc_mu = LinearBlock(num_filters * 8 * 4 * 4, style_dims)
544
+ self.fc_var = LinearBlock(num_filters * 8 * 4 * 4, style_dims)
545
+
546
+ def forward(self, input_x):
547
+ r"""SPADE Style Encoder forward.
548
+
549
+ Args:
550
+ input_x (N x 3 x H x W tensor): input images.
551
+ Returns:
552
+ (tuple):
553
+ - mu (N x C tensor): Mean vectors.
554
+ - logvar (N x C tensor): Log-variance vectors.
555
+ - z (N x C tensor): Style code vectors.
556
+ """
557
+ if input_x.size(2) != 256 or input_x.size(3) != 256:
558
+ input_x = F.interpolate(input_x, size=(256, 256), mode='bilinear')
559
+ x = self.layer1(input_x)
560
+ x = self.layer2(x)
561
+ x = self.layer3(x)
562
+ x = self.layer4(x)
563
+ x = self.layer5(x)
564
+ x = self.layer6(x)
565
+ x = x.view(x.size(0), -1)
566
+ mu = self.fc_mu(x)
567
+ logvar = self.fc_var(x)
568
+ std = torch.exp(0.5 * logvar)
569
+ eps = torch.randn_like(std)
570
+ z = eps.mul(std) + mu
571
+ return mu, logvar, z
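A minimal usage sketch of the style encoder above (illustration only; the config fields simply mirror the constructor arguments, and the real values come from the model config). The forward pass returns the posterior mean and log-variance together with a reparameterized style code, from which the usual VAE KL penalty can be formed:

from types import SimpleNamespace
import torch

style_enc_cfg = SimpleNamespace(input_image_channels=3, num_filters=64,
                                kernel_size=3, style_dims=256,
                                weight_norm_type='none')
encoder = StyleEncoder(style_enc_cfg)
images = torch.randn(2, 3, 256, 256)             # batch of real images
mu, logvar, z = encoder(images)                  # z = mu + eps * std (reparameterization)
kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())  # standard VAE KL term
# When use_style_encoder is True, this z is what the generator reads from data['z'].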
imaginaire/layers/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ from .conv import LinearBlock, Conv1dBlock, Conv2dBlock, Conv3dBlock, \
6
+ HyperConv2dBlock, MultiOutConv2dBlock, \
7
+ PartialConv2dBlock, PartialConv3dBlock
8
+ from .residual import ResLinearBlock, Res1dBlock, Res2dBlock, Res3dBlock, \
9
+ HyperRes2dBlock, MultiOutRes2dBlock, UpRes2dBlock, DownRes2dBlock, \
10
+ PartialRes2dBlock, PartialRes3dBlock
11
+ from .non_local import NonLocal2dBlock
12
+
13
+ __all__ = ['Conv1dBlock', 'Conv2dBlock', 'Conv3dBlock', 'LinearBlock',
14
+ 'HyperConv2dBlock', 'MultiOutConv2dBlock',
15
+ 'PartialConv2dBlock', 'PartialConv3dBlock',
16
+ 'Res1dBlock', 'Res2dBlock', 'Res3dBlock',
17
+ 'UpRes2dBlock', 'DownRes2dBlock',
18
+ 'ResLinearBlock', 'HyperRes2dBlock', 'MultiOutRes2dBlock',
19
+ 'PartialRes2dBlock', 'PartialRes3dBlock',
20
+ 'NonLocal2dBlock']
21
+
22
+ try:
23
+ from .repvgg import RepVGG1dBlock, RepVGG2dBlock, RepVGG3dBlock
24
+ from .attn import MultiheadAttention
25
+ __all__.extend(['RepVGG1dBlock', 'RepVGG2dBlock', 'RepVGG3dBlock'])
26
+ except: # noqa
27
+ pass
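A quick construction sketch using the re-exported blocks (illustrative values only; the keyword names follow the Conv2dBlock docstring in conv.py below):

import torch
from imaginaire.layers import Conv2dBlock

# Convolution -> instance norm -> LeakyReLU ('CNA'), with spectral weight norm.
block = Conv2dBlock(3, 64, kernel_size=3, stride=1, padding=1,
                    weight_norm_type='spectral',
                    activation_norm_type='instance',
                    nonlinearity='leakyrelu',
                    order='CNA')
y = block(torch.randn(1, 3, 128, 128))  # -> (1, 64, 128, 128)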
imaginaire/layers/activation_norm.py ADDED
@@ -0,0 +1,629 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ # flake8: noqa E722
6
+ from types import SimpleNamespace
7
+
8
+ import torch
9
+
10
+ try:
11
+ from torch.nn import SyncBatchNorm
12
+ except ImportError:
13
+ from torch.nn import BatchNorm2d as SyncBatchNorm
14
+ from torch import nn
15
+ from torch.nn import functional as F
16
+ from .conv import LinearBlock, Conv2dBlock, HyperConv2d, PartialConv2dBlock
17
+ from .misc import PartialSequential, ApplyNoise
18
+
19
+
20
+ class AdaptiveNorm(nn.Module):
21
+ r"""Adaptive normalization layer. The layer first normalizes the input, then
22
+ performs an affine transformation using parameters computed from the
23
+ conditional inputs.
24
+
25
+ Args:
26
+ num_features (int): Number of channels in the input tensor.
27
+ cond_dims (int): Number of channels in the conditional inputs.
28
+ weight_norm_type (str): Type of weight normalization.
29
+ ``'none'``, ``'spectral'``, ``'weight'``, or ``'weight_demod'``.
30
+ projection (bool): If ``True``, project the conditional input to gamma
31
+ and beta using a fully connected layer, otherwise directly use
32
+ the conditional input as gamma and beta.
33
+ projection_bias (bool): If ``True``, use bias in the fully connected
34
+ projection layer.
35
+ separate_projection (bool): If ``True``, we will use two different
36
+ layers for gamma and beta. Otherwise, we will use one layer. It
37
+ matters only if you apply any weight norms to this layer.
38
+ input_dim (int): Number of dimensions of the input tensor.
39
+ activation_norm_type (str):
40
+ Type of activation normalization.
41
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
42
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
43
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
44
+ activation_norm_params (obj, optional, default=None):
45
+ Parameters of activation normalization.
46
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
47
+ keyword arguments when initializing activation normalization.
48
+ """
49
+
50
+ def __init__(self, num_features, cond_dims, weight_norm_type='',
51
+ projection=True,
52
+ projection_bias=True,
53
+ separate_projection=False,
54
+ input_dim=2,
55
+ activation_norm_type='instance',
56
+ activation_norm_params=None,
57
+ apply_noise=False,
58
+ add_bias=True,
59
+ input_scale=1.0,
60
+ init_gain=1.0):
61
+ super().__init__()
62
+ if activation_norm_params is None:
63
+ activation_norm_params = SimpleNamespace(affine=False)
64
+ self.norm = get_activation_norm_layer(num_features,
65
+ activation_norm_type,
66
+ input_dim,
67
+ **vars(activation_norm_params))
68
+ if apply_noise:
69
+ self.noise_layer = ApplyNoise()
70
+ else:
71
+ self.noise_layer = None
72
+
73
+ if projection:
74
+ if separate_projection:
75
+ self.fc_gamma = \
76
+ LinearBlock(cond_dims, num_features,
77
+ weight_norm_type=weight_norm_type,
78
+ bias=projection_bias)
79
+ self.fc_beta = \
80
+ LinearBlock(cond_dims, num_features,
81
+ weight_norm_type=weight_norm_type,
82
+ bias=projection_bias)
83
+ else:
84
+ self.fc = LinearBlock(cond_dims, num_features * 2,
85
+ weight_norm_type=weight_norm_type,
86
+ bias=projection_bias)
87
+
88
+ self.projection = projection
89
+ self.separate_projection = separate_projection
90
+ self.input_scale = input_scale
91
+ self.add_bias = add_bias
92
+ self.conditional = True
93
+ self.init_gain = init_gain
94
+
95
+ def forward(self, x, y, noise=None, **_kwargs):
96
+ r"""Adaptive Normalization forward.
97
+
98
+ Args:
99
+ x (N x C1 x * tensor): Input tensor.
100
+ y (N x C2 tensor): Conditional information.
101
+ Returns:
102
+ out (N x C1 x * tensor): Output tensor.
103
+ """
104
+ y = y * self.input_scale
105
+ if self.projection:
106
+ if self.separate_projection:
107
+ gamma = self.fc_gamma(y)
108
+ beta = self.fc_beta(y)
109
+ for _ in range(x.dim() - gamma.dim()):
110
+ gamma = gamma.unsqueeze(-1)
111
+ beta = beta.unsqueeze(-1)
112
+ else:
113
+ y = self.fc(y)
114
+ for _ in range(x.dim() - y.dim()):
115
+ y = y.unsqueeze(-1)
116
+ gamma, beta = y.chunk(2, 1)
117
+ else:
118
+ for _ in range(x.dim() - y.dim()):
119
+ y = y.unsqueeze(-1)
120
+ gamma, beta = y.chunk(2, 1)
121
+ if self.norm is not None:
122
+ x = self.norm(x)
123
+ if self.noise_layer is not None:
124
+ x = self.noise_layer(x, noise=noise)
125
+ if self.add_bias:
126
+ x = torch.addcmul(beta, x, 1 + gamma)
127
+ return x
128
+ else:
129
+ return x * (1 + gamma), beta.squeeze(3).squeeze(2)
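# A minimal usage sketch of AdaptiveNorm (illustration only, assuming the
# defaults above): the input is normalized, then modulated per channel with an
# affine transform predicted from the conditioning vector,
#     out = norm(x) * (1 + gamma) + beta,   where gamma, beta = fc(y).chunk(2, 1)
adanorm = AdaptiveNorm(num_features=64, cond_dims=128)
x = torch.randn(4, 64, 32, 32)   # feature map to modulate
y = torch.randn(4, 128)          # conditioning code, e.g. a mapped style vector
out = adanorm(x, y)              # same shape as x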
130
+
131
+
132
+ class SpatiallyAdaptiveNorm(nn.Module):
133
+ r"""Spatially Adaptive Normalization (SPADE) initialization.
134
+
135
+ Args:
136
+ num_features (int) : Number of channels in the input tensor.
137
+ cond_dims (int or list of int) : List of numbers of channels
138
+ in the conditional inputs.
139
+ num_filters (int): Number of filters in SPADE.
140
+ kernel_size (int): Kernel size of the convolutional filters in
141
+ the SPADE layer.
142
+ weight_norm_type (str): Type of weight normalization.
143
+ ``'none'``, ``'spectral'``, or ``'weight'``.
144
+ separate_projection (bool): If ``True``, we will use two different
145
+ layers for gamma and beta. Otherwise, we will use one layer. It
146
+ matters only if you apply any weight norms to this layer.
147
+ activation_norm_type (str):
148
+ Type of activation normalization.
149
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
150
+ ``'layer'``, ``'layer_2d'``, ``'group'``.
151
+ activation_norm_params (obj, optional, default=None):
152
+ Parameters of activation normalization.
153
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
154
+ keyword arguments when initializing activation normalization.
155
+ """
156
+
157
+ def __init__(self,
158
+ num_features,
159
+ cond_dims,
160
+ num_filters=128,
161
+ kernel_size=3,
162
+ weight_norm_type='',
163
+ separate_projection=False,
164
+ activation_norm_type='sync_batch',
165
+ activation_norm_params=None,
166
+ bias_only=False,
167
+ partial=False,
168
+ interpolation='nearest'):
169
+ super().__init__()
170
+ if activation_norm_params is None:
171
+ activation_norm_params = SimpleNamespace(affine=False)
172
+ padding = kernel_size // 2
173
+ self.separate_projection = separate_projection
174
+ self.mlps = nn.ModuleList()
175
+ self.gammas = nn.ModuleList()
176
+ self.betas = nn.ModuleList()
177
+ self.bias_only = bias_only
178
+ self.interpolation = interpolation
179
+
180
+ # Make cond_dims a list.
181
+ if type(cond_dims) != list:
182
+ cond_dims = [cond_dims]
183
+
184
+ # Make num_filters a list.
185
+ if not isinstance(num_filters, list):
186
+ num_filters = [num_filters] * len(cond_dims)
187
+ else:
188
+ assert len(num_filters) >= len(cond_dims)
189
+
190
+ # Make partial a list.
191
+ if not isinstance(partial, list):
192
+ partial = [partial] * len(cond_dims)
193
+ else:
194
+ assert len(partial) >= len(cond_dims)
195
+
196
+ for i, cond_dim in enumerate(cond_dims):
197
+ mlp = []
198
+ conv_block = PartialConv2dBlock if partial[i] else Conv2dBlock
199
+ sequential = PartialSequential if partial[i] else nn.Sequential
200
+
201
+ if num_filters[i] > 0:
202
+ mlp += [conv_block(cond_dim,
203
+ num_filters[i],
204
+ kernel_size,
205
+ padding=padding,
206
+ weight_norm_type=weight_norm_type,
207
+ nonlinearity='relu')]
208
+ mlp_ch = cond_dim if num_filters[i] == 0 else num_filters[i]
209
+
210
+ if self.separate_projection:
211
+ if partial[i]:
212
+ raise NotImplementedError(
213
+ 'Separate projection not yet implemented for ' +
214
+ 'partial conv')
215
+ self.mlps.append(nn.Sequential(*mlp))
216
+ self.gammas.append(
217
+ conv_block(mlp_ch, num_features,
218
+ kernel_size,
219
+ padding=padding,
220
+ weight_norm_type=weight_norm_type))
221
+ self.betas.append(
222
+ conv_block(mlp_ch, num_features,
223
+ kernel_size,
224
+ padding=padding,
225
+ weight_norm_type=weight_norm_type))
226
+ else:
227
+ mlp += [conv_block(mlp_ch, num_features * 2, kernel_size,
228
+ padding=padding,
229
+ weight_norm_type=weight_norm_type)]
230
+ self.mlps.append(sequential(*mlp))
231
+
232
+ self.norm = get_activation_norm_layer(num_features,
233
+ activation_norm_type,
234
+ 2,
235
+ **vars(activation_norm_params))
236
+ self.conditional = True
237
+
238
+ def forward(self, x, *cond_inputs, **_kwargs):
239
+ r"""Spatially Adaptive Normalization (SPADE) forward.
240
+
241
+ Args:
242
+ x (N x C1 x H x W tensor) : Input tensor.
243
+ cond_inputs (list of tensors) : Conditional maps for SPADE.
244
+ Returns:
245
+ output (4D tensor) : Output tensor.
246
+ """
247
+ output = self.norm(x) if self.norm is not None else x
248
+ for i in range(len(cond_inputs)):
249
+ if cond_inputs[i] is None:
250
+ continue
251
+ label_map = F.interpolate(cond_inputs[i], size=x.size()[2:], mode=self.interpolation)
252
+ if self.separate_projection:
253
+ hidden = self.mlps[i](label_map)
254
+ gamma = self.gammas[i](hidden)
255
+ beta = self.betas[i](hidden)
256
+ else:
257
+ affine_params = self.mlps[i](label_map)
258
+ gamma, beta = affine_params.chunk(2, dim=1)
259
+ if self.bias_only:
260
+ output = output + beta
261
+ else:
262
+ output = output * (1 + gamma) + beta
263
+ return output
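# A minimal usage sketch of SPADE (illustration only; 'instance' norm is chosen
# here so the sketch runs without distributed sync_batch): gamma and beta are
# predicted per pixel from a conditioning map that is resized to the feature
# resolution, then out = norm(x) * (1 + gamma) + beta.
spade = SpatiallyAdaptiveNorm(num_features=64, cond_dims=35,
                              activation_norm_type='instance')
x = torch.randn(2, 64, 32, 32)      # generator feature map
seg = torch.randn(2, 35, 256, 256)  # semantic map, interpolated to 32x32 internally
out = spade(x, seg)                 # same shape as x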
264
+
265
+
266
+ class DualAdaptiveNorm(nn.Module):
267
+ def __init__(self,
268
+ num_features,
269
+ cond_dims,
270
+ projection_bias=True,
271
+ weight_norm_type='',
272
+ activation_norm_type='instance',
273
+ activation_norm_params=None,
274
+ apply_noise=False,
275
+ bias_only=False,
276
+ init_gain=1.0,
277
+ fc_scale=None,
278
+ is_spatial=None):
279
+ super().__init__()
280
+ if activation_norm_params is None:
281
+ activation_norm_params = SimpleNamespace(affine=False)
282
+ self.mlps = nn.ModuleList()
283
+ self.gammas = nn.ModuleList()
284
+ self.betas = nn.ModuleList()
285
+ self.bias_only = bias_only
286
+
287
+ # Make cond_dims a list.
288
+ if type(cond_dims) != list:
289
+ cond_dims = [cond_dims]
290
+
291
+ if is_spatial is None:
292
+ is_spatial = [False for _ in range(len(cond_dims))]
293
+ self.is_spatial = is_spatial
294
+
295
+ for cond_dim, this_is_spatial in zip(cond_dims, is_spatial):
296
+ kwargs = dict(weight_norm_type=weight_norm_type,
297
+ bias=projection_bias,
298
+ init_gain=init_gain,
299
+ output_scale=fc_scale)
300
+ if this_is_spatial:
301
+ self.gammas.append(Conv2dBlock(cond_dim, num_features, 1, 1, 0, **kwargs))
302
+ self.betas.append(Conv2dBlock(cond_dim, num_features, 1, 1, 0, **kwargs))
303
+ else:
304
+ self.gammas.append(LinearBlock(cond_dim, num_features, **kwargs))
305
+ self.betas.append(LinearBlock(cond_dim, num_features, **kwargs))
306
+
307
+ self.norm = get_activation_norm_layer(num_features,
308
+ activation_norm_type,
309
+ 2,
310
+ **vars(activation_norm_params))
311
+ self.conditional = True
312
+
313
+ def forward(self, x, *cond_inputs, **_kwargs):
314
+ assert len(cond_inputs) == len(self.gammas)
315
+ output = self.norm(x) if self.norm is not None else x
316
+ for cond, gamma_layer, beta_layer in zip(cond_inputs, self.gammas, self.betas):
317
+ if cond is None:
318
+ continue
319
+ gamma = gamma_layer(cond)
320
+ beta = beta_layer(cond)
321
+ if cond.dim() == 4 and gamma.shape != x.shape:
322
+ gamma = F.interpolate(gamma, size=x.size()[2:], mode='bilinear')
323
+ beta = F.interpolate(beta, size=x.size()[2:], mode='bilinear')
324
+ elif cond.dim() == 2:
325
+ gamma = gamma[:, :, None, None]
326
+ beta = beta[:, :, None, None]
327
+ if self.bias_only:
328
+ output = output + beta
329
+ else:
330
+ output = output * (1 + gamma) + beta
331
+ return output
332
+
333
+
334
+ class HyperSpatiallyAdaptiveNorm(nn.Module):
335
+ r"""Spatially Adaptive Normalization (SPADE) initialization.
336
+
337
+ Args:
338
+ num_features (int) : Number of channels in the input tensor.
339
+ cond_dims (int or list of int) : List of numbers of channels
340
+ in the conditional input.
341
+ num_filters (int): Number of filters in SPADE.
342
+ kernel_size (int): Kernel size of the convolutional filters in
343
+ the SPADE layer.
344
+ weight_norm_type (str): Type of weight normalization.
345
+ ``'none'``, ``'spectral'``, or ``'weight'``.
346
+ activation_norm_type (str):
347
+ Type of activation normalization.
348
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
349
+ ``'layer'``, ``'layer_2d'``, ``'group'``.
350
+ is_hyper (bool): Whether to use hyper SPADE.
351
+ """
352
+
353
+ def __init__(self, num_features, cond_dims,
354
+ num_filters=0, kernel_size=3,
355
+ weight_norm_type='',
356
+ activation_norm_type='sync_batch', is_hyper=True):
357
+ super().__init__()
358
+ padding = kernel_size // 2
359
+ self.mlps = nn.ModuleList()
360
+ if type(cond_dims) != list:
361
+ cond_dims = [cond_dims]
362
+
363
+ for i, cond_dim in enumerate(cond_dims):
364
+ mlp = []
365
+ if not is_hyper or (i != 0):
366
+ if num_filters > 0:
367
+ mlp += [Conv2dBlock(cond_dim, num_filters, kernel_size,
368
+ padding=padding,
369
+ weight_norm_type=weight_norm_type,
370
+ nonlinearity='relu')]
371
+ mlp_ch = cond_dim if num_filters == 0 else num_filters
372
+ mlp += [Conv2dBlock(mlp_ch, num_features * 2, kernel_size,
373
+ padding=padding,
374
+ weight_norm_type=weight_norm_type)]
375
+ mlp = nn.Sequential(*mlp)
376
+ else:
377
+ if num_filters > 0:
378
+ raise ValueError('Multi hyper layer not supported yet.')
379
+ mlp = HyperConv2d(padding=padding)
380
+ self.mlps.append(mlp)
381
+
382
+ self.norm = get_activation_norm_layer(num_features,
383
+ activation_norm_type,
384
+ 2,
385
+ affine=False)
386
+
387
+ self.conditional = True
388
+
389
+ def forward(self, x, *cond_inputs,
390
+ norm_weights=(None, None), **_kwargs):
391
+ r"""Spatially Adaptive Normalization (SPADE) forward.
392
+
393
+ Args:
394
+ x (4D tensor) : Input tensor.
395
+ cond_inputs (list of tensors) : Conditional maps for SPADE.
396
+ norm_weights (5D tensor or list of tensors): conv weights or
397
+ [weights, biases].
398
+ Returns:
399
+ output (4D tensor) : Output tensor.
400
+ """
401
+ output = self.norm(x)
402
+ for i in range(len(cond_inputs)):
403
+ if cond_inputs[i] is None:
404
+ continue
405
+ if type(cond_inputs[i]) == list:
406
+ cond_input, mask = cond_inputs[i]
407
+ mask = F.interpolate(mask, size=x.size()[2:], mode='bilinear', align_corners=False)
408
+ else:
409
+ cond_input = cond_inputs[i]
410
+ mask = None
411
+ label_map = F.interpolate(cond_input, size=x.size()[2:])
412
+ if norm_weights is None or norm_weights[0] is None or i != 0:
413
+ affine_params = self.mlps[i](label_map)
414
+ else:
415
+ affine_params = self.mlps[i](label_map,
416
+ conv_weights=norm_weights)
417
+ gamma, beta = affine_params.chunk(2, dim=1)
418
+ if mask is not None:
419
+ gamma = gamma * (1 - mask)
420
+ beta = beta * (1 - mask)
421
+ output = output * (1 + gamma) + beta
422
+ return output
423
+
424
+
425
+ class LayerNorm2d(nn.Module):
426
+ r"""Layer Normalization as introduced in
427
+ https://arxiv.org/abs/1607.06450.
428
+ This is the usual way to apply layer normalization in CNNs.
429
+ Note that unlike the pytorch implementation which applies per-element
430
+ scale and bias, here it applies per-channel scale and bias, similar to
431
+ batch/instance normalization.
432
+
433
+ Args:
434
+ num_features (int): Number of channels in the input tensor.
435
+ eps (float, optional, default=1e-5): a value added to the
436
+ denominator for numerical stability.
437
+ affine (bool, optional, default=False): If ``True``, performs
438
+ affine transformation after normalization.
439
+ """
440
+
441
+ def __init__(self, num_features, eps=1e-5, channel_only=False, affine=True):
442
+ super(LayerNorm2d, self).__init__()
443
+ self.num_features = num_features
444
+ self.affine = affine
445
+ self.eps = eps
446
+ self.channel_only = channel_only
447
+
448
+ if self.affine:
449
+ self.gamma = nn.Parameter(torch.Tensor(num_features).fill_(1.0))
450
+ self.beta = nn.Parameter(torch.zeros(num_features))
451
+
452
+ def forward(self, x):
453
+ r"""
454
+
455
+ Args:
456
+ x (tensor): Input tensor.
457
+ """
458
+ shape = [-1] + [1] * (x.dim() - 1)
459
+ if self.channel_only:
460
+ mean = x.mean(1, keepdim=True)
461
+ std = x.std(1, keepdim=True)
462
+ else:
463
+ mean = x.view(x.size(0), -1).mean(1).view(*shape)
464
+ std = x.view(x.size(0), -1).std(1).view(*shape)
465
+
466
+ x = (x - mean) / (std + self.eps)
467
+
468
+ if self.affine:
469
+ shape = [1, -1] + [1] * (x.dim() - 2)
470
+ x = x * self.gamma.view(*shape) + self.beta.view(*shape)
471
+ return x
472
+
473
+
474
+ class ScaleNorm(nn.Module):
475
+ r"""Scale normalization:
476
+ "Transformers without Tears: Improving the Normalization of Self-Attention"
477
+ Modified from:
478
+ https://github.com/tnq177/transformers_without_tears
479
+ """
480
+
481
+ def __init__(self, dim=-1, learned_scale=True, eps=1e-5):
482
+ super().__init__()
483
+ # scale = num_features ** 0.5
484
+ if learned_scale:
485
+ self.scale = nn.Parameter(torch.tensor(1.))
486
+ else:
487
+ self.scale = 1.
488
+ # self.num_features = num_features
489
+ self.dim = dim
490
+ self.eps = eps
491
+ self.learned_scale = learned_scale
492
+
493
+ def forward(self, x):
494
+ # noinspection PyArgumentList
495
+ scale = self.scale * torch.rsqrt(torch.mean(x ** 2, dim=self.dim, keepdim=True) + self.eps)
496
+ return x * scale
497
+
498
+ def extra_repr(self):
499
+ s = 'learned_scale={learned_scale}'
500
+ return s.format(**self.__dict__)
501
+
502
+
503
+ class PixelNorm(ScaleNorm):
504
+ def __init__(self, learned_scale=False, eps=1e-5, **_kwargs):
505
+ super().__init__(1, learned_scale, eps)
506
+
507
+
508
+ class SplitMeanStd(nn.Module):
509
+ def __init__(self, num_features, eps=1e-5, **kwargs):
510
+ super().__init__()
511
+ self.num_features = num_features
512
+ self.eps = eps
513
+ self.multiple_outputs = True
514
+
515
+ def forward(self, x):
516
+ b, c, h, w = x.size()
517
+ mean = x.view(b, c, -1).mean(-1)[:, :, None, None]
518
+ var = x.view(b, c, -1).var(-1)[:, :, None, None]
519
+ std = torch.sqrt(var + self.eps)
520
+
521
+ # x = (x - mean) / std
522
+ return x, torch.cat((mean, std), dim=1)
523
+
524
+
525
+ class ScaleNorm(nn.Module):
526
+ r"""Scale normalization:
527
+ "Transformers without Tears: Improving the Normalization of Self-Attention"
528
+ Modified from:
529
+ https://github.com/tnq177/transformers_without_tears
530
+ """
531
+
532
+ def __init__(self, dim=-1, learned_scale=True, eps=1e-5):
533
+ super().__init__()
534
+ # scale = num_features ** 0.5
535
+ if learned_scale:
536
+ self.scale = nn.Parameter(torch.tensor(1.))
537
+ else:
538
+ self.scale = 1.
539
+ # self.num_features = num_features
540
+ self.dim = dim
541
+ self.eps = eps
542
+ self.learned_scale = learned_scale
543
+
544
+ def forward(self, x):
545
+ # noinspection PyArgumentList
546
+ scale = self.scale * torch.rsqrt(
547
+ torch.mean(x ** 2, dim=self.dim, keepdim=True) + self.eps)
548
+ return x * scale
549
+
550
+ def extra_repr(self):
551
+ s = 'learned_scale={learned_scale}'
552
+ return s.format(**self.__dict__)
553
+
554
+
555
+ class PixelLayerNorm(nn.Module):
556
+ def __init__(self, *args, **kwargs):
557
+ super().__init__()
558
+ self.norm = nn.LayerNorm(*args, **kwargs)
559
+
560
+ def forward(self, x):
561
+ if x.dim() == 4:
562
+ b, c, h, w = x.shape
563
+ return self.norm(x.permute(0, 2, 3, 1).view(-1, c).contiguous()).view(b, h, w, c).permute(0, 3, 1, 2).contiguous()
564
+ else:
565
+ return self.norm(x)
566
+
567
+
568
+ def get_activation_norm_layer(num_features, norm_type, input_dim, **norm_params):
569
+ r"""Return an activation normalization layer.
570
+
571
+ Args:
572
+ num_features (int): Number of feature channels.
573
+ norm_type (str):
574
+ Type of activation normalization.
575
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
576
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
577
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
578
+ input_dim (int): Number of input dimensions.
579
+ norm_params: Arbitrary keyword arguments that will be used to
580
+ initialize the activation normalization.
581
+ """
582
+ input_dim = max(input_dim, 1) # Norm1d works with both 0d and 1d inputs
583
+
584
+ if norm_type == 'none' or norm_type == '':
585
+ norm_layer = None
586
+ elif norm_type == 'batch':
587
+ norm = getattr(nn, 'BatchNorm%dd' % input_dim)
588
+ norm_layer = norm(num_features, **norm_params)
589
+ elif norm_type == 'instance':
590
+ affine = norm_params.pop('affine', True) # Use affine=True by default
591
+ norm = getattr(nn, 'InstanceNorm%dd' % input_dim)
592
+ norm_layer = norm(num_features, affine=affine, **norm_params)
593
+ elif norm_type == 'sync_batch':
594
+ norm_layer = SyncBatchNorm(num_features, **norm_params)
595
+ elif norm_type == 'layer':
596
+ norm_layer = nn.LayerNorm(num_features, **norm_params)
597
+ elif norm_type == 'layer_2d':
598
+ norm_layer = LayerNorm2d(num_features, **norm_params)
599
+ elif norm_type == 'pixel_layer':
600
+ elementwise_affine = norm_params.pop('affine', True) # Use affine=True by default
601
+ norm_layer = PixelLayerNorm(num_features, elementwise_affine=elementwise_affine, **norm_params)
602
+ elif norm_type == 'scale':
603
+ norm_layer = ScaleNorm(**norm_params)
604
+ elif norm_type == 'pixel':
605
+ norm_layer = PixelNorm(**norm_params)
606
+ import imaginaire.config
607
+ if imaginaire.config.USE_JIT:
608
+ norm_layer = torch.jit.script(norm_layer)
609
+ elif norm_type == 'group':
610
+ num_groups = norm_params.pop('num_groups', 4)
611
+ norm_layer = nn.GroupNorm(num_channels=num_features, num_groups=num_groups, **norm_params)
612
+ elif norm_type == 'adaptive':
613
+ norm_layer = AdaptiveNorm(num_features, **norm_params)
614
+ elif norm_type == 'dual_adaptive':
615
+ norm_layer = DualAdaptiveNorm(num_features, **norm_params)
616
+ elif norm_type == 'spatially_adaptive':
617
+ if input_dim != 2:
618
+ raise ValueError('Spatially adaptive normalization layers '
619
+ 'only support 2D inputs')
620
+ norm_layer = SpatiallyAdaptiveNorm(num_features, **norm_params)
621
+ elif norm_type == 'hyper_spatially_adaptive':
622
+ if input_dim != 2:
623
+ raise ValueError('Spatially adaptive normalization layers '
624
+ 'only support 2D inputs')
625
+ norm_layer = HyperSpatiallyAdaptiveNorm(num_features, **norm_params)
626
+ else:
627
+ raise ValueError('Activation norm layer %s '
628
+ 'is not recognized' % norm_type)
629
+ return norm_layer
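The factory above is what the conv and residual blocks call internally; a small sketch of resolving a norm layer by name (values are arbitrary):

import torch

norm = get_activation_norm_layer(64, 'instance', input_dim=2)  # affine=True by default
x = torch.randn(1, 64, 16, 16)
assert norm(x).shape == x.shape

# 'none' (or '') returns None, so callers guard with `if norm is not None`.
assert get_activation_norm_layer(64, 'none', input_dim=2) is None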
imaginaire/layers/conv.py ADDED
@@ -0,0 +1,1377 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import warnings
6
+ from types import SimpleNamespace
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+
12
+ from .misc import ApplyNoise
13
+ from imaginaire.third_party.upfirdn2d.upfirdn2d import Blur
14
+
15
+
16
+ class _BaseConvBlock(nn.Module):
17
+ r"""An abstract wrapper class that wraps a torch convolution or linear layer
18
+ with normalization and nonlinearity.
19
+ """
20
+
21
+ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode,
22
+ weight_norm_type, weight_norm_params, activation_norm_type, activation_norm_params, nonlinearity,
23
+ inplace_nonlinearity, apply_noise, blur, order, input_dim, clamp, blur_kernel, output_scale,
24
+ init_gain):
25
+ super().__init__()
26
+ from .nonlinearity import get_nonlinearity_layer
27
+ from .weight_norm import get_weight_norm_layer
28
+ from .activation_norm import get_activation_norm_layer
29
+ self.weight_norm_type = weight_norm_type
30
+ self.stride = stride
31
+ self.clamp = clamp
32
+ self.init_gain = init_gain
33
+
34
+ # Nonlinearity layer.
35
+ if 'fused' in nonlinearity:
36
+ # Fusing nonlinearity with bias.
37
+ lr_mul = getattr(weight_norm_params, 'lr_mul', 1)
38
+ conv_before_nonlinearity = order.find('C') < order.find('A')
39
+ if conv_before_nonlinearity:
40
+ assert bias is True
41
+ bias = False
42
+ channel = out_channels if conv_before_nonlinearity else in_channels
43
+ nonlinearity_layer = get_nonlinearity_layer(
44
+ nonlinearity, inplace=inplace_nonlinearity,
45
+ num_channels=channel, lr_mul=lr_mul)
46
+ else:
47
+ nonlinearity_layer = get_nonlinearity_layer(
48
+ nonlinearity, inplace=inplace_nonlinearity)
49
+
50
+ # Noise injection layer.
51
+ if apply_noise:
52
+ order = order.replace('C', 'CG')
53
+ noise_layer = ApplyNoise()
54
+ else:
55
+ noise_layer = None
56
+
57
+ # Convolutional layer.
58
+ if blur:
59
+ assert blur_kernel is not None
60
+ if stride == 2:
61
+ # Blur - Conv - Noise - Activate
62
+ p = (len(blur_kernel) - 2) + (kernel_size - 1)
63
+ pad0, pad1 = (p + 1) // 2, p // 2
64
+ padding = 0
65
+ blur_layer = Blur(
66
+ blur_kernel, pad=(pad0, pad1), padding_mode=padding_mode
67
+ )
68
+ order = order.replace('C', 'BC')
69
+ elif stride == 0.5:
70
+ # Conv - Blur - Noise - Activate
71
+ padding = 0
72
+ p = (len(blur_kernel) - 2) - (kernel_size - 1)
73
+ pad0, pad1 = (p + 1) // 2 + 1, p // 2 + 1
74
+ blur_layer = Blur(
75
+ blur_kernel, pad=(pad0, pad1), padding_mode=padding_mode
76
+ )
77
+ order = order.replace('C', 'CB')
78
+ elif stride == 1:
79
+ # No blur for now
80
+ blur_layer = nn.Identity()
81
+ else:
82
+ raise NotImplementedError
83
+ else:
84
+ blur_layer = nn.Identity()
85
+
86
+ if weight_norm_params is None:
87
+ weight_norm_params = SimpleNamespace()
88
+ weight_norm = get_weight_norm_layer(
89
+ weight_norm_type, **vars(weight_norm_params))
90
+ conv_layer = weight_norm(self._get_conv_layer(
91
+ in_channels, out_channels, kernel_size, stride, padding, dilation,
92
+ groups, bias, padding_mode, input_dim))
93
+
94
+ # Normalization layer.
95
+ conv_before_norm = order.find('C') < order.find('N')
96
+ norm_channels = out_channels if conv_before_norm else in_channels
97
+ if activation_norm_params is None:
98
+ activation_norm_params = SimpleNamespace()
99
+ activation_norm_layer = get_activation_norm_layer(
100
+ norm_channels,
101
+ activation_norm_type,
102
+ input_dim,
103
+ **vars(activation_norm_params))
104
+
105
+ # Mapping from operation names to layers.
106
+ mappings = {'C': {'conv': conv_layer},
107
+ 'N': {'norm': activation_norm_layer},
108
+ 'A': {'nonlinearity': nonlinearity_layer}}
109
+ mappings.update({'B': {'blur': blur_layer}})
110
+ mappings.update({'G': {'noise': noise_layer}})
111
+
112
+ # All layers in order.
113
+ self.layers = nn.ModuleDict()
114
+ for op in order:
115
+ if list(mappings[op].values())[0] is not None:
116
+ self.layers.update(mappings[op])
117
+
118
+ # Whether this block expects conditional inputs.
119
+ self.conditional = \
120
+ getattr(conv_layer, 'conditional', False) or \
121
+ getattr(activation_norm_layer, 'conditional', False)
122
+
123
+ # Scale the output by a learnable scaler parameter.
124
+ if output_scale is not None:
125
+ self.output_scale = nn.Parameter(torch.tensor(output_scale))
126
+ else:
127
+ self.register_parameter("output_scale", None)
128
+
129
+ def forward(self, x, *cond_inputs, **kw_cond_inputs):
130
+ r"""
131
+
132
+ Args:
133
+ x (tensor): Input tensor.
134
+ cond_inputs (list of tensors) : Conditional input tensors.
135
+ kw_cond_inputs (dict) : Keyword conditional inputs.
136
+ """
137
+ for key, layer in self.layers.items():
138
+ if getattr(layer, 'conditional', False):
139
+ # Layers that require conditional inputs.
140
+ x = layer(x, *cond_inputs, **kw_cond_inputs)
141
+ else:
142
+ x = layer(x)
143
+ if self.clamp is not None and isinstance(layer, nn.Conv2d):
144
+ x.clamp_(max=self.clamp)
145
+ if key == 'conv':
146
+ if self.output_scale is not None:
147
+ x = x * self.output_scale
148
+ return x
149
+
150
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
151
+ padding, dilation, groups, bias, padding_mode,
152
+ input_dim):
153
+ # Returns the convolutional layer.
154
+ if input_dim == 0:
155
+ layer = nn.Linear(in_channels, out_channels, bias)
156
+ else:
157
+ if stride < 1: # Fractionally-strided convolution.
158
+ padding_mode = 'zeros'
159
+ assert padding == 0
160
+ layer_type = getattr(nn, f'ConvTranspose{input_dim}d')
161
+ stride = round(1 / stride)
162
+ else:
163
+ layer_type = getattr(nn, f'Conv{input_dim}d')
164
+ layer = layer_type(
165
+ in_channels, out_channels, kernel_size, stride, padding,
166
+ dilation=dilation, groups=groups, bias=bias,
167
+ padding_mode=padding_mode
168
+ )
169
+
170
+ return layer
171
+
172
+ def __repr__(self):
173
+ main_str = self._get_name() + '('
174
+ child_lines = []
175
+ for name, layer in self.layers.items():
176
+ mod_str = repr(layer)
177
+ if name == 'conv' and self.weight_norm_type != 'none' and \
178
+ self.weight_norm_type != '':
179
+ mod_str = mod_str[:-1] + \
180
+ ', weight_norm={}'.format(self.weight_norm_type) + ')'
181
+ if name == 'conv' and getattr(layer, 'base_lr_mul', 1) != 1:
182
+ mod_str = mod_str[:-1] + \
183
+ ', lr_mul={}'.format(layer.base_lr_mul) + ')'
184
+ mod_str = self._addindent(mod_str, 2)
185
+ child_lines.append(mod_str)
186
+ if len(child_lines) == 1:
187
+ main_str += child_lines[0]
188
+ else:
189
+ main_str += '\n ' + '\n '.join(child_lines) + '\n'
190
+
191
+ main_str += ')'
192
+ return main_str
193
+
194
+ @staticmethod
195
+ def _addindent(s_, numSpaces):
196
+ s = s_.split('\n')
197
+ # don't do anything for single-line stuff
198
+ if len(s) == 1:
199
+ return s_
200
+ first = s.pop(0)
201
+ s = [(numSpaces * ' ') + line for line in s]
202
+ s = '\n'.join(s)
203
+ s = first + '\n' + s
204
+ return s
205
+
206
+
207
+ class ModulatedConv2dBlock(_BaseConvBlock):
208
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
209
+ padding=0, dilation=1, groups=1, bias=True,
210
+ padding_mode='zeros',
211
+ weight_norm_type='none', weight_norm_params=None,
212
+ activation_norm_type='none', activation_norm_params=None,
213
+ nonlinearity='none', inplace_nonlinearity=False,
214
+ apply_noise=True, blur=True, order='CNA', demodulate=True,
215
+ eps=True, style_dim=None, clamp=None, blur_kernel=(1, 3, 3, 1), output_scale=None, init_gain=1.0):
216
+ self.eps = eps
217
+ self.demodulate = demodulate
218
+ assert style_dim is not None
219
+
220
+ super().__init__(in_channels, out_channels, kernel_size, stride,
221
+ padding, dilation, groups, bias, padding_mode,
222
+ weight_norm_type, weight_norm_params,
223
+ activation_norm_type, activation_norm_params,
224
+ nonlinearity, inplace_nonlinearity, apply_noise, blur,
225
+ order, 2, clamp, blur_kernel, output_scale, init_gain)
226
+ self.modulation = LinearBlock(style_dim, in_channels,
227
+ weight_norm_type=weight_norm_type,
228
+ weight_norm_params=weight_norm_params)
229
+
230
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
231
+ padding, dilation, groups, bias, padding_mode,
232
+ input_dim):
233
+ assert input_dim == 2
234
+ layer = ModulatedConv2d(
235
+ in_channels, out_channels, kernel_size, stride, padding,
236
+ dilation, groups, bias, padding_mode, self.demodulate, self.eps)
237
+ return layer
238
+
239
+ def forward(self, x, *cond_inputs, **kw_cond_inputs):
240
+ for layer in self.layers.values():
241
+ if getattr(layer, 'conditional', False):
242
+ # Layers that require conditional inputs.
243
+ assert len(cond_inputs) == 1
244
+ style = cond_inputs[0]
245
+ x = layer(
246
+ x, self.modulation(style), **kw_cond_inputs
247
+ )
248
+ else:
249
+ x = layer(x)
250
+ if self.clamp is not None and isinstance(layer, ModulatedConv2d):
251
+ x.clamp_(max=self.clamp)
252
+ return x
253
+
254
+ def __repr__(self):
255
+ main_str = self._get_name() + '('
256
+ child_lines = []
257
+ for name, layer in self.layers.items():
258
+ mod_str = repr(layer)
259
+ if name == 'conv' and self.weight_norm_type != 'none' and \
260
+ self.weight_norm_type != '':
261
+ mod_str = mod_str[:-1] + \
262
+ ', weight_norm={}'.format(self.weight_norm_type) + \
263
+ ', demodulate={}'.format(self.demodulate) + ')'
264
+ mod_str = self._addindent(mod_str, 2)
265
+ child_lines.append(mod_str)
266
+ child_lines.append(
267
+ self._addindent('Modulation(' + repr(self.modulation) + ')', 2)
268
+ )
269
+ if len(child_lines) == 1:
270
+ main_str += child_lines[0]
271
+ else:
272
+ main_str += '\n ' + '\n '.join(child_lines) + '\n'
273
+
274
+ main_str += ')'
275
+ return main_str
276
+
277
+
278
+ class ModulatedConv2d(nn.Module):
279
+ def __init__(self, in_channels, out_channels, kernel_size, stride, padding,
280
+ dilation, groups, bias, padding_mode, demodulate=True,
281
+ eps=1e-8):
282
+ # in_channels, out_channels, kernel_size, stride, padding,
283
+ # dilation, groups, bias, padding_mode
284
+ assert dilation == 1 and groups == 1
285
+
286
+ super().__init__()
287
+
288
+ self.eps = eps
289
+ self.kernel_size = kernel_size
290
+ self.in_channels = in_channels
291
+ self.out_channels = out_channels
292
+ self.padding = padding
293
+ self.stride = stride
294
+ self.padding_mode = padding_mode
295
+ # kernel_size // 2
296
+ # assert self.padding == padding
297
+
298
+ self.weight = nn.Parameter(
299
+ torch.randn(out_channels, in_channels, kernel_size, kernel_size)
300
+ )
301
+
302
+ if bias:
303
+ self.bias = nn.Parameter(torch.Tensor(out_channels))
304
+ else:
305
+ # noinspection PyTypeChecker
306
+ self.register_parameter('bias', None)
307
+
308
+ # self.modulation = LinearBlock(style_dim, in_channels,
309
+ # weight_norm_type=weight_norm_type)
310
+ self.demodulate = demodulate
311
+ self.conditional = True
312
+
313
+ def forward(self, x, style, **_kwargs):
314
+ batch, in_channel, height, width = x.shape
315
+
316
+ # style = self.modulation(style).view(batch, 1, in_channel, 1, 1)
317
+ # We assume the modulation layer is outside this module.
318
+ style = style.view(batch, 1, in_channel, 1, 1)
319
+ weight = self.weight.unsqueeze(0) * style
320
+
321
+ if self.demodulate:
322
+ demod = torch.rsqrt(
323
+ weight.pow(2).sum([2, 3, 4]) + self.eps)
324
+ weight = weight * demod.view(batch, self.out_channels, 1, 1, 1)
325
+
326
+ weight = weight.view(
327
+ batch * self.out_channels,
328
+ in_channel, self.kernel_size, self.kernel_size
329
+ )
330
+ if self.bias is not None:
331
+ bias = self.bias.repeat(batch)
332
+ else:
333
+ bias = self.bias
334
+
335
+ x = x.view(1, batch * in_channel, height, width)
336
+
337
+ if self.padding_mode != 'zeros':
338
+ x = F.pad(x, self._reversed_padding_repeated_twice,
339
+ mode=self.padding_mode)
340
+ padding = (0, 0)
341
+ else:
342
+ padding = self.padding
343
+
344
+ if self.stride == 0.5:
345
+ weight = weight.view(
346
+ batch, self.out_channels, in_channel,
347
+ self.kernel_size, self.kernel_size
348
+ )
349
+ weight = weight.transpose(1, 2).reshape(
350
+ batch * in_channel, self.out_channels,
351
+ self.kernel_size, self.kernel_size
352
+ )
353
+ out = F.conv_transpose2d(
354
+ x, weight, bias, padding=padding, stride=2, groups=batch
355
+ )
356
+
357
+ elif self.stride == 2:
358
+ out = F.conv2d(
359
+ x, weight, bias, padding=padding, stride=2, groups=batch
360
+ )
361
+
362
+ else:
363
+ out = F.conv2d(x, weight, bias, padding=padding, groups=batch)
364
+
365
+ _, _, height, width = out.shape
366
+ out = out.view(batch, self.out_channels, height, width)
367
+
368
+ return out
369
+
370
+ def extra_repr(self):
371
+ s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
372
+ ', stride={stride}')
373
+ if self.bias is None:
374
+ s += ', bias=False'
375
+ if self.padding_mode != 'zeros':
376
+ s += ', padding_mode={padding_mode}'
377
+ return s.format(**self.__dict__)
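# A minimal usage sketch of ModulatedConv2d (illustration only, StyleGAN2-style
# modulated convolution): the shared kernel is scaled per sample by a style
# vector (modulation), optionally rescaled so each output filter has unit norm
# (demodulation), and applied as a grouped conv with groups=batch:
#     w' = w * s,   w'' = w' * rsqrt(sum(w'^2) + eps)
conv = ModulatedConv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1,
                       padding=1, dilation=1, groups=1, bias=True,
                       padding_mode='zeros', demodulate=True)
x = torch.randn(2, 32, 16, 16)
style = torch.randn(2, 32)   # already projected to in_channels (see ModulatedConv2dBlock)
out = conv(x, style)         # -> (2, 64, 16, 16)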
378
+
379
+
380
+ class LinearBlock(_BaseConvBlock):
381
+ r"""A Wrapper class that wraps ``torch.nn.Linear`` with normalization and
382
+ nonlinearity.
383
+
384
+ Args:
385
+ in_features (int): Number of channels in the input tensor.
386
+ out_features (int): Number of channels in the output tensor.
387
+ bias (bool, optional, default=True):
388
+ If ``True``, adds a learnable bias to the output.
389
+ weight_norm_type (str, optional, default='none'):
390
+ Type of weight normalization.
391
+ ``'none'``, ``'spectral'``, ``'weight'``
392
+ or ``'weight_demod'``.
393
+ weight_norm_params (obj, optional, default=None):
394
+ Parameters of weight normalization.
395
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
396
+ keyword arguments when initializing weight normalization.
397
+ activation_norm_type (str, optional, default='none'):
398
+ Type of activation normalization.
399
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
400
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
401
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
402
+ activation_norm_params (obj, optional, default=None):
403
+ Parameters of activation normalization.
404
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
405
+ keyword arguments when initializing activation normalization.
406
+ nonlinearity (str, optional, default='none'):
407
+ Type of nonlinear activation function.
408
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
409
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
410
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
411
+ set ``inplace=True`` when initializing the nonlinearity layer.
412
+ apply_noise (bool, optional, default=False): If ``True``, add
413
+ Gaussian noise with learnable magnitude after the
414
+ fully-connected layer.
415
+ order (str, optional, default='CNA'): Order of operations.
416
+ ``'C'``: fully-connected,
417
+ ``'N'``: normalization,
418
+ ``'A'``: nonlinear activation.
419
+ For example, a block initialized with ``order='CNA'`` will
420
+ do convolution first, then normalization, then nonlinearity.
421
+ """
422
+
423
+ def __init__(self, in_features, out_features, bias=True,
424
+ weight_norm_type='none', weight_norm_params=None,
425
+ activation_norm_type='none', activation_norm_params=None,
426
+ nonlinearity='none', inplace_nonlinearity=False,
427
+ apply_noise=False, order='CNA', clamp=None, blur_kernel=(1, 3, 3, 1), output_scale=None,
428
+ init_gain=1.0, **_kwargs):
429
+ if bool(_kwargs):
430
+ warnings.warn(f"Unused keyword arguments {_kwargs}")
431
+ super().__init__(in_features, out_features, None, None,
432
+ None, None, None, bias,
433
+ None, weight_norm_type, weight_norm_params,
434
+ activation_norm_type, activation_norm_params,
435
+ nonlinearity, inplace_nonlinearity, apply_noise,
436
+ False, order, 0, clamp, blur_kernel, output_scale,
437
+ init_gain)
438
+
439
+
440
+ class EmbeddingBlock(_BaseConvBlock):
441
+ def __init__(self, in_features, out_features, bias=True,
442
+ weight_norm_type='none', weight_norm_params=None,
443
+ activation_norm_type='none', activation_norm_params=None,
444
+ nonlinearity='none', inplace_nonlinearity=False,
445
+ apply_noise=False, order='CNA', clamp=None, output_scale=None,
446
+ init_gain=1.0, **_kwargs):
447
+ if bool(_kwargs):
448
+ warnings.warn(f"Unused keyword arguments {_kwargs}")
449
+ super().__init__(in_features, out_features, None, None,
450
+ None, None, None, bias,
451
+ None, weight_norm_type, weight_norm_params,
452
+ activation_norm_type, activation_norm_params,
453
+ nonlinearity, inplace_nonlinearity, apply_noise,
454
+ False, order, 0, clamp, None, output_scale,
455
+ init_gain)
456
+
457
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
458
+ padding, dilation, groups, bias, padding_mode,
459
+ input_dim):
460
+ assert input_dim == 0
461
+ return nn.Embedding(in_channels, out_channels)
462
+
463
+
464
+ class Embedding2dBlock(_BaseConvBlock):
465
+ def __init__(self, in_features, out_features, bias=True,
466
+ weight_norm_type='none', weight_norm_params=None,
467
+ activation_norm_type='none', activation_norm_params=None,
468
+ nonlinearity='none', inplace_nonlinearity=False,
469
+ apply_noise=False, order='CNA', clamp=None, output_scale=None,
470
+ init_gain=1.0, **_kwargs):
471
+ if bool(_kwargs):
472
+ warnings.warn(f"Unused keyword arguments {_kwargs}")
473
+ super().__init__(in_features, out_features, None, None,
474
+ None, None, None, bias,
475
+ None, weight_norm_type, weight_norm_params,
476
+ activation_norm_type, activation_norm_params,
477
+ nonlinearity, inplace_nonlinearity, apply_noise,
478
+ False, order, 0, clamp, None, output_scale,
479
+ init_gain)
480
+
481
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
482
+ padding, dilation, groups, bias, padding_mode,
483
+ input_dim):
484
+ assert input_dim == 0
485
+ return Embedding2d(in_channels, out_channels)
486
+
487
+
488
+ class Conv1dBlock(_BaseConvBlock):
489
+ r"""A Wrapper class that wraps ``torch.nn.Conv1d`` with normalization and
490
+ nonlinearity.
491
+
492
+ Args:
493
+ in_channels (int): Number of channels in the input tensor.
494
+ out_channels (int): Number of channels in the output tensor.
495
+ kernel_size (int or tuple): Size of the convolving kernel.
496
+ stride (int or float or tuple, optional, default=1):
497
+ Stride of the convolution.
498
+ padding (int or tuple, optional, default=0):
499
+ Zero-padding added to both sides of the input.
500
+ dilation (int or tuple, optional, default=1):
501
+ Spacing between kernel elements.
502
+ groups (int, optional, default=1): Number of blocked connections
503
+ from input channels to output channels.
504
+ bias (bool, optional, default=True):
505
+ If ``True``, adds a learnable bias to the output.
506
+ padding_mode (string, optional, default='zeros'): Type of padding:
507
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
508
+ weight_norm_type (str, optional, default='none'):
509
+ Type of weight normalization.
510
+ ``'none'``, ``'spectral'``, ``'weight'``
511
+ or ``'weight_demod'``.
512
+ weight_norm_params (obj, optional, default=None):
513
+ Parameters of weight normalization.
514
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
515
+ keyword arguments when initializing weight normalization.
516
+ activation_norm_type (str, optional, default='none'):
517
+ Type of activation normalization.
518
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
519
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
520
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
521
+ activation_norm_params (obj, optional, default=None):
522
+ Parameters of activation normalization.
523
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
524
+ keyword arguments when initializing activation normalization.
525
+ nonlinearity (str, optional, default='none'):
526
+ Type of nonlinear activation function.
527
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
528
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
529
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
530
+ set ``inplace=True`` when initializing the nonlinearity layer.
531
+ apply_noise (bool, optional, default=False): If ``True``, adds
532
+ Gaussian noise with learnable magnitude to the convolution output.
533
+ order (str, optional, default='CNA'): Order of operations.
534
+ ``'C'``: convolution,
535
+ ``'N'``: normalization,
536
+ ``'A'``: nonlinear activation.
537
+ For example, a block initialized with ``order='CNA'`` will
538
+ do convolution first, then normalization, then nonlinearity.
539
+ """
540
+
541
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
542
+ padding=0, dilation=1, groups=1, bias=True,
543
+ padding_mode='zeros',
544
+ weight_norm_type='none', weight_norm_params=None,
545
+ activation_norm_type='none', activation_norm_params=None,
546
+ nonlinearity='none', inplace_nonlinearity=False,
547
+ apply_noise=False, blur=False, order='CNA', clamp=None, output_scale=None, init_gain=1.0, **_kwargs):
548
+ super().__init__(in_channels, out_channels, kernel_size, stride,
549
+ padding, dilation, groups, bias, padding_mode,
550
+ weight_norm_type, weight_norm_params,
551
+ activation_norm_type, activation_norm_params,
552
+ nonlinearity, inplace_nonlinearity, apply_noise,
553
+ blur, order, 1, clamp, None, output_scale, init_gain)
554
+
555
+
556
+ class Conv2dBlock(_BaseConvBlock):
557
+ r"""A Wrapper class that wraps ``torch.nn.Conv2d`` with normalization and
558
+ nonlinearity.
559
+
560
+ Args:
561
+ in_channels (int): Number of channels in the input tensor.
562
+ out_channels (int): Number of channels in the output tensor.
563
+ kernel_size (int or tuple): Size of the convolving kernel.
564
+ stride (int or float or tuple, optional, default=1):
565
+ Stride of the convolution.
566
+ padding (int or tuple, optional, default=0):
567
+ Zero-padding added to both sides of the input.
568
+ dilation (int or tuple, optional, default=1):
569
+ Spacing between kernel elements.
570
+ groups (int, optional, default=1): Number of blocked connections
571
+ from input channels to output channels.
572
+ bias (bool, optional, default=True):
573
+ If ``True``, adds a learnable bias to the output.
574
+ padding_mode (string, optional, default='zeros'): Type of padding:
575
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
576
+ weight_norm_type (str, optional, default='none'):
577
+ Type of weight normalization.
578
+ ``'none'``, ``'spectral'``, ``'weight'``
579
+ or ``'weight_demod'``.
580
+ weight_norm_params (obj, optional, default=None):
581
+ Parameters of weight normalization.
582
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
583
+ keyword arguments when initializing weight normalization.
584
+ activation_norm_type (str, optional, default='none'):
585
+ Type of activation normalization.
586
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
587
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
588
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
589
+ activation_norm_params (obj, optional, default=None):
590
+ Parameters of activation normalization.
591
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
592
+ keyword arguments when initializing activation normalization.
593
+ nonlinearity (str, optional, default='none'):
594
+ Type of nonlinear activation function.
595
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
596
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
597
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
598
+ set ``inplace=True`` when initializing the nonlinearity layer.
599
+ apply_noise (bool, optional, default=False): If ``True``, adds
600
+ Gaussian noise with learnable magnitude to the convolution output.
601
+ order (str, optional, default='CNA'): Order of operations.
602
+ ``'C'``: convolution,
603
+ ``'N'``: normalization,
604
+ ``'A'``: nonlinear activation.
605
+ For example, a block initialized with ``order='CNA'`` will
606
+ do convolution first, then normalization, then nonlinearity.
607
+ """
608
+
609
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
610
+ padding=0, dilation=1, groups=1, bias=True,
611
+ padding_mode='zeros',
612
+ weight_norm_type='none', weight_norm_params=None,
613
+ activation_norm_type='none', activation_norm_params=None,
614
+ nonlinearity='none', inplace_nonlinearity=False,
615
+ apply_noise=False, blur=False, order='CNA', clamp=None, blur_kernel=(1, 3, 3, 1),
616
+ output_scale=None, init_gain=1.0):
617
+ super().__init__(in_channels, out_channels, kernel_size, stride,
618
+ padding, dilation, groups, bias, padding_mode,
619
+ weight_norm_type, weight_norm_params,
620
+ activation_norm_type, activation_norm_params,
621
+ nonlinearity, inplace_nonlinearity,
622
+ apply_noise, blur, order, 2, clamp, blur_kernel, output_scale, init_gain)
623
+
624
+
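Illustrative usage sketch for the wrapper above (not from the committed file; the import path is the one non_local.py below already uses, and the hyperparameters are arbitrary):

import torch
from imaginaire.layers import Conv2dBlock

# Spectral-norm conv -> instance norm -> LeakyReLU, in the default 'CNA' order.
block = Conv2dBlock(3, 64, kernel_size=3, padding=1,
                    weight_norm_type='spectral',
                    activation_norm_type='instance',
                    nonlinearity='leakyrelu')
x = torch.randn(2, 3, 32, 32)
y = block(x)   # expected shape: (2, 64, 32, 32)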
625
+ class Conv3dBlock(_BaseConvBlock):
626
+ r"""A Wrapper class that wraps ``torch.nn.Conv3d`` with normalization and
627
+ nonlinearity.
628
+
629
+ Args:
630
+ in_channels (int): Number of channels in the input tensor.
631
+ out_channels (int): Number of channels in the output tensor.
632
+ kernel_size (int or tuple): Size of the convolving kernel.
633
+ stride (int or float or tuple, optional, default=1):
634
+ Stride of the convolution.
635
+ padding (int or tuple, optional, default=0):
636
+ Zero-padding added to both sides of the input.
637
+ dilation (int or tuple, optional, default=1):
638
+ Spacing between kernel elements.
639
+ groups (int, optional, default=1): Number of blocked connections
640
+ from input channels to output channels.
641
+ bias (bool, optional, default=True):
642
+ If ``True``, adds a learnable bias to the output.
643
+ padding_mode (string, optional, default='zeros'): Type of padding:
644
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
645
+ weight_norm_type (str, optional, default='none'):
646
+ Type of weight normalization.
647
+ ``'none'``, ``'spectral'``, ``'weight'``
648
+ or ``'weight_demod'``.
649
+ weight_norm_params (obj, optional, default=None):
650
+ Parameters of weight normalization.
651
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
652
+ keyword arguments when initializing weight normalization.
653
+ activation_norm_type (str, optional, default='none'):
654
+ Type of activation normalization.
655
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
656
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
657
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
658
+ activation_norm_params (obj, optional, default=None):
659
+ Parameters of activation normalization.
660
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
661
+ keyword arguments when initializing activation normalization.
662
+ nonlinearity (str, optional, default='none'):
663
+ Type of nonlinear activation function.
664
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
665
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
666
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
667
+ set ``inplace=True`` when initializing the nonlinearity layer.
668
+ apply_noise (bool, optional, default=False): If ``True``, adds
669
+ Gaussian noise with learnable magnitude to the convolution output.
670
+ order (str, optional, default='CNA'): Order of operations.
671
+ ``'C'``: convolution,
672
+ ``'N'``: normalization,
673
+ ``'A'``: nonlinear activation.
674
+ For example, a block initialized with ``order='CNA'`` will
675
+ do convolution first, then normalization, then nonlinearity.
676
+ """
677
+
678
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
679
+ padding=0, dilation=1, groups=1, bias=True,
680
+ padding_mode='zeros',
681
+ weight_norm_type='none', weight_norm_params=None,
682
+ activation_norm_type='none', activation_norm_params=None,
683
+ nonlinearity='none', inplace_nonlinearity=False,
684
+ apply_noise=False, blur=False, order='CNA', clamp=None, blur_kernel=(1, 3, 3, 1), output_scale=None,
685
+ init_gain=1.0):
686
+ super().__init__(in_channels, out_channels, kernel_size, stride,
687
+ padding, dilation, groups, bias, padding_mode,
688
+ weight_norm_type, weight_norm_params,
689
+ activation_norm_type, activation_norm_params,
690
+ nonlinearity, inplace_nonlinearity,
691
+ apply_noise, blur, order, 3, clamp, blur_kernel, output_scale, init_gain)
692
+
693
+
694
+ class _BaseHyperConvBlock(_BaseConvBlock):
695
+ r"""An abstract wrapper class that wraps a hyper convolutional layer
696
+ with normalization and nonlinearity.
697
+ """
698
+
699
+ def __init__(self, in_channels, out_channels, kernel_size, stride,
700
+ padding, dilation, groups, bias,
701
+ padding_mode,
702
+ weight_norm_type, weight_norm_params,
703
+ activation_norm_type, activation_norm_params,
704
+ nonlinearity, inplace_nonlinearity, apply_noise, blur,
705
+ is_hyper_conv, is_hyper_norm, order, input_dim, clamp=None, blur_kernel=(1, 3, 3, 1),
706
+ output_scale=None, init_gain=1.0):
707
+ self.is_hyper_conv = is_hyper_conv
708
+ if is_hyper_conv:
709
+ weight_norm_type = 'none'
710
+ if is_hyper_norm:
711
+ activation_norm_type = 'hyper_' + activation_norm_type
712
+ super().__init__(in_channels, out_channels, kernel_size, stride,
713
+ padding, dilation, groups, bias, padding_mode,
714
+ weight_norm_type, weight_norm_params,
715
+ activation_norm_type, activation_norm_params,
716
+ nonlinearity, inplace_nonlinearity, apply_noise, blur,
717
+ order, input_dim, clamp, blur_kernel, output_scale, init_gain)
718
+
719
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
720
+ padding, dilation, groups, bias, padding_mode,
721
+ input_dim):
722
+ if input_dim == 0:
723
+ raise ValueError('HyperLinearBlock is not supported.')
724
+ else:
725
+ name = 'HyperConv' if self.is_hyper_conv else 'nn.Conv'
726
+ layer_type = eval(name + '%dd' % input_dim)
727
+ layer = layer_type(
728
+ in_channels, out_channels, kernel_size, stride, padding,
729
+ dilation, groups, bias, padding_mode)
730
+ return layer
731
+
732
+
733
+ class HyperConv2dBlock(_BaseHyperConvBlock):
734
+ r"""A Wrapper class that wraps ``HyperConv2d`` with normalization and
735
+ nonlinearity.
736
+
737
+ Args:
738
+ in_channels (int): Number of channels in the input tensor.
739
+ out_channels (int): Number of channels in the output tensor.
740
+ kernel_size (int or tuple): Size of the convolving kernel.
741
+ stride (int or float or tuple, optional, default=1):
742
+ Stride of the convolution.
743
+ padding (int or tuple, optional, default=0):
744
+ Zero-padding added to both sides of the input.
745
+ dilation (int or tuple, optional, default=1):
746
+ Spacing between kernel elements.
747
+ groups (int, optional, default=1): Number of blocked connections
748
+ from input channels to output channels.
749
+ bias (bool, optional, default=True):
750
+ If ``True``, adds a learnable bias to the output.
751
+ padding_mode (string, optional, default='zeros'): Type of padding:
752
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
753
+ weight_norm_type (str, optional, default='none'):
754
+ Type of weight normalization.
755
+ ``'none'``, ``'spectral'``, ``'weight'``
756
+ or ``'weight_demod'``.
757
+ weight_norm_params (obj, optional, default=None):
758
+ Parameters of weight normalization.
759
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
760
+ keyword arguments when initializing weight normalization.
761
+ activation_norm_type (str, optional, default='none'):
762
+ Type of activation normalization.
763
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
764
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
765
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
766
+ activation_norm_params (obj, optional, default=None):
767
+ Parameters of activation normalization.
768
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
769
+ keyword arguments when initializing activation normalization.
770
+ is_hyper_conv (bool, optional, default=False): If ``True``, use
771
+ ``HyperConv2d``, otherwise use ``torch.nn.Conv2d``.
772
+ is_hyper_norm (bool, optional, default=False): If ``True``, use
773
+ hyper normalizations.
774
+ nonlinearity (str, optional, default='none'):
775
+ Type of nonlinear activation function.
776
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
777
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
778
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
779
+ set ``inplace=True`` when initializing the nonlinearity layer.
780
+ apply_noise (bool, optional, default=False): If ``True``, adds
781
+ Gaussian noise with learnable magnitude to the convolution output.
782
+ order (str, optional, default='CNA'): Order of operations.
783
+ ``'C'``: convolution,
784
+ ``'N'``: normalization,
785
+ ``'A'``: nonlinear activation.
786
+ For example, a block initialized with ``order='CNA'`` will
787
+ do convolution first, then normalization, then nonlinearity.
788
+ """
789
+
790
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
791
+ padding=0, dilation=1, groups=1, bias=True,
792
+ padding_mode='zeros',
793
+ weight_norm_type='none', weight_norm_params=None,
794
+ activation_norm_type='none', activation_norm_params=None,
795
+ is_hyper_conv=False, is_hyper_norm=False,
796
+ nonlinearity='none', inplace_nonlinearity=False,
797
+ apply_noise=False, blur=False, order='CNA', clamp=None):
798
+ super().__init__(in_channels, out_channels, kernel_size, stride,
799
+ padding, dilation, groups, bias, padding_mode,
800
+ weight_norm_type, weight_norm_params,
801
+ activation_norm_type, activation_norm_params,
802
+ nonlinearity, inplace_nonlinearity, apply_noise, blur,
803
+ is_hyper_conv, is_hyper_norm, order, 2, clamp)
804
+
805
+
806
+ class HyperConv2d(nn.Module):
807
+ r"""Hyper Conv2d initialization.
808
+
809
+ Args:
810
+ in_channels (int): Dummy parameter.
811
+ out_channels (int): Dummy parameter.
812
+ kernel_size (int or tuple): Dummy parameter.
813
+ stride (int or float or tuple, optional, default=1):
814
+ Stride of the convolution. Default: 1
815
+ padding (int or tuple, optional, default=0):
816
+ Zero-padding added to both sides of the input.
817
+ padding_mode (string, optional, default='zeros'):
818
+ ``'zeros'``, ``'reflect'``, ``'replicate'``
819
+ or ``'circular'``.
820
+ dilation (int or tuple, optional, default=1):
821
+ Spacing between kernel elements.
822
+ groups (int, optional, default=1): Number of blocked connections
823
+ from input channels to output channels.
824
+ bias (bool, optional, default=True): If ``True``,
825
+ adds a learnable bias to the output.
826
+ """
827
+
828
+ def __init__(self, in_channels=0, out_channels=0, kernel_size=3,
829
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
830
+ padding_mode='zeros'):
831
+ super().__init__()
832
+ self.stride = stride
833
+ self.padding = padding
834
+ self.dilation = dilation
835
+ self.groups = groups
836
+ self.use_bias = bias
837
+ self.padding_mode = padding_mode
838
+ self.conditional = True
839
+
840
+ def forward(self, x, *args, conv_weights=(None, None), **kwargs):
841
+ r"""Hyper Conv2d forward. Convolve x using the provided weight and bias.
842
+
843
+ Args:
844
+ x (N x C x H x W tensor): Input tensor.
845
+ conv_weights (N x C2 x C1 x k x k tensor or list of tensors):
846
+ Convolution weights or [weight, bias].
847
+ Returns:
848
+ y (N x C2 x H x W tensor): Output tensor.
849
+ """
850
+ if conv_weights is None:
851
+ conv_weight, conv_bias = None, None
852
+ elif isinstance(conv_weights, torch.Tensor):
853
+ conv_weight, conv_bias = conv_weights, None
854
+ else:
855
+ conv_weight, conv_bias = conv_weights
856
+
857
+ if conv_weight is None:
858
+ return x
859
+ if conv_bias is None:
860
+ if self.use_bias:
861
+ raise ValueError('bias not provided but set to true during '
862
+ 'initialization')
863
+ conv_bias = [None] * x.size(0)
864
+ if self.padding_mode != 'zeros':
865
+ x = F.pad(x, [self.padding] * 4, mode=self.padding_mode)
866
+ padding = 0
867
+ else:
868
+ padding = self.padding
869
+
870
+ y = None
871
+ # noinspection PyArgumentList
872
+ for i in range(x.size(0)):
873
+ if self.stride >= 1:
874
+ yi = F.conv2d(x[i: i + 1],
875
+ weight=conv_weight[i], bias=conv_bias[i],
876
+ stride=self.stride, padding=padding,
877
+ dilation=self.dilation, groups=self.groups)
878
+ else:
879
+ yi = F.conv_transpose2d(x[i: i + 1], weight=conv_weight[i],
880
+ bias=conv_bias[i], padding=self.padding,
881
+ stride=int(1 / self.stride),
882
+ dilation=self.dilation,
883
+ output_padding=self.padding,
884
+ groups=self.groups)
885
+ y = torch.cat([y, yi]) if y is not None else yi
886
+ return y
887
+
888
+
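Illustrative sketch of the per-sample convolution above (not from the committed file; shapes follow the docstring, N x C2 x C1 x k x k, and the weights would normally be predicted by a hypernetwork):

import torch
from imaginaire.layers.conv import HyperConv2d

layer = HyperConv2d(kernel_size=3, stride=1, padding=1, bias=False)
x = torch.randn(4, 8, 16, 16)            # N x C1 x H x W
weights = torch.randn(4, 32, 8, 3, 3)    # N x C2 x C1 x k x k, one kernel per sample
y = layer(x, conv_weights=weights)       # N x C2 x H x W -> (4, 32, 16, 16)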
889
+ class _BasePartialConvBlock(_BaseConvBlock):
890
+ r"""An abstract wrapper class that wraps a partial convolutional layer
891
+ with normalization and nonlinearity.
892
+ """
893
+
894
+ def __init__(self, in_channels, out_channels, kernel_size, stride,
895
+ padding, dilation, groups, bias, padding_mode,
896
+ weight_norm_type, weight_norm_params,
897
+ activation_norm_type, activation_norm_params,
898
+ nonlinearity, inplace_nonlinearity,
899
+ multi_channel, return_mask,
900
+ apply_noise, order, input_dim, clamp=None, blur_kernel=(1, 3, 3, 1), output_scale=None, init_gain=1.0):
901
+ self.multi_channel = multi_channel
902
+ self.return_mask = return_mask
903
+ self.partial_conv = True
904
+ super().__init__(in_channels, out_channels, kernel_size, stride,
905
+ padding, dilation, groups, bias, padding_mode,
906
+ weight_norm_type, weight_norm_params,
907
+ activation_norm_type, activation_norm_params,
908
+ nonlinearity, inplace_nonlinearity, apply_noise,
909
+ False, order, input_dim, clamp, blur_kernel, output_scale, init_gain)
910
+
911
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
912
+ padding, dilation, groups, bias, padding_mode,
913
+ input_dim):
914
+ if input_dim == 2:
915
+ layer_type = PartialConv2d
916
+ elif input_dim == 3:
917
+ layer_type = PartialConv3d
918
+ else:
919
+ raise ValueError('Partial conv only supports 2D and 3D conv now.')
920
+ layer = layer_type(
921
+ in_channels, out_channels, kernel_size, stride, padding,
922
+ dilation, groups, bias, padding_mode,
923
+ multi_channel=self.multi_channel, return_mask=self.return_mask)
924
+ return layer
925
+
926
+ def forward(self, x, *cond_inputs, mask_in=None, **kw_cond_inputs):
927
+ r"""
928
+
929
+ Args:
930
+ x (tensor): Input tensor.
931
+ cond_inputs (list of tensors) : Conditional input tensors.
932
+ mask_in (tensor, optional, default=``None``) If not ``None``,
933
+ it masks the valid input region.
934
+ kw_cond_inputs (dict) : Keyword conditional inputs.
935
+ Returns:
936
+ (tuple):
937
+ - x (tensor): Output tensor.
938
+ - mask_out (tensor, optional): Masks the valid output region.
939
+ """
940
+ mask_out = None
941
+ for layer in self.layers.values():
942
+ if getattr(layer, 'conditional', False):
943
+ x = layer(x, *cond_inputs, **kw_cond_inputs)
944
+ elif getattr(layer, 'partial_conv', False):
945
+ x = layer(x, mask_in=mask_in, **kw_cond_inputs)
946
+ if type(x) == tuple:
947
+ x, mask_out = x
948
+ else:
949
+ x = layer(x)
950
+
951
+ if mask_out is not None:
952
+ return x, mask_out
953
+ return x
954
+
955
+
956
+ class PartialConv2dBlock(_BasePartialConvBlock):
957
+ r"""A Wrapper class that wraps ``PartialConv2d`` with normalization and
958
+ nonlinearity.
959
+
960
+ Args:
961
+ in_channels (int): Number of channels in the input tensor.
962
+ out_channels (int): Number of channels in the output tensor.
963
+ kernel_size (int or tuple): Size of the convolving kernel.
964
+ stride (int or float or tuple, optional, default=1):
965
+ Stride of the convolution.
966
+ padding (int or tuple, optional, default=0):
967
+ Zero-padding added to both sides of the input.
968
+ dilation (int or tuple, optional, default=1):
969
+ Spacing between kernel elements.
970
+ groups (int, optional, default=1): Number of blocked connections
971
+ from input channels to output channels.
972
+ bias (bool, optional, default=True):
973
+ If ``True``, adds a learnable bias to the output.
974
+ padding_mode (string, optional, default='zeros'): Type of padding:
975
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
976
+ weight_norm_type (str, optional, default='none'):
977
+ Type of weight normalization.
978
+ ``'none'``, ``'spectral'``, ``'weight'``
979
+ or ``'weight_demod'``.
980
+ weight_norm_params (obj, optional, default=None):
981
+ Parameters of weight normalization.
982
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
983
+ keyword arguments when initializing weight normalization.
984
+ activation_norm_type (str, optional, default='none'):
985
+ Type of activation normalization.
986
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
987
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
988
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
989
+ activation_norm_params (obj, optional, default=None):
990
+ Parameters of activation normalization.
991
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
992
+ keyword arguments when initializing activation normalization.
993
+ nonlinearity (str, optional, default='none'):
994
+ Type of nonlinear activation function.
995
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
996
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
997
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
998
+ set ``inplace=True`` when initializing the nonlinearity layer.
999
+ apply_noise (bool, optional, default=False): If ``True``, adds
1000
+ Gaussian noise with learnable magnitude to the convolution output.
1001
+ order (str, optional, default='CNA'): Order of operations.
1002
+ ``'C'``: convolution,
1003
+ ``'N'``: normalization,
1004
+ ``'A'``: nonlinear activation.
1005
+ For example, a block initialized with ``order='CNA'`` will
1006
+ do convolution first, then normalization, then nonlinearity.
1007
+ multi_channel (bool, optional, default=False): If ``True``, use
1008
+ different masks for different channels.
1009
+ return_mask (bool, optional, default=True): If ``True``, the
1010
+ forward call also returns a new mask.
1011
+ """
1012
+
1013
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
1014
+ padding=0, dilation=1, groups=1, bias=True,
1015
+ padding_mode='zeros',
1016
+ weight_norm_type='none', weight_norm_params=None,
1017
+ activation_norm_type='none', activation_norm_params=None,
1018
+ nonlinearity='none', inplace_nonlinearity=False,
1019
+ multi_channel=False, return_mask=True,
1020
+ apply_noise=False, order='CNA', clamp=None):
1021
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1022
+ padding, dilation, groups, bias, padding_mode,
1023
+ weight_norm_type, weight_norm_params,
1024
+ activation_norm_type, activation_norm_params,
1025
+ nonlinearity, inplace_nonlinearity,
1026
+ multi_channel, return_mask, apply_noise, order, 2,
1027
+ clamp)
1028
+
1029
+
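Illustrative sketch of the mask-aware forward pass defined by _BasePartialConvBlock above (not from the committed file; sizes are arbitrary):

import torch
from imaginaire.layers.conv import PartialConv2dBlock

block = PartialConv2dBlock(3, 16, kernel_size=3, padding=1, nonlinearity='relu')
x = torch.randn(1, 3, 64, 64)
mask = torch.ones(1, 1, 64, 64)
mask[:, :, 16:48, 16:48] = 0             # a square hole of invalid pixels
y, new_mask = block(x, mask_in=mask)     # y: (1, 16, 64, 64)
# new_mask marks every location whose 3x3 window saw at least one valid pixel,
# so the hole shrinks by one pixel per side after this block.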
1030
+ class PartialConv3dBlock(_BasePartialConvBlock):
1031
+ r"""A Wrapper class that wraps ``PartialConv3d`` with normalization and
1032
+ nonlinearity.
1033
+
1034
+ Args:
1035
+ in_channels (int): Number of channels in the input tensor.
1036
+ out_channels (int): Number of channels in the output tensor.
1037
+ kernel_size (int or tuple): Size of the convolving kernel.
1038
+ stride (int or float or tuple, optional, default=1):
1039
+ Stride of the convolution.
1040
+ padding (int or tuple, optional, default=0):
1041
+ Zero-padding added to both sides of the input.
1042
+ dilation (int or tuple, optional, default=1):
1043
+ Spacing between kernel elements.
1044
+ groups (int, optional, default=1): Number of blocked connections
1045
+ from input channels to output channels.
1046
+ bias (bool, optional, default=True):
1047
+ If ``True``, adds a learnable bias to the output.
1048
+ padding_mode (string, optional, default='zeros'): Type of padding:
1049
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
1050
+ weight_norm_type (str, optional, default='none'):
1051
+ Type of weight normalization.
1052
+ ``'none'``, ``'spectral'``, ``'weight'``
1053
+ or ``'weight_demod'``.
1054
+ weight_norm_params (obj, optional, default=None):
1055
+ Parameters of weight normalization.
1056
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
1057
+ keyword arguments when initializing weight normalization.
1058
+ activation_norm_type (str, optional, default='none'):
1059
+ Type of activation normalization.
1060
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
1061
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
1062
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
1063
+ activation_norm_params (obj, optional, default=None):
1064
+ Parameters of activation normalization.
1065
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
1066
+ keyword arguments when initializing activation normalization.
1067
+ nonlinearity (str, optional, default='none'):
1068
+ Type of nonlinear activation function.
1069
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
1070
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
1071
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
1072
+ set ``inplace=True`` when initializing the nonlinearity layer.
1073
+ apply_noise (bool, optional, default=False): If ``True``, adds
1074
+ Gaussian noise with learnable magnitude to the convolution output.
1075
+ order (str, optional, default='CNA'): Order of operations.
1076
+ ``'C'``: convolution,
1077
+ ``'N'``: normalization,
1078
+ ``'A'``: nonlinear activation.
1079
+ For example, a block initialized with ``order='CNA'`` will
1080
+ do convolution first, then normalization, then nonlinearity.
1081
+ multi_channel (bool, optional, default=False): If ``True``, use
1082
+ different masks for different channels.
1083
+ return_mask (bool, optional, default=True): If ``True``, the
1084
+ forward call also returns a new mask.
1085
+ """
1086
+
1087
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
1088
+ padding=0, dilation=1, groups=1, bias=True,
1089
+ padding_mode='zeros',
1090
+ weight_norm_type='none', weight_norm_params=None,
1091
+ activation_norm_type='none', activation_norm_params=None,
1092
+ nonlinearity='none', inplace_nonlinearity=False,
1093
+ multi_channel=False, return_mask=True,
1094
+ apply_noise=False, order='CNA', clamp=None):
1095
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1096
+ padding, dilation, groups, bias, padding_mode,
1097
+ weight_norm_type, weight_norm_params,
1098
+ activation_norm_type, activation_norm_params,
1099
+ nonlinearity, inplace_nonlinearity,
1100
+ multi_channel, return_mask, apply_noise, order, 3,
1101
+ clamp)
1102
+
1103
+
1104
+ class _MultiOutBaseConvBlock(_BaseConvBlock):
1105
+ r"""An abstract wrapper class that wraps a hyper convolutional layer with
1106
+ normalization and nonlinearity. It can return multiple outputs, if some
1107
+ layers in the block return more than one output.
1108
+ """
1109
+
1110
+ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode,
1111
+ weight_norm_type, weight_norm_params, activation_norm_type, activation_norm_params, nonlinearity,
1112
+ inplace_nonlinearity, apply_noise, blur, order, input_dim, clamp=None, blur_kernel=(1, 3, 3, 1),
1113
+ output_scale=None, init_gain=1.0):
1114
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1115
+ padding, dilation, groups, bias, padding_mode,
1116
+ weight_norm_type, weight_norm_params,
1117
+ activation_norm_type, activation_norm_params,
1118
+ nonlinearity, inplace_nonlinearity,
1119
+ apply_noise, blur, order, input_dim, clamp, blur_kernel, output_scale, init_gain)
1120
+ self.multiple_outputs = True
1121
+
1122
+ def forward(self, x, *cond_inputs, **kw_cond_inputs):
1123
+ r"""
1124
+
1125
+ Args:
1126
+ x (tensor): Input tensor.
1127
+ cond_inputs (list of tensors) : Conditional input tensors.
1128
+ kw_cond_inputs (dict) : Keyword conditional inputs.
1129
+ Returns:
1130
+ (tuple):
1131
+ - x (tensor): Main output tensor.
1132
+ - other_outputs (list of tensors): Other output tensors.
1133
+ """
1134
+ other_outputs = []
1135
+ for layer in self.layers.values():
1136
+ if getattr(layer, 'conditional', False):
1137
+ x = layer(x, *cond_inputs, **kw_cond_inputs)
1138
+ if getattr(layer, 'multiple_outputs', False):
1139
+ x, other_output = layer(x)
1140
+ other_outputs.append(other_output)
1141
+ else:
1142
+ x = layer(x)
1143
+ return (x, *other_outputs)
1144
+
1145
+
1146
+ class MultiOutConv2dBlock(_MultiOutBaseConvBlock):
1147
+ r"""A Wrapper class that wraps ``torch.nn.Conv2d`` with normalization and
1148
+ nonlinearity. It can return multiple outputs, if some layers in the block
1149
+ return more than one output.
1150
+
1151
+ Args:
1152
+ in_channels (int): Number of channels in the input tensor.
1153
+ out_channels (int): Number of channels in the output tensor.
1154
+ kernel_size (int or tuple): Size of the convolving kernel.
1155
+ stride (int or float or tuple, optional, default=1):
1156
+ Stride of the convolution.
1157
+ padding (int or tuple, optional, default=0):
1158
+ Zero-padding added to both sides of the input.
1159
+ dilation (int or tuple, optional, default=1):
1160
+ Spacing between kernel elements.
1161
+ groups (int, optional, default=1): Number of blocked connections
1162
+ from input channels to output channels.
1163
+ bias (bool, optional, default=True):
1164
+ If ``True``, adds a learnable bias to the output.
1165
+ padding_mode (string, optional, default='zeros'): Type of padding:
1166
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
1167
+ weight_norm_type (str, optional, default='none'):
1168
+ Type of weight normalization.
1169
+ ``'none'``, ``'spectral'``, ``'weight'``
1170
+ or ``'weight_demod'``.
1171
+ weight_norm_params (obj, optional, default=None):
1172
+ Parameters of weight normalization.
1173
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
1174
+ keyword arguments when initializing weight normalization.
1175
+ activation_norm_type (str, optional, default='none'):
1176
+ Type of activation normalization.
1177
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
1178
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
1179
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
1180
+ activation_norm_params (obj, optional, default=None):
1181
+ Parameters of activation normalization.
1182
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
1183
+ keyword arguments when initializing activation normalization.
1184
+ nonlinearity (str, optional, default='none'):
1185
+ Type of nonlinear activation function.
1186
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
1187
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
1188
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
1189
+ set ``inplace=True`` when initializing the nonlinearity layer.
1190
+ apply_noise (bool, optional, default=False): If ``True``, adds
1191
+ Gaussian noise with learnable magnitude to the convolution output.
1192
+ order (str, optional, default='CNA'): Order of operations.
1193
+ ``'C'``: convolution,
1194
+ ``'N'``: normalization,
1195
+ ``'A'``: nonlinear activation.
1196
+ For example, a block initialized with ``order='CNA'`` will
1197
+ do convolution first, then normalization, then nonlinearity.
1198
+ """
1199
+
1200
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
1201
+ padding=0, dilation=1, groups=1, bias=True,
1202
+ padding_mode='zeros',
1203
+ weight_norm_type='none', weight_norm_params=None,
1204
+ activation_norm_type='none', activation_norm_params=None,
1205
+ nonlinearity='none', inplace_nonlinearity=False,
1206
+ apply_noise=False, blur=False, order='CNA', clamp=None):
1207
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1208
+ padding, dilation, groups, bias, padding_mode,
1209
+ weight_norm_type, weight_norm_params,
1210
+ activation_norm_type, activation_norm_params,
1211
+ nonlinearity, inplace_nonlinearity,
1212
+ apply_noise, blur, order, 2, clamp)
1213
+
1214
+
1215
+ ###############################################################################
1216
+ # BSD 3-Clause License
1217
+ #
1218
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
1219
+ #
1220
+ # Author & Contact: Guilin Liu (guilinl@nvidia.com)
1221
+ ###############################################################################
1222
+ class PartialConv2d(nn.Conv2d):
1223
+ r"""Partial 2D convolution in
1224
+ "Image inpainting for irregular holes using partial convolutions."
1225
+ Liu et al., ECCV 2018
1226
+ """
1227
+
1228
+ def __init__(self, *args, multi_channel=False, return_mask=True, **kwargs):
1229
+ # whether the mask is multi-channel or not
1230
+ self.multi_channel = multi_channel
1231
+ self.return_mask = return_mask
1232
+ super(PartialConv2d, self).__init__(*args, **kwargs)
1233
+
1234
+ if self.multi_channel:
1235
+ self.weight_maskUpdater = torch.ones(self.out_channels,
1236
+ self.in_channels,
1237
+ self.kernel_size[0],
1238
+ self.kernel_size[1])
1239
+ else:
1240
+ self.weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0],
1241
+ self.kernel_size[1])
1242
+
1243
+ shape = self.weight_maskUpdater.shape
1244
+ self.slide_winsize = shape[1] * shape[2] * shape[3]
1245
+
1246
+ self.last_size = (None, None, None, None)
1247
+ self.update_mask = None
1248
+ self.mask_ratio = None
1249
+ self.partial_conv = True
1250
+
1251
+ def forward(self, x, mask_in=None):
1252
+ r"""
1253
+
1254
+ Args:
1255
+ x (tensor): Input tensor.
1256
+ mask_in (tensor, optional, default=``None``) If not ``None``,
1257
+ it masks the valid input region.
1258
+ """
1259
+ assert len(x.shape) == 4
1260
+ if mask_in is not None or self.last_size != tuple(x.shape):
1261
+ self.last_size = tuple(x.shape)
1262
+
1263
+ with torch.no_grad():
1264
+ if self.weight_maskUpdater.type() != x.type():
1265
+ self.weight_maskUpdater = self.weight_maskUpdater.to(x)
1266
+
1267
+ if mask_in is None:
1268
+ # If mask is not provided, create a mask.
1269
+ if self.multi_channel:
1270
+ mask = torch.ones(x.data.shape[0],
1271
+ x.data.shape[1],
1272
+ x.data.shape[2],
1273
+ x.data.shape[3]).to(x)
1274
+ else:
1275
+ mask = torch.ones(1, 1, x.data.shape[2],
1276
+ x.data.shape[3]).to(x)
1277
+ else:
1278
+ mask = mask_in
1279
+
1280
+ self.update_mask = F.conv2d(mask, self.weight_maskUpdater,
1281
+ bias=None, stride=self.stride,
1282
+ padding=self.padding,
1283
+ dilation=self.dilation, groups=1)
1284
+
1285
+ # For mixed-precision training, eps is increased from 1e-8 to 1e-6.
1286
+ eps = 1e-6
1287
+ self.mask_ratio = self.slide_winsize / (self.update_mask + eps)
1288
+ self.update_mask = torch.clamp(self.update_mask, 0, 1)
1289
+ self.mask_ratio = torch.mul(self.mask_ratio, self.update_mask)
1290
+
1291
+ raw_out = super(PartialConv2d, self).forward(
1292
+ torch.mul(x, mask) if mask_in is not None else x)
1293
+
1294
+ if self.bias is not None:
1295
+ bias_view = self.bias.view(1, self.out_channels, 1, 1)
1296
+ output = torch.mul(raw_out - bias_view, self.mask_ratio) + bias_view
1297
+ output = torch.mul(output, self.update_mask)
1298
+ else:
1299
+ output = torch.mul(raw_out, self.mask_ratio)
1300
+
1301
+ if self.return_mask:
1302
+ return output, self.update_mask
1303
+ else:
1304
+ return output
1305
+
1306
+
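A tiny illustrative sketch of the renormalization performed above (not from the committed file): each output is rescaled by slide_winsize divided by the number of valid pixels under the kernel window.

import torch
from imaginaire.layers.conv import PartialConv2d

conv = PartialConv2d(1, 1, kernel_size=3, padding=1, bias=False)
x = torch.ones(1, 1, 5, 5)
mask = torch.ones(1, 1, 5, 5)
mask[..., 2, 2] = 0                      # one invalid pixel
y, updated = conv(x, mask_in=mask)       # windows covering the hole are rescaled by ~9/8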
1307
+ class PartialConv3d(nn.Conv3d):
1308
+ r"""Partial 3D convolution in
1309
+ "Image inpainting for irregular holes using partial convolutions."
1310
+ Liu et al., ECCV 2018
1311
+ """
1312
+
1313
+ def __init__(self, *args, multi_channel=False, return_mask=True, **kwargs):
1314
+ # whether the mask is multi-channel or not
1315
+ self.multi_channel = multi_channel
1316
+ self.return_mask = return_mask
1317
+ super(PartialConv3d, self).__init__(*args, **kwargs)
1318
+
1319
+ if self.multi_channel:
1320
+ self.weight_maskUpdater = \
1321
+ torch.ones(self.out_channels, self.in_channels,
1322
+ self.kernel_size[0], self.kernel_size[1],
1323
+ self.kernel_size[2])
1324
+ else:
1325
+ self.weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0],
1326
+ self.kernel_size[1],
1327
+ self.kernel_size[2])
1328
+ self.weight_maskUpdater = self.weight_maskUpdater.to('cuda')
1329
+
1330
+ shape = self.weight_maskUpdater.shape
1331
+ self.slide_winsize = shape[1] * shape[2] * shape[3] * shape[4]
1332
+ self.partial_conv = True
1333
+
1334
+ def forward(self, x, mask_in=None):
1335
+ r"""
1336
+
1337
+ Args:
1338
+ x (tensor): Input tensor.
1339
+ mask_in (tensor, optional, default=``None``) If not ``None``, it
1340
+ masks the valid input region.
1341
+ """
1342
+ assert len(x.shape) == 5
1343
+
1344
+ with torch.no_grad():
1345
+ mask = mask_in
1346
+ update_mask = F.conv3d(mask, self.weight_maskUpdater, bias=None,
1347
+ stride=self.stride, padding=self.padding,
1348
+ dilation=self.dilation, groups=1)
1349
+
1350
+ mask_ratio = self.slide_winsize / (update_mask + 1e-8)
1351
+ update_mask = torch.clamp(update_mask, 0, 1)
1352
+ mask_ratio = torch.mul(mask_ratio, update_mask)
1353
+
1354
+ raw_out = super(PartialConv3d, self).forward(torch.mul(x, mask_in))
1355
+
1356
+ if self.bias is not None:
1357
+ bias_view = self.bias.view(1, self.out_channels, 1, 1, 1)
1358
+ output = torch.mul(raw_out - bias_view, mask_ratio) + bias_view
1359
+ if mask_in is not None:
1360
+ output = torch.mul(output, update_mask)
1361
+ else:
1362
+ output = torch.mul(raw_out, mask_ratio)
1363
+
1364
+ if self.return_mask:
1365
+ return output, update_mask
1366
+ else:
1367
+ return output
1368
+
1369
+
1370
+ class Embedding2d(nn.Embedding):
1371
+ def __init__(self, in_channels, out_channels):
1372
+ super().__init__(in_channels, out_channels)
1373
+
1374
+ def forward(self, x):
1375
+ return F.embedding(
1376
+ x.squeeze(1).long(), self.weight, self.padding_idx, self.max_norm,
1377
+ self.norm_type, self.scale_grad_by_freq, self.sparse).permute(0, 3, 1, 2).contiguous()
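Embedding2d above turns an (N, 1, H, W) integer label map into per-pixel embedding channels. An illustrative sketch (not from the committed file):

import torch
from imaginaire.layers.conv import Embedding2d

embed = Embedding2d(in_channels=10, out_channels=4)   # 10 classes, 4-dim embedding
labels = torch.randint(0, 10, (2, 1, 8, 8))           # N x 1 x H x W label map
feat = embed(labels)                                   # N x 4 x H x W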
imaginaire/layers/misc.py ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch
6
+ from torch import nn
7
+
8
+
9
+ class ApplyNoise(nn.Module):
10
+ r"""Add Gaussian noise to the input tensor."""
11
+
12
+ def __init__(self):
13
+ super().__init__()
14
+ # scale of the noise
15
+ self.scale = nn.Parameter(torch.zeros(1))
16
+ self.conditional = True
17
+
18
+ def forward(self, x, *_args, noise=None, **_kwargs):
19
+ r"""
20
+
21
+ Args:
22
+ x (tensor): Input tensor.
23
+ noise (tensor, optional, default=``None``) : Noise tensor to be
24
+ added to the input.
25
+ """
26
+ if noise is None:
27
+ sz = x.size()
28
+ noise = x.new_empty(sz[0], 1, *sz[2:]).normal_()
29
+
30
+ return x + self.scale * noise
31
+
32
+
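An illustrative sketch of the layer above (not from the committed file): it is an exact identity at initialization because the noise scale starts at zero.

import torch
from imaginaire.layers.misc import ApplyNoise

noise_layer = ApplyNoise()
x = torch.randn(2, 16, 8, 8)
assert torch.equal(noise_layer(x), x)    # scale == 0, so no noise is added yet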
33
+ class PartialSequential(nn.Sequential):
34
+ r"""Sequential block for partial convolutions."""
35
+ def __init__(self, *modules):
36
+ super(PartialSequential, self).__init__(*modules)
37
+
38
+ def forward(self, x):
39
+ r"""
40
+
41
+ Args:
42
+ x (tensor): Input tensor.
43
+ """
44
+ act = x[:, :-1]
45
+ mask = x[:, -1].unsqueeze(1)
46
+ for module in self:
47
+ act, mask = module(act, mask_in=mask)
48
+ return act
49
+
50
+
51
+ class ConstantInput(nn.Module):
52
+ def __init__(self, channel, size=4):
53
+ super().__init__()
54
+ if isinstance(size, int):
55
+ h, w = size, size
56
+ else:
57
+ h, w = size
58
+ self.input = nn.Parameter(torch.randn(1, channel, h, w))
59
+
60
+ def forward(self):
61
+ return self.input
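An illustrative sketch of ConstantInput above (not from the committed file): a learned, input-independent starting tensor, reminiscent of the constant input used in StyleGAN-style generators; forward takes no arguments.

import torch
from imaginaire.layers.misc import ConstantInput

const = ConstantInput(channel=512, size=4)
start = const()                          # shape: (1, 512, 4, 4)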
imaginaire/layers/non_local.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ from functools import partial
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from imaginaire.layers import Conv2dBlock
11
+
12
+
13
+ class NonLocal2dBlock(nn.Module):
14
+ r"""Self attention Layer
15
+
16
+ Args:
17
+ in_channels (int): Number of channels in the input tensor.
18
+ scale (bool, optional, default=True): If ``True``, scale the
19
+ output by a learnable parameter.
20
+ clamp (bool, optional, default=``False``): If ``True``, clamp the
21
+ scaling parameter to (-1, 1).
22
+ weight_norm_type (str, optional, default='none'):
23
+ Type of weight normalization.
24
+ ``'none'``, ``'spectral'``, ``'weight'``.
25
+ weight_norm_params (obj, optional, default=None):
26
+ Parameters of weight normalization.
27
+ If not ``None``, weight_norm_params.__dict__ will be used as
28
+ keyword arguments when initializing weight normalization.
29
+ bias (bool, optional, default=True): If ``True``, adds bias in the
30
+ convolutional blocks.
31
+ """
32
+
33
+ def __init__(self,
34
+ in_channels,
35
+ scale=True,
36
+ clamp=False,
37
+ weight_norm_type='none',
38
+ weight_norm_params=None,
39
+ bias=True):
40
+ super(NonLocal2dBlock, self).__init__()
41
+ self.clamp = clamp
42
+ self.gamma = nn.Parameter(torch.zeros(1)) if scale else 1.0
43
+ self.in_channels = in_channels
44
+ base_conv2d_block = partial(Conv2dBlock,
45
+ kernel_size=1,
46
+ stride=1,
47
+ padding=0,
48
+ weight_norm_type=weight_norm_type,
49
+ weight_norm_params=weight_norm_params,
50
+ bias=bias)
51
+ self.theta = base_conv2d_block(in_channels, in_channels // 8)
52
+ self.phi = base_conv2d_block(in_channels, in_channels // 8)
53
+ self.g = base_conv2d_block(in_channels, in_channels // 2)
54
+ self.out_conv = base_conv2d_block(in_channels // 2, in_channels)
55
+ self.softmax = nn.Softmax(dim=-1)
56
+ self.max_pool = nn.MaxPool2d(2)
57
+
58
+ def forward(self, x):
59
+ r"""
60
+
61
+ Args:
62
+ x (tensor) : input feature maps (B X C X W X H)
63
+ Returns:
64
+ out (tensor) : self attention value + input feature, with the
+ same shape as the input (the attention map is not returned).
67
+ """
68
+ n, c, h, w = x.size()
69
+ theta = self.theta(x).view(n, -1, h * w).permute(0, 2, 1).contiguous()
70
+
71
+ phi = self.phi(x)
72
+ phi = self.max_pool(phi).view(n, -1, h * w // 4)
73
+
74
+ energy = torch.bmm(theta, phi)
75
+ attention = self.softmax(energy)
76
+
77
+ g = self.g(x)
78
+ g = self.max_pool(g).view(n, -1, h * w // 4)
79
+
80
+ out = torch.bmm(g, attention.permute(0, 2, 1).contiguous())
81
+ out = out.view(n, c // 2, h, w)
82
+ out = self.out_conv(out)
83
+
84
+ if self.clamp:
85
+ out = self.gamma.clamp(-1, 1) * out + x
86
+ else:
87
+ out = self.gamma * out + x
88
+ return out
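An illustrative sketch of the block above (not from the committed file): it preserves the input shape, and because gamma is initialized to zero it starts out as an identity mapping.

import torch
from imaginaire.layers.non_local import NonLocal2dBlock

attn = NonLocal2dBlock(in_channels=64)
x = torch.randn(1, 64, 32, 32)
y = attn(x)        # shape: (1, 64, 32, 32); equals x at initialization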
imaginaire/layers/nonlinearity.py ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+
9
+ from imaginaire.third_party.bias_act.bias_act import FusedNonlinearity
10
+
11
+
12
+ class ScaledLeakyReLU(nn.Module):
13
+ def __init__(self, negative_slope=0.2, scale=2 ** 0.5, inplace=False):
14
+ super().__init__()
15
+
16
+ self.negative_slope = negative_slope
17
+ self.scale = scale
18
+ self.inplace = inplace
19
+
20
+ def forward(self, x):
21
+ return F.leaky_relu(x, self.negative_slope, inplace=self.inplace) * self.scale
22
+ # return _fused_scaled_leakyrelu(x, self.negative_slope, self.inplace, self.scale)
23
+
24
+
25
+ # @torch.jit.script
26
+ # def _fused_scaled_leakyrelu(x: torch.Tensor, negative_slope: float, inplace: bool, scale: float):
27
+ # return F.leaky_relu(x, negative_slope, inplace=inplace) * scale
28
+
29
+
30
+ def get_nonlinearity_layer(nonlinearity_type, inplace, **kwargs):
31
+ r"""Return a nonlinearity layer.
32
+
33
+ Args:
34
+ nonlinearity_type (str):
35
+ Type of nonlinear activation function.
36
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
37
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
38
+ inplace (bool): If ``True``, set ``inplace=True`` when initializing
39
+ the nonlinearity layer.
40
+ """
41
+ if nonlinearity_type.startswith('fused'):
42
+ nonlinearity = FusedNonlinearity(nonlinearity=nonlinearity_type[6:], **kwargs)
43
+ elif nonlinearity_type == 'relu':
44
+ nonlinearity = nn.ReLU(inplace=inplace)
45
+ elif nonlinearity_type == 'leakyrelu':
46
+ nonlinearity = nn.LeakyReLU(0.2, inplace=inplace)
47
+ elif nonlinearity_type == 'scaled_leakyrelu':
48
+ nonlinearity = ScaledLeakyReLU(0.2, inplace=inplace)
49
+ import imaginaire.config
50
+ if imaginaire.config.USE_JIT:
51
+ nonlinearity = torch.jit.script(nonlinearity)
52
+ elif nonlinearity_type == 'prelu':
53
+ nonlinearity = nn.PReLU()
54
+ elif nonlinearity_type == 'tanh':
55
+ nonlinearity = nn.Tanh()
56
+ elif nonlinearity_type == 'sigmoid':
57
+ nonlinearity = nn.Sigmoid()
58
+ elif nonlinearity_type.startswith('softmax'):
59
+ dim = nonlinearity_type.split(',')[1] if ',' in nonlinearity_type else 1
60
+ nonlinearity = nn.Softmax(dim=int(dim))
61
+ elif nonlinearity_type == 'none' or nonlinearity_type == '':
62
+ nonlinearity = None
63
+ else:
64
+ raise ValueError('Nonlinearity %s is not recognized' % nonlinearity_type)
65
+ return nonlinearity
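A short illustrative sketch of the string-to-module mapping above (not from the committed file; it assumes the repo's third_party bias_act op is importable, since this module imports it at the top):

from imaginaire.layers.nonlinearity import get_nonlinearity_layer

act = get_nonlinearity_layer('leakyrelu', inplace=False)    # nn.LeakyReLU(0.2)
soft = get_nonlinearity_layer('softmax,2', inplace=False)   # nn.Softmax(dim=2)
none = get_nonlinearity_layer('none', inplace=False)        # None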
imaginaire/layers/residual.py ADDED
@@ -0,0 +1,1411 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import functools
6
+
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import Upsample as NearestUpsample
10
+ from torch.utils.checkpoint import checkpoint
11
+
12
+ from .conv import (Conv1dBlock, Conv2dBlock, Conv3dBlock, HyperConv2dBlock,
13
+ LinearBlock, MultiOutConv2dBlock, PartialConv2dBlock,
14
+ PartialConv3dBlock, ModulatedConv2dBlock)
15
+ from imaginaire.third_party.upfirdn2d.upfirdn2d import BlurUpsample
16
+
17
+
18
+ class _BaseResBlock(nn.Module):
19
+ r"""An abstract class for residual blocks.
20
+ """
21
+
22
+ def __init__(self, in_channels, out_channels, kernel_size,
23
+ stride, padding, dilation, groups, bias, padding_mode,
24
+ weight_norm_type, weight_norm_params,
25
+ activation_norm_type, activation_norm_params,
26
+ skip_activation_norm, skip_nonlinearity,
27
+ nonlinearity, inplace_nonlinearity, apply_noise,
28
+ hidden_channels_equal_out_channels,
29
+ order, block, learn_shortcut, clamp, output_scale,
30
+ skip_block=None, blur=False, upsample_first=True, skip_weight_norm=True):
31
+ super().__init__()
32
+ self.in_channels = in_channels
33
+ self.out_channels = out_channels
34
+ self.output_scale = output_scale
35
+ self.upsample_first = upsample_first
36
+ self.stride = stride
37
+ self.blur = blur
38
+ if skip_block is None:
39
+ skip_block = block
40
+
41
+ if order == 'pre_act':
42
+ order = 'NACNAC'
43
+ if isinstance(bias, bool):
44
+ # The bias for conv_block_0, conv_block_1, and conv_block_s.
45
+ biases = [bias, bias, bias]
46
+ elif isinstance(bias, list):
47
+ if len(bias) == 3:
48
+ biases = bias
49
+ else:
50
+ raise ValueError('Bias list must have length 3.')
51
+ else:
52
+ raise ValueError('Bias must be either a bool or a list of 3 bools.')
53
+ if learn_shortcut is None:
54
+ self.learn_shortcut = (in_channels != out_channels)
55
+ else:
56
+ self.learn_shortcut = learn_shortcut
57
+ if len(order) > 6 or len(order) < 5:
58
+ raise ValueError('order must be either 5 or 6 characters')
59
+ if hidden_channels_equal_out_channels:
60
+ hidden_channels = out_channels
61
+ else:
62
+ hidden_channels = min(in_channels, out_channels)
63
+
64
+ # Parameters.
65
+ residual_params = {}
66
+ shortcut_params = {}
67
+ base_params = dict(dilation=dilation,
68
+ groups=groups,
69
+ padding_mode=padding_mode,
70
+ clamp=clamp)
71
+ residual_params.update(base_params)
72
+ residual_params.update(
73
+ dict(activation_norm_type=activation_norm_type,
74
+ activation_norm_params=activation_norm_params,
75
+ weight_norm_type=weight_norm_type,
76
+ weight_norm_params=weight_norm_params,
77
+ padding=padding,
78
+ apply_noise=apply_noise))
79
+ shortcut_params.update(base_params)
80
+ shortcut_params.update(dict(kernel_size=1))
81
+ if skip_activation_norm:
82
+ shortcut_params.update(
83
+ dict(activation_norm_type=activation_norm_type,
84
+ activation_norm_params=activation_norm_params,
85
+ apply_noise=False))
86
+ if skip_weight_norm:
87
+ shortcut_params.update(
88
+ dict(weight_norm_type=weight_norm_type,
89
+ weight_norm_params=weight_norm_params))
90
+
91
+ # Residual branch.
92
+ if order.find('A') < order.find('C') and \
93
+ (activation_norm_type == '' or activation_norm_type == 'none'):
94
+ # Nonlinearity is the first operation in the residual path.
95
+ # In-place nonlinearity will modify the input variable and cause
96
+ # backward error.
97
+ first_inplace = False
98
+ else:
99
+ first_inplace = inplace_nonlinearity
100
+
101
+ (first_stride, second_stride, shortcut_stride,
102
+ first_blur, second_blur, shortcut_blur) = self._get_stride_blur()
103
+ self.conv_block_0 = block(
104
+ in_channels, hidden_channels,
105
+ kernel_size=kernel_size,
106
+ bias=biases[0],
107
+ nonlinearity=nonlinearity,
108
+ order=order[0:3],
109
+ inplace_nonlinearity=first_inplace,
110
+ stride=first_stride,
111
+ blur=first_blur,
112
+ **residual_params
113
+ )
114
+ self.conv_block_1 = block(
115
+ hidden_channels, out_channels,
116
+ kernel_size=kernel_size,
117
+ bias=biases[1],
118
+ nonlinearity=nonlinearity,
119
+ order=order[3:],
120
+ inplace_nonlinearity=inplace_nonlinearity,
121
+ stride=second_stride,
122
+ blur=second_blur,
123
+ **residual_params
124
+ )
125
+
126
+ # Shortcut branch.
127
+ if self.learn_shortcut:
128
+ if skip_nonlinearity:
129
+ skip_nonlinearity_type = nonlinearity
130
+ else:
131
+ skip_nonlinearity_type = ''
132
+ self.conv_block_s = skip_block(in_channels, out_channels,
133
+ bias=biases[2],
134
+ nonlinearity=skip_nonlinearity_type,
135
+ order=order[0:3],
136
+ stride=shortcut_stride,
137
+ blur=shortcut_blur,
138
+ **shortcut_params)
139
+ elif in_channels < out_channels:
140
+ if skip_nonlinearity:
141
+ skip_nonlinearity_type = nonlinearity
142
+ else:
143
+ skip_nonlinearity_type = ''
144
+ self.conv_block_s = skip_block(in_channels,
145
+ out_channels - in_channels,
146
+ bias=biases[2],
147
+ nonlinearity=skip_nonlinearity_type,
148
+ order=order[0:3],
149
+ stride=shortcut_stride,
150
+ blur=shortcut_blur,
151
+ **shortcut_params)
152
+
153
+ # Whether this block expects conditional inputs.
154
+ self.conditional = \
155
+ getattr(self.conv_block_0, 'conditional', False) or \
156
+ getattr(self.conv_block_1, 'conditional', False)
157
+
158
+ def _get_stride_blur(self):
159
+ if self.stride > 1:
160
+ # Downsampling.
161
+ first_stride, second_stride = 1, self.stride
162
+ first_blur, second_blur = False, self.blur
163
+ shortcut_stride = self.stride
164
+ shortcut_blur = self.blur
165
+ self.upsample = None
166
+ elif self.stride < 1:
167
+ # Upsampling.
168
+ first_stride, second_stride = self.stride, 1
169
+ first_blur, second_blur = self.blur, False
170
+ shortcut_blur = False
171
+ shortcut_stride = 1
172
+ if self.blur:
173
+ # The shortcut branch uses blur_upsample + stride-1 conv
174
+ self.upsample = BlurUpsample()
175
+ else:
176
+ shortcut_stride = self.stride
177
+ self.upsample = nn.Upsample(scale_factor=2)
178
+ else:
179
+ first_stride = second_stride = 1
180
+ first_blur = second_blur = False
181
+ shortcut_stride = 1
182
+ shortcut_blur = False
183
+ self.upsample = None
184
+ return (first_stride, second_stride, shortcut_stride,
185
+ first_blur, second_blur, shortcut_blur)
186
+
187
+ def conv_blocks(
188
+ self, x, *cond_inputs, separate_cond=False, **kw_cond_inputs
189
+ ):
190
+ r"""Returns the output of the residual branch.
191
+
192
+ Args:
193
+ x (tensor): Input tensor.
194
+ cond_inputs (list of tensors) : Conditional input tensors.
195
+ kw_cond_inputs (dict) : Keyword conditional inputs.
196
+ Returns:
197
+ dx (tensor): Output tensor.
198
+ """
199
+ if separate_cond:
200
+ dx = self.conv_block_0(x, cond_inputs[0],
201
+ **kw_cond_inputs.get('kwargs_0', {}))
202
+ dx = self.conv_block_1(dx, cond_inputs[1],
203
+ **kw_cond_inputs.get('kwargs_1', {}))
204
+ else:
205
+ dx = self.conv_block_0(x, *cond_inputs, **kw_cond_inputs)
206
+ dx = self.conv_block_1(dx, *cond_inputs, **kw_cond_inputs)
207
+ return dx
208
+
209
+ def forward(self, x, *cond_inputs, do_checkpoint=False, separate_cond=False,
210
+ **kw_cond_inputs):
211
+ r"""
212
+
213
+ Args:
214
+ x (tensor): Input tensor.
215
+ cond_inputs (list of tensors) : Conditional input tensors.
216
+ do_checkpoint (bool, optional, default=``False``) If ``True``,
217
+ trade compute for memory by checkpointing the model.
218
+ kw_cond_inputs (dict) : Keyword conditional inputs.
219
+ Returns:
220
+ output (tensor): Output tensor.
221
+ """
222
+ if do_checkpoint:
223
+ dx = checkpoint(self.conv_blocks, x, *cond_inputs,
224
+ separate_cond=separate_cond, **kw_cond_inputs)
225
+ else:
226
+ dx = self.conv_blocks(x, *cond_inputs,
227
+ separate_cond=separate_cond, **kw_cond_inputs)
228
+
229
+ if self.upsample_first and self.upsample is not None:
230
+ x = self.upsample(x)
231
+ if self.learn_shortcut:
232
+ if separate_cond:
233
+ x_shortcut = self.conv_block_s(
234
+ x, cond_inputs[2], **kw_cond_inputs.get('kwargs_2', {})
235
+ )
236
+ else:
237
+ x_shortcut = self.conv_block_s(
238
+ x, *cond_inputs, **kw_cond_inputs
239
+ )
240
+ elif self.in_channels < self.out_channels:
241
+ if separate_cond:
242
+ x_shortcut_pad = self.conv_block_s(
243
+ x, cond_inputs[2], **kw_cond_inputs.get('kwargs_2', {})
244
+ )
245
+ else:
246
+ x_shortcut_pad = self.conv_block_s(
247
+ x, *cond_inputs, **kw_cond_inputs
248
+ )
249
+ x_shortcut = torch.cat((x, x_shortcut_pad), dim=1)
250
+ elif self.in_channels > self.out_channels:
251
+ x_shortcut = x[:, :self.out_channels, :, :]
252
+ else:
253
+ x_shortcut = x
254
+ if not self.upsample_first and self.upsample is not None:
255
+ x_shortcut = self.upsample(x_shortcut)
256
+
257
+ output = x_shortcut + dx
258
+ return self.output_scale * output
259
+
260
+ def extra_repr(self):
261
+ s = 'output_scale={output_scale}'
262
+ return s.format(**self.__dict__)
263
+
264
+
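A minimal sketch of the forward options documented above, using the Res2dBlock subclass defined later in this file (assumes PyTorch and the imaginaire package are importable; tensor shapes are illustrative):
import torch
from imaginaire.layers.residual import Res2dBlock

# Identity shortcut: in_channels == out_channels, so no learned shortcut is needed.
block = Res2dBlock(16, 16, activation_norm_type='instance')
x = torch.randn(1, 16, 32, 32)
y = block(x)  # same shape as x: (1, 16, 32, 32)
# do_checkpoint=True would wrap the residual branch in torch.utils.checkpoint,
# and separate_cond=True would route cond_inputs[0] / cond_inputs[1] to the two
# conv blocks individually (with cond_inputs[2] reserved for a learned shortcut).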
265
+ class ModulatedRes2dBlock(_BaseResBlock):
266
+ def __init__(self, in_channels, out_channels, style_dim, kernel_size=3,
267
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
268
+ padding_mode='zeros',
269
+ weight_norm_type='none', weight_norm_params=None,
270
+ activation_norm_type='none', activation_norm_params=None,
271
+ skip_activation_norm=True, skip_nonlinearity=False,
272
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
273
+ apply_noise=True, hidden_channels_equal_out_channels=False,
274
+ order='CNACNA', learn_shortcut=None, clamp=None, output_scale=1,
275
+ demodulate=True, eps=1e-8):
276
+ block = functools.partial(ModulatedConv2dBlock,
277
+ style_dim=style_dim,
278
+ demodulate=demodulate, eps=eps)
279
+ skip_block = Conv2dBlock
280
+ super().__init__(in_channels, out_channels, kernel_size, stride,
281
+ padding, dilation, groups, bias, padding_mode,
282
+ weight_norm_type, weight_norm_params,
283
+ activation_norm_type, activation_norm_params,
284
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
285
+ inplace_nonlinearity, apply_noise,
286
+ hidden_channels_equal_out_channels, order, block,
287
+ learn_shortcut, clamp, output_scale, skip_block=skip_block)
288
+
289
+ def conv_blocks(self, x, *cond_inputs, **kw_cond_inputs):
290
+ assert len(list(cond_inputs)) == 2
291
+ dx = self.conv_block_0(x, cond_inputs[0], **kw_cond_inputs)
292
+ dx = self.conv_block_1(dx, cond_inputs[1], **kw_cond_inputs)
293
+ return dx
294
+
295
+
296
+ class ResLinearBlock(_BaseResBlock):
297
+ r"""Residual block with full-connected layers.
298
+
299
+ Args:
300
+ in_channels (int) : Number of channels in the input tensor.
301
+ out_channels (int) : Number of channels in the output tensor.
302
+ weight_norm_type (str, optional, default='none'):
303
+ Type of weight normalization.
304
+ ``'none'``, ``'spectral'``, ``'weight'``
305
+ or ``'weight_demod'``.
306
+ weight_norm_params (obj, optional, default=None):
307
+ Parameters of weight normalization.
308
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
309
+ keyword arguments when initializing weight normalization.
310
+ activation_norm_type (str, optional, default='none'):
311
+ Type of activation normalization.
312
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
313
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
314
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
315
+ activation_norm_params (obj, optional, default=None):
316
+ Parameters of activation normalization.
317
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
318
+ keyword arguments when initializing activation normalization.
319
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
320
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
321
+ learned shortcut connection.
322
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
323
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
324
+ learned shortcut connection.
325
+ nonlinearity (str, optional, default='none'):
326
+ Type of nonlinear activation function in the residual link.
327
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
328
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
329
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
330
+ set ``inplace=True`` when initializing the nonlinearity layers.
331
+ apply_noise (bool, optional, default=False): If ``True``, add
332
+ Gaussian noise with learnable magnitude after the
333
+ fully-connected layer.
334
+ hidden_channels_equal_out_channels (bool, optional, default=False):
335
+ If ``True``, set the hidden channel number to be equal to the
336
+ output channel number. If ``False``, the hidden channel number
337
+ equals to the smaller of the input channel number and the
338
+ output channel number.
339
+ order (str, optional, default='CNACNA'): Order of operations
340
+ in the residual link.
341
+ ``'C'``: fully-connected,
342
+ ``'N'``: normalization,
343
+ ``'A'``: nonlinear activation.
344
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
345
+ a convolutional shortcut instead of an identity one, otherwise only
346
+ use a convolutional one if input and output have different number of
347
+ channels.
348
+ """
349
+
350
+ def __init__(self, in_channels, out_channels, bias=True,
351
+ weight_norm_type='none', weight_norm_params=None,
352
+ activation_norm_type='none', activation_norm_params=None,
353
+ skip_activation_norm=True, skip_nonlinearity=False,
354
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
355
+ apply_noise=False, hidden_channels_equal_out_channels=False,
356
+ order='CNACNA', learn_shortcut=None, clamp=None,
357
+ output_scale=1):
358
+ super().__init__(in_channels, out_channels, None, 1, None, None,
359
+ None, bias, None, weight_norm_type, weight_norm_params,
360
+ activation_norm_type, activation_norm_params,
361
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
362
+ inplace_nonlinearity, apply_noise,
363
+ hidden_channels_equal_out_channels, order, LinearBlock,
364
+ learn_shortcut, clamp, output_scale)
365
+
366
+
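A brief sketch of ResLinearBlock on flat feature vectors, under the same import assumption:
import torch
from imaginaire.layers.residual import ResLinearBlock

mlp_block = ResLinearBlock(256, 256, nonlinearity='relu')
z = torch.randn(8, 256)
out = mlp_block(z)  # residual fully-connected block, output shape (8, 256)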
367
+ class Res1dBlock(_BaseResBlock):
368
+ r"""Residual block for 1D input.
369
+
370
+ Args:
371
+ in_channels (int) : Number of channels in the input tensor.
372
+ out_channels (int) : Number of channels in the output tensor.
373
+ kernel_size (int, optional, default=3): Kernel size for the
374
+ convolutional filters in the residual link.
375
+ padding (int, optional, default=1): Padding size.
376
+ dilation (int, optional, default=1): Dilation factor.
377
+ groups (int, optional, default=1): Number of convolutional/linear
378
+ groups.
379
+ padding_mode (string, optional, default='zeros'): Type of padding:
380
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
381
+ weight_norm_type (str, optional, default='none'):
382
+ Type of weight normalization.
383
+ ``'none'``, ``'spectral'``, ``'weight'``
384
+ or ``'weight_demod'``.
385
+ weight_norm_params (obj, optional, default=None):
386
+ Parameters of weight normalization.
387
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
388
+ keyword arguments when initializing weight normalization.
389
+ activation_norm_type (str, optional, default='none'):
390
+ Type of activation normalization.
391
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
392
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
393
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
394
+ activation_norm_params (obj, optional, default=None):
395
+ Parameters of activation normalization.
396
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
397
+ keyword arguments when initializing activation normalization.
398
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
399
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
400
+ learned shortcut connection.
401
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
402
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
403
+ learned shortcut connection.
404
+ nonlinearity (str, optional, default='none'):
405
+ Type of nonlinear activation function in the residual link.
406
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
407
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
408
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
409
+ set ``inplace=True`` when initializing the nonlinearity layers.
410
+ apply_noise (bool, optional, default=False): If ``True``, adds
411
+ Gaussian noise with learnable magnitude to the convolution output.
412
+ hidden_channels_equal_out_channels (bool, optional, default=False):
413
+ If ``True``, set the hidden channel number to be equal to the
414
+ output channel number. If ``False``, the hidden channel number
415
+ equals to the smaller of the input channel number and the
416
+ output channel number.
417
+ order (str, optional, default='CNACNA'): Order of operations
418
+ in the residual link.
419
+ ``'C'``: convolution,
420
+ ``'N'``: normalization,
421
+ ``'A'``: nonlinear activation.
422
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
423
+ a convolutional shortcut instead of an identity one, otherwise only
424
+ use a convolutional one if input and output have different number of
425
+ channels.
426
+ """
427
+
428
+ def __init__(self, in_channels, out_channels, kernel_size=3,
429
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
430
+ padding_mode='zeros',
431
+ weight_norm_type='none', weight_norm_params=None,
432
+ activation_norm_type='none', activation_norm_params=None,
433
+ skip_activation_norm=True, skip_nonlinearity=False,
434
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
435
+ apply_noise=False, hidden_channels_equal_out_channels=False,
436
+ order='CNACNA', learn_shortcut=None, clamp=None,
437
+ output_scale=1):
438
+ super().__init__(in_channels, out_channels, kernel_size, stride,
439
+ padding, dilation, groups, bias, padding_mode,
440
+ weight_norm_type, weight_norm_params,
441
+ activation_norm_type, activation_norm_params,
442
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
443
+ inplace_nonlinearity, apply_noise,
444
+ hidden_channels_equal_out_channels, order, Conv1dBlock,
445
+ learn_shortcut, clamp, output_scale)
446
+
447
+
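Res1dBlock expects (batch, channels, length) tensors; a small sketch:
import torch
from imaginaire.layers.residual import Res1dBlock

block = Res1dBlock(32, 32)
seq = torch.randn(4, 32, 100)  # (batch, channels, length)
out = block(seq)  # kernel_size=3 with padding=1 preserves the length: (4, 32, 100)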
448
+ class Res2dBlock(_BaseResBlock):
449
+ r"""Residual block for 2D input.
450
+
451
+ Args:
452
+ in_channels (int) : Number of channels in the input tensor.
453
+ out_channels (int) : Number of channels in the output tensor.
454
+ kernel_size (int, optional, default=3): Kernel size for the
455
+ convolutional filters in the residual link.
456
+ padding (int, optional, default=1): Padding size.
457
+ dilation (int, optional, default=1): Dilation factor.
458
+ groups (int, optional, default=1): Number of convolutional/linear
459
+ groups.
460
+ padding_mode (string, optional, default='zeros'): Type of padding:
461
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
462
+ weight_norm_type (str, optional, default='none'):
463
+ Type of weight normalization.
464
+ ``'none'``, ``'spectral'``, ``'weight'``
465
+ or ``'weight_demod'``.
466
+ weight_norm_params (obj, optional, default=None):
467
+ Parameters of weight normalization.
468
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
469
+ keyword arguments when initializing weight normalization.
470
+ activation_norm_type (str, optional, default='none'):
471
+ Type of activation normalization.
472
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
473
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
474
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
475
+ activation_norm_params (obj, optional, default=None):
476
+ Parameters of activation normalization.
477
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
478
+ keyword arguments when initializing activation normalization.
479
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
480
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
481
+ learned shortcut connection.
482
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
483
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
484
+ learned shortcut connection.
485
+ nonlinearity (str, optional, default='none'):
486
+ Type of nonlinear activation function in the residual link.
487
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
488
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
489
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
490
+ set ``inplace=True`` when initializing the nonlinearity layers.
491
+ apply_noise (bool, optional, default=False): If ``True``, adds
492
+ Gaussian noise with learnable magnitude to the convolution output.
493
+ hidden_channels_equal_out_channels (bool, optional, default=False):
494
+ If ``True``, set the hidden channel number to be equal to the
495
+ output channel number. If ``False``, the hidden channel number
496
+ equals to the smaller of the input channel number and the
497
+ output channel number.
498
+ order (str, optional, default='CNACNA'): Order of operations
499
+ in the residual link.
500
+ ``'C'``: convolution,
501
+ ``'N'``: normalization,
502
+ ``'A'``: nonlinear activation.
503
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
504
+ a convolutional shortcut instead of an identity one, otherwise only
505
+ use a convolutional one if input and output have different number of
506
+ channels.
507
+ """
508
+
509
+ def __init__(self, in_channels, out_channels, kernel_size=3,
510
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
511
+ padding_mode='zeros',
512
+ weight_norm_type='none', weight_norm_params=None,
513
+ activation_norm_type='none', activation_norm_params=None,
514
+ skip_activation_norm=True, skip_nonlinearity=False,
515
+ skip_weight_norm=True,
516
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
517
+ apply_noise=False, hidden_channels_equal_out_channels=False,
518
+ order='CNACNA', learn_shortcut=None, clamp=None,
519
+ output_scale=1, blur=False, upsample_first=True):
520
+ super().__init__(in_channels, out_channels, kernel_size, stride,
521
+ padding, dilation, groups, bias, padding_mode,
522
+ weight_norm_type, weight_norm_params,
523
+ activation_norm_type, activation_norm_params,
524
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
525
+ inplace_nonlinearity, apply_noise,
526
+ hidden_channels_equal_out_channels, order, Conv2dBlock,
527
+ learn_shortcut, clamp, output_scale, blur=blur,
528
+ upsample_first=upsample_first,
529
+ skip_weight_norm=skip_weight_norm)
530
+
531
+
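A sketch of a typical Res2dBlock configuration; the pre-activation order and instance normalization below are illustrative choices, not prescribed settings:
import torch
from imaginaire.layers.residual import Res2dBlock

block = Res2dBlock(64, 128,
                   activation_norm_type='instance',
                   nonlinearity='relu',
                   order='NACNAC')  # pre-activation variant of the default 'CNACNA'
x = torch.randn(2, 64, 64, 64)
out = block(x)  # (2, 128, 64, 64); the shortcut pads the extra 64 channels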
532
+ class Res3dBlock(_BaseResBlock):
533
+ r"""Residual block for 3D input.
534
+
535
+ Args:
536
+ in_channels (int) : Number of channels in the input tensor.
537
+ out_channels (int) : Number of channels in the output tensor.
538
+ kernel_size (int, optional, default=3): Kernel size for the
539
+ convolutional filters in the residual link.
540
+ padding (int, optional, default=1): Padding size.
541
+ dilation (int, optional, default=1): Dilation factor.
542
+ groups (int, optional, default=1): Number of convolutional/linear
543
+ groups.
544
+ padding_mode (string, optional, default='zeros'): Type of padding:
545
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
546
+ weight_norm_type (str, optional, default='none'):
547
+ Type of weight normalization.
548
+ ``'none'``, ``'spectral'``, ``'weight'``
549
+ or ``'weight_demod'``.
550
+ weight_norm_params (obj, optional, default=None):
551
+ Parameters of weight normalization.
552
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
553
+ keyword arguments when initializing weight normalization.
554
+ activation_norm_type (str, optional, default='none'):
555
+ Type of activation normalization.
556
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
557
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
558
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
559
+ activation_norm_params (obj, optional, default=None):
560
+ Parameters of activation normalization.
561
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
562
+ keyword arguments when initializing activation normalization.
563
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
564
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
565
+ learned shortcut connection.
566
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
567
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
568
+ learned shortcut connection.
569
+ nonlinearity (str, optional, default='none'):
570
+ Type of nonlinear activation function in the residual link.
571
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
572
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
573
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
574
+ set ``inplace=True`` when initializing the nonlinearity layers.
575
+ apply_noise (bool, optional, default=False): If ``True``, adds
576
+ Gaussian noise with learnable magnitude to the convolution output.
577
+ hidden_channels_equal_out_channels (bool, optional, default=False):
578
+ If ``True``, set the hidden channel number to be equal to the
579
+ output channel number. If ``False``, the hidden channel number
580
+ equals to the smaller of the input channel number and the
581
+ output channel number.
582
+ order (str, optional, default='CNACNA'): Order of operations
583
+ in the residual link.
584
+ ``'C'``: convolution,
585
+ ``'N'``: normalization,
586
+ ``'A'``: nonlinear activation.
587
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
588
+ a convolutional shortcut instead of an identity one, otherwise only
589
+ use a convolutional one if input and output have different number of
590
+ channels.
591
+ """
592
+
593
+ def __init__(self, in_channels, out_channels, kernel_size=3,
594
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
595
+ padding_mode='zeros',
596
+ weight_norm_type='none', weight_norm_params=None,
597
+ activation_norm_type='none', activation_norm_params=None,
598
+ skip_activation_norm=True, skip_nonlinearity=False,
599
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
600
+ apply_noise=False, hidden_channels_equal_out_channels=False,
601
+ order='CNACNA', learn_shortcut=None, clamp=None,
602
+ output_scale=1):
603
+ super().__init__(in_channels, out_channels, kernel_size, stride,
604
+ padding, dilation, groups, bias, padding_mode,
605
+ weight_norm_type, weight_norm_params,
606
+ activation_norm_type, activation_norm_params,
607
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
608
+ inplace_nonlinearity, apply_noise,
609
+ hidden_channels_equal_out_channels, order, Conv3dBlock,
610
+ learn_shortcut, clamp, output_scale)
611
+
612
+
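Res3dBlock operates on 5D volumetric or spatio-temporal tensors; a quick sketch:
import torch
from imaginaire.layers.residual import Res3dBlock

block = Res3dBlock(16, 16)
video = torch.randn(2, 16, 8, 32, 32)  # (batch, channels, depth, height, width)
out = block(video)  # shape preserved: (2, 16, 8, 32, 32)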
613
+ class _BaseHyperResBlock(_BaseResBlock):
614
+ r"""An abstract class for hyper residual blocks.
615
+ """
616
+
617
+ def __init__(self, in_channels, out_channels, kernel_size,
618
+ stride, padding, dilation, groups, bias, padding_mode,
619
+ weight_norm_type, weight_norm_params,
620
+ activation_norm_type, activation_norm_params,
621
+ skip_activation_norm, skip_nonlinearity,
622
+ nonlinearity, inplace_nonlinearity, apply_noise,
623
+ hidden_channels_equal_out_channels,
624
+ order, is_hyper_conv, is_hyper_norm, block, learn_shortcut,
625
+ clamp=None, output_scale=1):
626
+ block = functools.partial(block,
627
+ is_hyper_conv=is_hyper_conv,
628
+ is_hyper_norm=is_hyper_norm)
629
+ super().__init__(in_channels, out_channels, kernel_size, stride,
630
+ padding, dilation, groups, bias, padding_mode,
631
+ weight_norm_type, weight_norm_params,
632
+ activation_norm_type, activation_norm_params,
633
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
634
+ inplace_nonlinearity, apply_noise,
635
+ hidden_channels_equal_out_channels, order, block,
636
+ learn_shortcut, clamp, output_scale)
637
+
638
+ def forward(self, x, *cond_inputs, conv_weights=(None,) * 3,
639
+ norm_weights=(None,) * 3, **kw_cond_inputs):
640
+ r"""
641
+
642
+ Args:
643
+ x (tensor): Input tensor.
644
+ cond_inputs (list of tensors) : Conditional input tensors.
645
+ conv_weights (list of tensors): Convolution weights for
646
+ three convolutional layers respectively.
647
+ norm_weights (list of tensors): Normalization weights for
648
+ three convolutional layers respectively.
649
+ kw_cond_inputs (dict) : Keyword conditional inputs.
650
+ Returns:
651
+ output (tensor): Output tensor.
652
+ """
653
+ dx = self.conv_block_0(x, *cond_inputs, conv_weights=conv_weights[0],
654
+ norm_weights=norm_weights[0])
655
+ dx = self.conv_block_1(dx, *cond_inputs, conv_weights=conv_weights[1],
656
+ norm_weights=norm_weights[1])
657
+ if self.learn_shortcut:
658
+ x_shortcut = self.conv_block_s(x, *cond_inputs,
659
+ conv_weights=conv_weights[2],
660
+ norm_weights=norm_weights[2])
661
+ else:
662
+ x_shortcut = x
663
+ output = x_shortcut + dx
664
+ return self.output_scale * output
665
+
666
+
667
+ class HyperRes2dBlock(_BaseHyperResBlock):
668
+ r"""Hyper residual block for 2D input.
669
+
670
+ Args:
671
+ in_channels (int) : Number of channels in the input tensor.
672
+ out_channels (int) : Number of channels in the output tensor.
673
+ kernel_size (int, optional, default=3): Kernel size for the
674
+ convolutional filters in the residual link.
675
+ padding (int, optional, default=1): Padding size.
676
+ dilation (int, optional, default=1): Dilation factor.
677
+ groups (int, optional, default=1): Number of convolutional/linear
678
+ groups.
679
+ padding_mode (string, optional, default='zeros'): Type of padding:
680
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
681
+ weight_norm_type (str, optional, default='none'):
682
+ Type of weight normalization.
683
+ ``'none'``, ``'spectral'``, ``'weight'``
684
+ or ``'weight_demod'``.
685
+ weight_norm_params (obj, optional, default=None):
686
+ Parameters of weight normalization.
687
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
688
+ keyword arguments when initializing weight normalization.
689
+ activation_norm_type (str, optional, default='none'):
690
+ Type of activation normalization.
691
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
692
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
693
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
694
+ activation_norm_params (obj, optional, default=None):
695
+ Parameters of activation normalization.
696
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
697
+ keyword arguments when initializing activation normalization.
698
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
699
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
700
+ learned shortcut connection.
701
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
702
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
703
+ learned shortcut connection.
704
+ nonlinearity (str, optional, default='none'):
705
+ Type of nonlinear activation function in the residual link.
706
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
707
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
708
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
709
+ set ``inplace=True`` when initializing the nonlinearity layers.
710
+ apply_noise (bool, optional, default=False): If ``True``, adds
711
+ Gaussian noise with learnable magnitude to the convolution output.
712
+ hidden_channels_equal_out_channels (bool, optional, default=False):
713
+ If ``True``, set the hidden channel number to be equal to the
714
+ output channel number. If ``False``, the hidden channel number
715
+ equals to the smaller of the input channel number and the
716
+ output channel number.
717
+ order (str, optional, default='CNACNA'): Order of operations
718
+ in the residual link.
719
+ ``'C'``: convolution,
720
+ ``'N'``: normalization,
721
+ ``'A'``: nonlinear activation.
722
+ is_hyper_conv (bool, optional, default=False): If ``True``, use
723
+ ``HyperConv2d``, otherwise use ``torch.nn.Conv2d``.
724
+ is_hyper_norm (bool, optional, default=False): If ``True``, use
725
+ hyper normalizations.
726
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
727
+ a convolutional shortcut instead of an identity one, otherwise only
728
+ use a convolutional one if input and output have different number of
729
+ channels.
730
+ """
731
+
732
+ def __init__(self, in_channels, out_channels, kernel_size=3,
733
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
734
+ padding_mode='zeros',
735
+ weight_norm_type='', weight_norm_params=None,
736
+ activation_norm_type='', activation_norm_params=None,
737
+ skip_activation_norm=True, skip_nonlinearity=False,
738
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
739
+ apply_noise=False, hidden_channels_equal_out_channels=False,
740
+ order='CNACNA', is_hyper_conv=False, is_hyper_norm=False,
741
+ learn_shortcut=None, clamp=None, output_scale=1):
742
+ super().__init__(in_channels, out_channels, kernel_size,
743
+ stride, padding, dilation, groups, bias, padding_mode,
744
+ weight_norm_type, weight_norm_params,
745
+ activation_norm_type, activation_norm_params,
746
+ skip_activation_norm, skip_nonlinearity,
747
+ nonlinearity, inplace_nonlinearity, apply_noise,
748
+ hidden_channels_equal_out_channels,
749
+ order, is_hyper_conv, is_hyper_norm,
750
+ HyperConv2dBlock, learn_shortcut, clamp, output_scale)
751
+
752
+
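When no hypernetwork-predicted weights are passed, HyperRes2dBlock's forward falls back to conv_weights=(None,)*3 and norm_weights=(None,)*3; a hedged sketch of that degenerate use (the hyper path itself needs externally generated weights, not shown here):
import torch
from imaginaire.layers.residual import HyperRes2dBlock

block = HyperRes2dBlock(32, 32, is_hyper_conv=False, is_hyper_norm=False)
x = torch.randn(1, 32, 64, 64)
out = block(x)  # behaves like a plain residual block when all weights are None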
753
+ class _BaseDownResBlock(_BaseResBlock):
754
+ r"""An abstract class for residual blocks with downsampling.
755
+ """
756
+
757
+ def __init__(self, in_channels, out_channels, kernel_size,
758
+ stride, padding, dilation, groups, bias, padding_mode,
759
+ weight_norm_type, weight_norm_params,
760
+ activation_norm_type, activation_norm_params,
761
+ skip_activation_norm, skip_nonlinearity,
762
+ nonlinearity, inplace_nonlinearity,
763
+ apply_noise, hidden_channels_equal_out_channels,
764
+ order, block, pooling, down_factor, learn_shortcut,
765
+ clamp=None, output_scale=1):
766
+ super().__init__(in_channels, out_channels, kernel_size,
767
+ stride, padding, dilation, groups, bias, padding_mode,
768
+ weight_norm_type, weight_norm_params,
769
+ activation_norm_type, activation_norm_params,
770
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
771
+ inplace_nonlinearity, apply_noise,
772
+ hidden_channels_equal_out_channels, order, block,
773
+ learn_shortcut, clamp, output_scale)
774
+ self.pooling = pooling(down_factor)
775
+
776
+ def forward(self, x, *cond_inputs):
777
+ r"""
778
+
779
+ Args:
780
+ x (tensor) : Input tensor.
781
+ cond_inputs (list of tensors) : conditional input.
782
+ Returns:
783
+ output (tensor) : Output tensor.
784
+ """
785
+ dx = self.conv_block_0(x, *cond_inputs)
786
+ dx = self.conv_block_1(dx, *cond_inputs)
787
+ dx = self.pooling(dx)
788
+ if self.learn_shortcut:
789
+ x_shortcut = self.conv_block_s(x, *cond_inputs)
790
+ else:
791
+ x_shortcut = x
792
+ x_shortcut = self.pooling(x_shortcut)
793
+ output = x_shortcut + dx
794
+ return self.output_scale * output
795
+
796
+
797
+ class DownRes2dBlock(_BaseDownResBlock):
798
+ r"""Residual block for 2D input with downsampling.
799
+
800
+ Args:
801
+ in_channels (int) : Number of channels in the input tensor.
802
+ out_channels (int) : Number of channels in the output tensor.
803
+ kernel_size (int, optional, default=3): Kernel size for the
804
+ convolutional filters in the residual link.
805
+ padding (int, optional, default=1): Padding size.
806
+ dilation (int, optional, default=1): Dilation factor.
807
+ groups (int, optional, default=1): Number of convolutional/linear
808
+ groups.
809
+ padding_mode (string, optional, default='zeros'): Type of padding:
810
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
811
+ weight_norm_type (str, optional, default='none'):
812
+ Type of weight normalization.
813
+ ``'none'``, ``'spectral'``, ``'weight'``
814
+ or ``'weight_demod'``.
815
+ weight_norm_params (obj, optional, default=None):
816
+ Parameters of weight normalization.
817
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
818
+ keyword arguments when initializing weight normalization.
819
+ activation_norm_type (str, optional, default='none'):
820
+ Type of activation normalization.
821
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
822
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
823
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
824
+ activation_norm_params (obj, optional, default=None):
825
+ Parameters of activation normalization.
826
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
827
+ keyword arguments when initializing activation normalization.
828
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
829
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
830
+ learned shortcut connection.
831
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
832
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
833
+ learned shortcut connection.
834
+ nonlinearity (str, optional, default='none'):
835
+ Type of nonlinear activation function in the residual link.
836
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
837
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
838
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
839
+ set ``inplace=True`` when initializing the nonlinearity layers.
840
+ apply_noise (bool, optional, default=False): If ``True``, adds
841
+ Gaussian noise with learnable magnitude to the convolution output.
842
+ hidden_channels_equal_out_channels (bool, optional, default=False):
843
+ If ``True``, set the hidden channel number to be equal to the
844
+ output channel number. If ``False``, the hidden channel number
845
+ equals to the smaller of the input channel number and the
846
+ output channel number.
847
+ order (str, optional, default='CNACNA'): Order of operations
848
+ in the residual link.
849
+ ``'C'``: convolution,
850
+ ``'N'``: normalization,
851
+ ``'A'``: nonlinear activation.
852
+ pooling (class, optional, default=nn.AvgPool2d): PyTorch pooling
853
+ layer to be used.
854
+ down_factor (int, optional, default=2): Downsampling factor.
855
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
856
+ a convolutional shortcut instead of an identity one, otherwise only
857
+ use a convolutional one if input and output have different number of
858
+ channels.
859
+ """
860
+
861
+ def __init__(self, in_channels, out_channels, kernel_size=3,
862
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
863
+ padding_mode='zeros',
864
+ weight_norm_type='none', weight_norm_params=None,
865
+ activation_norm_type='none', activation_norm_params=None,
866
+ skip_activation_norm=True, skip_nonlinearity=False,
867
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
868
+ apply_noise=False, hidden_channels_equal_out_channels=False,
869
+ order='CNACNA', pooling=nn.AvgPool2d, down_factor=2,
870
+ learn_shortcut=None, clamp=None, output_scale=1):
871
+ super().__init__(in_channels, out_channels, kernel_size,
872
+ stride, padding, dilation, groups, bias, padding_mode,
873
+ weight_norm_type, weight_norm_params,
874
+ activation_norm_type, activation_norm_params,
875
+ skip_activation_norm, skip_nonlinearity,
876
+ nonlinearity, inplace_nonlinearity, apply_noise,
877
+ hidden_channels_equal_out_channels,
878
+ order, Conv2dBlock, pooling,
879
+ down_factor, learn_shortcut, clamp, output_scale)
880
+
881
+
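DownRes2dBlock runs the residual branch and then pools both branches; a sketch with matching channel counts so the identity shortcut applies:
import torch
from imaginaire.layers.residual import DownRes2dBlock

block = DownRes2dBlock(32, 32, activation_norm_type='instance')
x = torch.randn(1, 32, 64, 64)
out = block(x)  # pooled by down_factor=2: (1, 32, 32, 32)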
882
+ class _BaseUpResBlock(_BaseResBlock):
883
+ r"""An abstract class for residual blocks with upsampling.
884
+ """
885
+
886
+ def __init__(self, in_channels, out_channels, kernel_size,
887
+ stride, padding, dilation, groups, bias, padding_mode,
888
+ weight_norm_type, weight_norm_params,
889
+ activation_norm_type, activation_norm_params,
890
+ skip_activation_norm, skip_nonlinearity,
891
+ nonlinearity, inplace_nonlinearity,
892
+ apply_noise, hidden_channels_equal_out_channels,
893
+ order, block, upsample, up_factor, learn_shortcut, clamp=None,
894
+ output_scale=1):
895
+ super().__init__(in_channels, out_channels, kernel_size,
896
+ stride, padding, dilation, groups, bias, padding_mode,
897
+ weight_norm_type, weight_norm_params,
898
+ activation_norm_type, activation_norm_params,
899
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
900
+ inplace_nonlinearity, apply_noise,
901
+ hidden_channels_equal_out_channels, order, block,
902
+ learn_shortcut, clamp, output_scale)
903
+ self.order = order
904
+ self.upsample = upsample(scale_factor=up_factor)
905
+
906
+ def _get_stride_blur(self):
907
+ # Upsampling.
908
+ first_stride, second_stride = self.stride, 1
909
+ first_blur, second_blur = self.blur, False
910
+ shortcut_blur = False
911
+ shortcut_stride = 1
912
+ # if self.upsample == 'blur_deconv':
913
+
914
+ if self.blur:
915
+ # The shortcut branch uses blur_upsample + stride-1 conv
916
+ self.upsample = BlurUpsample()
917
+ else:
918
+ shortcut_stride = self.stride
919
+ self.upsample = nn.Upsample(scale_factor=2)
920
+
921
+ return (first_stride, second_stride, shortcut_stride,
922
+ first_blur, second_blur, shortcut_blur)
923
+
924
+ def forward(self, x, *cond_inputs):
925
+ r"""Implementation of the up residual block forward function.
926
+ If the order is 'NAC' for the first residual block, we will first
927
+ do the activation norm and nonlinearity, in the original resolution.
928
+ We will then upsample the activation map to a higher resolution. We
929
+ then do the convolution.
930
+ In other orders, we first do the whole processing and
931
+ then upsample.
932
+
933
+ Args:
934
+ x (tensor) : Input tensor.
935
+ cond_inputs (list of tensors) : Conditional input.
936
+ Returns:
937
+ output (tensor) : Output tensor.
938
+ """
939
+ # In this particular upsample residual block operation, we first
940
+ # upsample the skip connection.
941
+ if self.learn_shortcut:
942
+ x_shortcut = self.upsample(x)
943
+ x_shortcut = self.conv_block_s(x_shortcut, *cond_inputs)
944
+ else:
945
+ x_shortcut = self.upsample(x)
946
+
947
+ if self.order[0:3] == 'NAC':
948
+ for ix, layer in enumerate(self.conv_block_0.layers.values()):
949
+ if getattr(layer, 'conditional', False):
950
+ x = layer(x, *cond_inputs)
951
+ else:
952
+ x = layer(x)
953
+ if ix == 1:
954
+ x = self.upsample(x)
955
+ else:
956
+ x = self.conv_block_0(x, *cond_inputs)
957
+ x = self.upsample(x)
958
+ x = self.conv_block_1(x, *cond_inputs)
959
+
960
+ output = x_shortcut + x
961
+ return self.output_scale * output
962
+
963
+
964
+ class UpRes2dBlock(_BaseUpResBlock):
965
+ r"""Residual block for 2D input with upsampling.
966
+
967
+ Args:
968
+ in_channels (int) : Number of channels in the input tensor.
969
+ out_channels (int) : Number of channels in the output tensor.
970
+ kernel_size (int, optional, default=3): Kernel size for the
971
+ convolutional filters in the residual link.
972
+ padding (int, optional, default=1): Padding size.
973
+ dilation (int, optional, default=1): Dilation factor.
974
+ groups (int, optional, default=1): Number of convolutional/linear
975
+ groups.
976
+ padding_mode (string, optional, default='zeros'): Type of padding:
977
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
978
+ weight_norm_type (str, optional, default='none'):
979
+ Type of weight normalization.
980
+ ``'none'``, ``'spectral'``, ``'weight'``
981
+ or ``'weight_demod'``.
982
+ weight_norm_params (obj, optional, default=None):
983
+ Parameters of weight normalization.
984
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
985
+ keyword arguments when initializing weight normalization.
986
+ activation_norm_type (str, optional, default='none'):
987
+ Type of activation normalization.
988
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
989
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
990
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
991
+ activation_norm_params (obj, optional, default=None):
992
+ Parameters of activation normalization.
993
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
994
+ keyword arguments when initializing activation normalization.
995
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
996
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
997
+ learned shortcut connection.
998
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
999
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
1000
+ learned shortcut connection.
1001
+ nonlinearity (str, optional, default='none'):
1002
+ Type of nonlinear activation function in the residual link.
1003
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
1004
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
1005
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
1006
+ set ``inplace=True`` when initializing the nonlinearity layers.
1007
+ apply_noise (bool, optional, default=False): If ``True``, adds
1008
+ Gaussian noise with learnable magnitude to the convolution output.
1009
+ hidden_channels_equal_out_channels (bool, optional, default=False):
1010
+ If ``True``, set the hidden channel number to be equal to the
1011
+ output channel number. If ``False``, the hidden channel number
1012
+ equals to the smaller of the input channel number and the
1013
+ output channel number.
1014
+ order (str, optional, default='CNACNA'): Order of operations
1015
+ in the residual link.
1016
+ ``'C'``: convolution,
1017
+ ``'N'``: normalization,
1018
+ ``'A'``: nonlinear activation.
1019
+ upsample (class, optional, default=NearestUpsample): PyTorch
1020
+ upsampling layer to be used.
1021
+ up_factor (int, optional, default=2): Upsampling factor.
1022
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
1023
+ a convolutional shortcut instead of an identity one, otherwise only
1024
+ use a convolutional one if input and output have different number of
1025
+ channels.
1026
+ """
1027
+
1028
+ def __init__(self, in_channels, out_channels, kernel_size=3,
1029
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
1030
+ padding_mode='zeros',
1031
+ weight_norm_type='none', weight_norm_params=None,
1032
+ activation_norm_type='none', activation_norm_params=None,
1033
+ skip_activation_norm=True, skip_nonlinearity=False,
1034
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
1035
+ apply_noise=False, hidden_channels_equal_out_channels=False,
1036
+ order='CNACNA', upsample=NearestUpsample, up_factor=2,
1037
+ learn_shortcut=None, clamp=None, output_scale=1):
1038
+ super().__init__(in_channels, out_channels, kernel_size,
1039
+ stride, padding, dilation, groups, bias, padding_mode,
1040
+ weight_norm_type, weight_norm_params,
1041
+ activation_norm_type, activation_norm_params,
1042
+ skip_activation_norm, skip_nonlinearity,
1043
+ nonlinearity, inplace_nonlinearity,
1044
+ apply_noise, hidden_channels_equal_out_channels,
1045
+ order, Conv2dBlock,
1046
+ upsample, up_factor, learn_shortcut, clamp,
1047
+ output_scale)
1048
+
1049
+
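UpRes2dBlock upsamples the shortcut first and, with the default 'CNACNA' order, upsamples the residual branch between its two conv blocks; a sketch with matching channel counts:
import torch
from imaginaire.layers.residual import UpRes2dBlock

block = UpRes2dBlock(32, 32, activation_norm_type='instance')
x = torch.randn(1, 32, 64, 64)
out = block(x)  # upsampled by up_factor=2: (1, 32, 128, 128)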
1050
+ class _BasePartialResBlock(_BaseResBlock):
1051
+ r"""An abstract class for residual blocks with partial convolution.
1052
+ """
1053
+
1054
+ def __init__(self, in_channels, out_channels, kernel_size,
1055
+ stride, padding, dilation, groups, bias, padding_mode,
1056
+ weight_norm_type, weight_norm_params,
1057
+ activation_norm_type, activation_norm_params,
1058
+ skip_activation_norm, skip_nonlinearity,
1059
+ nonlinearity, inplace_nonlinearity,
1060
+ multi_channel, return_mask,
1061
+ apply_noise, hidden_channels_equal_out_channels,
1062
+ order, block, learn_shortcut, clamp=None, output_scale=1):
1063
+ block = functools.partial(block,
1064
+ multi_channel=multi_channel,
1065
+ return_mask=return_mask)
1066
+ self.partial_conv = True
1067
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1068
+ padding, dilation, groups, bias, padding_mode,
1069
+ weight_norm_type, weight_norm_params,
1070
+ activation_norm_type, activation_norm_params,
1071
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
1072
+ inplace_nonlinearity, apply_noise,
1073
+ hidden_channels_equal_out_channels, order, block,
1074
+ learn_shortcut, clamp, output_scale)
1075
+
1076
+ def forward(self, x, *cond_inputs, mask_in=None, **kw_cond_inputs):
1077
+ r"""
1078
+
1079
+ Args:
1080
+ x (tensor): Input tensor.
1081
+ cond_inputs (list of tensors) : Conditional input tensors.
1082
+ mask_in (tensor, optional, default=``None``): If not ``None``,
1083
+ it masks the valid input region.
1084
+ kw_cond_inputs (dict) : Keyword conditional inputs.
1085
+ Returns:
1086
+ (tuple):
1087
+ - output (tensor): Output tensor.
1088
+ - mask_out (tensor, optional): Masks the valid output region.
1089
+ """
1090
+ if self.conv_block_0.layers.conv.return_mask:
1091
+ dx, mask_out = self.conv_block_0(x, *cond_inputs,
1092
+ mask_in=mask_in, **kw_cond_inputs)
1093
+ dx, mask_out = self.conv_block_1(dx, *cond_inputs,
1094
+ mask_in=mask_out, **kw_cond_inputs)
1095
+ else:
1096
+ dx = self.conv_block_0(x, *cond_inputs,
1097
+ mask_in=mask_in, **kw_cond_inputs)
1098
+ dx = self.conv_block_1(dx, *cond_inputs,
1099
+ mask_in=mask_in, **kw_cond_inputs)
1100
+ mask_out = None
1101
+
1102
+ if self.learn_shortcut:
1103
+ x_shortcut = self.conv_block_s(x, mask_in=mask_in, *cond_inputs,
1104
+ **kw_cond_inputs)
1105
+ if type(x_shortcut) == tuple:
1106
+ x_shortcut, _ = x_shortcut
1107
+ else:
1108
+ x_shortcut = x
1109
+ output = x_shortcut + dx
1110
+
1111
+ if mask_out is not None:
1112
+ return output, mask_out
1113
+ return self.output_scale * output
1114
+
1115
+
1116
+ class PartialRes2dBlock(_BasePartialResBlock):
1117
+ r"""Residual block for 2D input with partial convolution.
1118
+
1119
+ Args:
1120
+ in_channels (int) : Number of channels in the input tensor.
1121
+ out_channels (int) : Number of channels in the output tensor.
1122
+ kernel_size (int, optional, default=3): Kernel size for the
1123
+ convolutional filters in the residual link.
1124
+ padding (int, optional, default=1): Padding size.
1125
+ dilation (int, optional, default=1): Dilation factor.
1126
+ groups (int, optional, default=1): Number of convolutional/linear
1127
+ groups.
1128
+ padding_mode (string, optional, default='zeros'): Type of padding:
1129
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
1130
+ weight_norm_type (str, optional, default='none'):
1131
+ Type of weight normalization.
1132
+ ``'none'``, ``'spectral'``, ``'weight'``
1133
+ or ``'weight_demod'``.
1134
+ weight_norm_params (obj, optional, default=None):
1135
+ Parameters of weight normalization.
1136
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
1137
+ keyword arguments when initializing weight normalization.
1138
+ activation_norm_type (str, optional, default='none'):
1139
+ Type of activation normalization.
1140
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
1141
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
1142
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
1143
+ activation_norm_params (obj, optional, default=None):
1144
+ Parameters of activation normalization.
1145
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
1146
+ keyword arguments when initializing activation normalization.
1147
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
1148
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
1149
+ learned shortcut connection.
1150
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
1151
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
1152
+ learned shortcut connection.
1153
+ nonlinearity (str, optional, default='none'):
1154
+ Type of nonlinear activation function in the residual link.
1155
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
1156
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
1157
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
1158
+ set ``inplace=True`` when initializing the nonlinearity layers.
1159
+ apply_noise (bool, optional, default=False): If ``True``, adds
1160
+ Gaussian noise with learnable magnitude to the convolution output.
1161
+ hidden_channels_equal_out_channels (bool, optional, default=False):
1162
+ If ``True``, set the hidden channel number to be equal to the
1163
+ output channel number. If ``False``, the hidden channel number
1164
+ equals to the smaller of the input channel number and the
1165
+ output channel number.
1166
+ order (str, optional, default='CNACNA'): Order of operations
1167
+ in the residual link.
1168
+ ``'C'``: convolution,
1169
+ ``'N'``: normalization,
1170
+ ``'A'``: nonlinear activation.
1171
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
1172
+ a convolutional shortcut instead of an identity one, otherwise only
1173
+ use a convolutional one if input and output have different number of
1174
+ channels.
1175
+ """
1176
+
1177
+ def __init__(self, in_channels, out_channels, kernel_size=3,
1178
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
1179
+ padding_mode='zeros',
1180
+ weight_norm_type='none', weight_norm_params=None,
1181
+ activation_norm_type='none', activation_norm_params=None,
1182
+ skip_activation_norm=True, skip_nonlinearity=False,
1183
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
1184
+ multi_channel=False, return_mask=True,
1185
+ apply_noise=False,
1186
+ hidden_channels_equal_out_channels=False,
1187
+ order='CNACNA', learn_shortcut=None, clamp=None,
1188
+ output_scale=1):
1189
+ super().__init__(in_channels, out_channels, kernel_size,
1190
+ stride, padding, dilation, groups, bias,
1191
+ padding_mode, weight_norm_type, weight_norm_params,
1192
+ activation_norm_type, activation_norm_params,
1193
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
1194
+ inplace_nonlinearity, multi_channel, return_mask,
1195
+ apply_noise, hidden_channels_equal_out_channels,
1196
+ order, PartialConv2dBlock, learn_shortcut, clamp,
1197
+ output_scale)
1198
+
1199
+
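PartialRes2dBlock threads a validity mask through both partial convolutions and, with the default return_mask=True, returns the propagated mask alongside the features; a sketch:
import torch
from imaginaire.layers.residual import PartialRes2dBlock

block = PartialRes2dBlock(16, 16)
x = torch.randn(1, 16, 64, 64)
mask = torch.ones(1, 1, 64, 64)
mask[:, :, 16:48, 16:48] = 0  # mark a hole as invalid
out, mask_out = block(x, mask_in=mask)  # features plus the updated validity mask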
1200
+ class PartialRes3dBlock(_BasePartialResBlock):
1201
+ r"""Residual block for 3D input with partial convolution.
1202
+
1203
+ Args:
1204
+ in_channels (int) : Number of channels in the input tensor.
1205
+ out_channels (int) : Number of channels in the output tensor.
1206
+ kernel_size (int, optional, default=3): Kernel size for the
1207
+ convolutional filters in the residual link.
1208
+ padding (int, optional, default=1): Padding size.
1209
+ dilation (int, optional, default=1): Dilation factor.
1210
+ groups (int, optional, default=1): Number of convolutional/linear
1211
+ groups.
1212
+ padding_mode (string, optional, default='zeros'): Type of padding:
1213
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
1214
+ weight_norm_type (str, optional, default='none'):
1215
+ Type of weight normalization.
1216
+ ``'none'``, ``'spectral'``, ``'weight'``
1217
+ or ``'weight_demod'``.
1218
+ weight_norm_params (obj, optional, default=None):
1219
+ Parameters of weight normalization.
1220
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
1221
+ keyword arguments when initializing weight normalization.
1222
+ activation_norm_type (str, optional, default='none'):
1223
+ Type of activation normalization.
1224
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
1225
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
1226
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
1227
+ activation_norm_params (obj, optional, default=None):
1228
+ Parameters of activation normalization.
1229
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
1230
+ keyword arguments when initializing activation normalization.
1231
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
1232
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
1233
+ learned shortcut connection.
1234
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
1235
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
1236
+ learned shortcut connection.
1237
+ nonlinearity (str, optional, default='none'):
1238
+ Type of nonlinear activation function in the residual link.
1239
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
1240
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
1241
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
1242
+ set ``inplace=True`` when initializing the nonlinearity layers.
1243
+ apply_noise (bool, optional, default=False): If ``True``, adds
1244
+ Gaussian noise with learnable magnitude to the convolution output.
1245
+ hidden_channels_equal_out_channels (bool, optional, default=False):
1246
+ If ``True``, set the hidden channel number to be equal to the
1247
+ output channel number. If ``False``, the hidden channel number
1248
+ equals to the smaller of the input channel number and the
1249
+ output channel number.
1250
+ order (str, optional, default='CNACNA'): Order of operations
1251
+ in the residual link.
1252
+ ``'C'``: convolution,
1253
+ ``'N'``: normalization,
1254
+ ``'A'``: nonlinear activation.
1255
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
1256
+ a convolutional shortcut instead of an identity one, otherwise only
1257
+ use a convolutional one if input and output have different number of
1258
+ channels.
1259
+ """
1260
+
1261
+ def __init__(self, in_channels, out_channels, kernel_size=3,
1262
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
1263
+ padding_mode='zeros',
1264
+ weight_norm_type='none', weight_norm_params=None,
1265
+ activation_norm_type='none', activation_norm_params=None,
1266
+ skip_activation_norm=True, skip_nonlinearity=False,
1267
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
1268
+ multi_channel=False, return_mask=True,
1269
+ apply_noise=False, hidden_channels_equal_out_channels=False,
1270
+ order='CNACNA', learn_shortcut=None, clamp=None,
1271
+ output_scale=1):
1272
+ super().__init__(in_channels, out_channels, kernel_size,
1273
+ stride, padding, dilation, groups, bias,
1274
+ padding_mode, weight_norm_type, weight_norm_params,
1275
+ activation_norm_type, activation_norm_params,
1276
+ skip_activation_norm, skip_nonlinearity,
1277
+ nonlinearity, inplace_nonlinearity, multi_channel,
1278
+ return_mask, apply_noise,
1279
+ hidden_channels_equal_out_channels,
1280
+ order, PartialConv3dBlock, learn_shortcut, clamp,
1281
+ output_scale)
1282
+
1283
+
1284
+ class _BaseMultiOutResBlock(_BaseResBlock):
1285
+ r"""An abstract class for residual blocks that can return multiple outputs.
1286
+ """
1287
+
1288
+ def __init__(self, in_channels, out_channels, kernel_size,
1289
+ stride, padding, dilation, groups, bias, padding_mode,
1290
+ weight_norm_type, weight_norm_params,
1291
+ activation_norm_type, activation_norm_params,
1292
+ skip_activation_norm, skip_nonlinearity,
1293
+ nonlinearity, inplace_nonlinearity,
1294
+ apply_noise, hidden_channels_equal_out_channels,
1295
+ order, block, learn_shortcut, clamp=None, output_scale=1,
1296
+ blur=False, upsample_first=True):
1297
+ self.multiple_outputs = True
1298
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1299
+ padding, dilation, groups, bias, padding_mode,
1300
+ weight_norm_type, weight_norm_params,
1301
+ activation_norm_type, activation_norm_params,
1302
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
1303
+ inplace_nonlinearity, apply_noise,
1304
+ hidden_channels_equal_out_channels, order, block,
1305
+ learn_shortcut, clamp, output_scale, blur=blur,
1306
+ upsample_first=upsample_first)
1307
+
1308
+ def forward(self, x, *cond_inputs):
1309
+ r"""
1310
+
1311
+ Args:
1312
+ x (tensor): Input tensor.
1313
+ cond_inputs (list of tensors) : Conditional input tensors.
1314
+ Returns:
1315
+ (tuple):
1316
+ - output (tensor): Output tensor.
1317
+ - aux_outputs_0 (tensor): Auxiliary output of the first block.
1318
+ - aux_outputs_1 (tensor): Auxiliary output of the second block.
1319
+ """
1320
+ dx, aux_outputs_0 = self.conv_block_0(x, *cond_inputs)
1321
+ dx, aux_outputs_1 = self.conv_block_1(dx, *cond_inputs)
1322
+ if self.learn_shortcut:
1323
+ # We are not using the auxiliary outputs of self.conv_block_s.
1324
+ x_shortcut, _ = self.conv_block_s(x, *cond_inputs)
1325
+ else:
1326
+ x_shortcut = x
1327
+ output = x_shortcut + dx
1328
+ return self.output_scale * output, aux_outputs_0, aux_outputs_1
1329
+
1330
+
1331
+ class MultiOutRes2dBlock(_BaseMultiOutResBlock):
1332
+ r"""Residual block for 2D input. It can return multiple outputs, if some
1333
+ layers in the block return more than one output.
1334
+
1335
+ Args:
1336
+ in_channels (int) : Number of channels in the input tensor.
1337
+ out_channels (int) : Number of channels in the output tensor.
1338
+ kernel_size (int, optional, default=3): Kernel size for the
1339
+ convolutional filters in the residual link.
1340
+ padding (int, optional, default=1): Padding size.
1341
+ dilation (int, optional, default=1): Dilation factor.
1342
+ groups (int, optional, default=1): Number of convolutional/linear
1343
+ groups.
1344
+ padding_mode (string, optional, default='zeros'): Type of padding:
1345
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
1346
+ weight_norm_type (str, optional, default='none'):
1347
+ Type of weight normalization.
1348
+ ``'none'``, ``'spectral'``, ``'weight'``
1349
+ or ``'weight_demod'``.
1350
+ weight_norm_params (obj, optional, default=None):
1351
+ Parameters of weight normalization.
1352
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
1353
+ keyword arguments when initializing weight normalization.
1354
+ activation_norm_type (str, optional, default='none'):
1355
+ Type of activation normalization.
1356
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
1357
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
1358
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
1359
+ activation_norm_params (obj, optional, default=None):
1360
+ Parameters of activation normalization.
1361
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
1362
+ keyword arguments when initializing activation normalization.
1363
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
1364
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
1365
+ learned shortcut connection.
1366
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
1367
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
1368
+ learned shortcut connection.
1369
+ nonlinearity (str, optional, default='none'):
1370
+ Type of nonlinear activation function in the residual link.
1371
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
1372
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
1373
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
1374
+ set ``inplace=True`` when initializing the nonlinearity layers.
1375
+ apply_noise (bool, optional, default=False): If ``True``, adds
1376
+ Gaussian noise with learnable magnitude to the convolution output.
1377
+ hidden_channels_equal_out_channels (bool, optional, default=False):
1378
+ If ``True``, set the hidden channel number to be equal to the
1379
+ output channel number. If ``False``, the hidden channel number
1380
+ equals to the smaller of the input channel number and the
1381
+ output channel number.
1382
+ order (str, optional, default='CNACNA'): Order of operations
1383
+ in the residual link.
1384
+ ``'C'``: convolution,
1385
+ ``'N'``: normalization,
1386
+ ``'A'``: nonlinear activation.
1387
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
1388
+ a convolutional shortcut instead of an identity one, otherwise only
1389
+ use a convolutional one if input and output have different number of
1390
+ channels.
1391
+ """
1392
+
1393
+ def __init__(self, in_channels, out_channels, kernel_size=3,
1394
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
1395
+ padding_mode='zeros',
1396
+ weight_norm_type='none', weight_norm_params=None,
1397
+ activation_norm_type='none', activation_norm_params=None,
1398
+ skip_activation_norm=True, skip_nonlinearity=False,
1399
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
1400
+ apply_noise=False, hidden_channels_equal_out_channels=False,
1401
+ order='CNACNA', learn_shortcut=None, clamp=None,
1402
+ output_scale=1, blur=False, upsample_first=True):
1403
+ super().__init__(in_channels, out_channels, kernel_size, stride,
1404
+ padding, dilation, groups, bias, padding_mode,
1405
+ weight_norm_type, weight_norm_params,
1406
+ activation_norm_type, activation_norm_params,
1407
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
1408
+ inplace_nonlinearity, apply_noise,
1409
+ hidden_channels_equal_out_channels, order,
1410
+ MultiOutConv2dBlock, learn_shortcut, clamp,
1411
+ output_scale, blur=blur, upsample_first=upsample_first)
imaginaire/layers/residual_deep.py ADDED
@@ -0,0 +1,346 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch
6
+ from torch import nn
7
+ from torch.utils.checkpoint import checkpoint
8
+
9
+ from imaginaire.third_party.upfirdn2d import BlurDownsample, BlurUpsample
10
+ from .conv import Conv2dBlock
11
+
12
+
13
+ class _BaseDeepResBlock(nn.Module):
14
+ def __init__(self, in_channels, out_channels, kernel_size,
15
+ stride, padding, dilation, groups, bias, padding_mode,
16
+ weight_norm_type, weight_norm_params,
17
+ activation_norm_type, activation_norm_params,
18
+ skip_activation_norm, skip_nonlinearity,
19
+ nonlinearity, inplace_nonlinearity, apply_noise,
20
+ hidden_channels_equal_out_channels,
21
+ order, block, learn_shortcut, output_scale, skip_block=None,
22
+ blur=True, border_free=True, resample_first=True,
23
+ skip_weight_norm=True, hidden_channel_ratio=4):
24
+ super().__init__()
25
+ self.in_channels = in_channels
26
+ self.out_channels = out_channels
27
+ self.output_scale = output_scale
28
+ self.resample_first = resample_first
29
+ self.stride = stride
30
+ self.blur = blur
31
+ self.border_free = border_free
32
+ assert not border_free
33
+ if skip_block is None:
34
+ skip_block = block
35
+
36
+ if order == 'pre_act':
37
+ order = 'NACNAC'
38
+ if isinstance(bias, bool):
39
+ # The bias for conv_block_0, conv_block_1, and conv_block_s.
40
+ biases = [bias, bias, bias]
41
+ elif isinstance(bias, list):
42
+ if len(bias) == 3:
43
+ biases = bias
44
+ else:
45
+ raise ValueError('Bias list must have 3 elements.')
46
+ else:
47
+ raise ValueError('Bias must be either a boolean or a list.')
48
+ self.learn_shortcut = learn_shortcut
49
+ if len(order) > 6 or len(order) < 5:
50
+ raise ValueError('order must be either 5 or 6 characters')
51
+ hidden_channels = in_channels // hidden_channel_ratio
52
+
53
+ # Parameters.
54
+ residual_params = {}
55
+ shortcut_params = {}
56
+ base_params = dict(dilation=dilation,
57
+ groups=groups,
58
+ padding_mode=padding_mode)
59
+ residual_params.update(base_params)
60
+ residual_params.update(
61
+ dict(activation_norm_type=activation_norm_type,
62
+ activation_norm_params=activation_norm_params,
63
+ weight_norm_type=weight_norm_type,
64
+ weight_norm_params=weight_norm_params,
65
+ apply_noise=apply_noise)
66
+ )
67
+ shortcut_params.update(base_params)
68
+ shortcut_params.update(dict(kernel_size=1))
69
+ if skip_activation_norm:
70
+ shortcut_params.update(
71
+ dict(activation_norm_type=activation_norm_type,
72
+ activation_norm_params=activation_norm_params,
73
+ apply_noise=False))
74
+ if skip_weight_norm:
75
+ shortcut_params.update(
76
+ dict(weight_norm_type=weight_norm_type,
77
+ weight_norm_params=weight_norm_params))
78
+
79
+ # Residual branch.
80
+ if order.find('A') < order.find('C') and \
81
+ (activation_norm_type == '' or activation_norm_type == 'none'):
82
+ # Nonlinearity is the first operation in the residual path.
83
+ # In-place nonlinearity will modify the input variable and cause
84
+ # backward error.
85
+ first_inplace = False
86
+ else:
87
+ first_inplace = inplace_nonlinearity
88
+
89
+ (first_stride, second_stride, shortcut_stride,
90
+ first_blur, second_blur, shortcut_blur) = self._get_stride_blur()
91
+
92
+ self.conv_block_1x1_in = block(
93
+ in_channels, hidden_channels,
94
+ 1, 1, 0,
95
+ bias=biases[0],
96
+ nonlinearity=nonlinearity,
97
+ order=order[0:3],
98
+ inplace_nonlinearity=first_inplace,
99
+ **residual_params
100
+ )
101
+
102
+ self.conv_block_0 = block(
103
+ hidden_channels, hidden_channels,
104
+ kernel_size=2 if self.border_free and first_stride < 1 else
105
+ kernel_size,
106
+ padding=padding,
107
+ bias=biases[0],
108
+ nonlinearity=nonlinearity,
109
+ order=order[0:3],
110
+ inplace_nonlinearity=inplace_nonlinearity,
111
+ stride=first_stride,
112
+ blur=first_blur,
113
+ **residual_params
114
+ )
115
+ self.conv_block_1 = block(
116
+ hidden_channels, hidden_channels,
117
+ kernel_size=kernel_size,
118
+ padding=padding,
119
+ bias=biases[1],
120
+ nonlinearity=nonlinearity,
121
+ order=order[3:],
122
+ inplace_nonlinearity=inplace_nonlinearity,
123
+ stride=second_stride,
124
+ blur=second_blur,
125
+ **residual_params
126
+ )
127
+
128
+ self.conv_block_1x1_out = block(
129
+ hidden_channels, out_channels,
130
+ 1, 1, 0,
131
+ bias=biases[1],
132
+ nonlinearity=nonlinearity,
133
+ order=order[0:3],
134
+ inplace_nonlinearity=inplace_nonlinearity,
135
+ **residual_params
136
+ )
137
+
138
+ # Shortcut branch.
139
+ if self.learn_shortcut:
140
+ if skip_nonlinearity:
141
+ skip_nonlinearity_type = nonlinearity
142
+ else:
143
+ skip_nonlinearity_type = ''
144
+ self.conv_block_s = skip_block(in_channels, out_channels,
145
+ bias=biases[2],
146
+ nonlinearity=skip_nonlinearity_type,
147
+ order=order[0:3],
148
+ stride=shortcut_stride,
149
+ blur=shortcut_blur,
150
+ **shortcut_params)
151
+ elif in_channels < out_channels:
152
+ if skip_nonlinearity:
153
+ skip_nonlinearity_type = nonlinearity
154
+ else:
155
+ skip_nonlinearity_type = ''
156
+ self.conv_block_s = skip_block(in_channels,
157
+ out_channels - in_channels,
158
+ bias=biases[2],
159
+ nonlinearity=skip_nonlinearity_type,
160
+ order=order[0:3],
161
+ stride=shortcut_stride,
162
+ blur=shortcut_blur,
163
+ **shortcut_params)
164
+
165
+ # Whether this block expects conditional inputs.
166
+ self.conditional = \
167
+ getattr(self.conv_block_0, 'conditional', False) or \
168
+ getattr(self.conv_block_1, 'conditional', False) or \
169
+ getattr(self.conv_block_1x1_in, 'conditional', False) or \
170
+ getattr(self.conv_block_1x1_out, 'conditional', False)
171
+
172
+ def _get_stride_blur(self):
173
+ if self.stride > 1:
174
+ # Downsampling.
175
+ first_stride, second_stride = 1, self.stride
176
+ first_blur, second_blur = False, self.blur
177
+ shortcut_blur = False
178
+ shortcut_stride = 1
179
+ if self.blur:
180
+ # The shortcut branch uses blur_downsample + stride-1 conv
181
+ if self.border_free:
182
+ self.resample = nn.AvgPool2d(2)
183
+ else:
184
+ self.resample = BlurDownsample()
185
+ else:
186
+ shortcut_stride = self.stride
187
+ self.resample = nn.AvgPool2d(2)
188
+ elif self.stride < 1:
189
+ # Upsampling.
190
+ first_stride, second_stride = self.stride, 1
191
+ first_blur, second_blur = self.blur, False
192
+ shortcut_blur = False
193
+ shortcut_stride = 1
194
+ if self.blur:
195
+ # The shortcut branch uses blur_upsample + stride-1 conv
196
+ if self.border_free:
197
+ self.resample = nn.Upsample(scale_factor=2,
198
+ mode='bilinear')
199
+ else:
200
+ self.resample = BlurUpsample()
201
+ else:
202
+ shortcut_stride = self.stride
203
+ self.resample = nn.Upsample(scale_factor=2)
204
+ else:
205
+ first_stride = second_stride = 1
206
+ first_blur = second_blur = False
207
+ shortcut_stride = 1
208
+ shortcut_blur = False
209
+ self.resample = None
210
+ return (first_stride, second_stride, shortcut_stride,
211
+ first_blur, second_blur, shortcut_blur)
212
+
213
+ def conv_blocks(
214
+ self, x, *cond_inputs, separate_cond=False, **kw_cond_inputs
215
+ ):
216
+ if separate_cond:
217
+ assert len(list(cond_inputs)) == 4
218
+ dx = self.conv_block_1x1_in(x, cond_inputs[0],
219
+ **kw_cond_inputs.get('kwargs_0', {}))
220
+ dx = self.conv_block_0(dx, cond_inputs[1],
221
+ **kw_cond_inputs.get('kwargs_1', {}))
222
+ dx = self.conv_block_1(dx, cond_inputs[2],
223
+ **kw_cond_inputs.get('kwargs_2', {}))
224
+ dx = self.conv_block_1x1_out(dx, cond_inputs[3],
225
+ **kw_cond_inputs.get('kwargs_3', {}))
226
+ else:
227
+ dx = self.conv_block_1x1_in(x, *cond_inputs, **kw_cond_inputs)
228
+ dx = self.conv_block_0(dx, *cond_inputs, **kw_cond_inputs)
229
+ dx = self.conv_block_1(dx, *cond_inputs, **kw_cond_inputs)
230
+ dx = self.conv_block_1x1_out(dx, *cond_inputs, **kw_cond_inputs)
231
+ return dx
232
+
233
+ def forward(self, x, *cond_inputs, do_checkpoint=False, **kw_cond_inputs):
234
+ if do_checkpoint:
235
+ dx = checkpoint(self.conv_blocks, x, *cond_inputs, **kw_cond_inputs)
236
+ else:
237
+ dx = self.conv_blocks(x, *cond_inputs, **kw_cond_inputs)
238
+
239
+ if self.resample_first and self.resample is not None:
240
+ x = self.resample(x)
241
+ if self.learn_shortcut:
242
+ x_shortcut = self.conv_block_s(
243
+ x, *cond_inputs, **kw_cond_inputs
244
+ )
245
+ elif self.in_channels < self.out_channels:
246
+ x_shortcut_pad = self.conv_block_s(
247
+ x, *cond_inputs, **kw_cond_inputs
248
+ )
249
+ x_shortcut = torch.cat((x, x_shortcut_pad), dim=1)
250
+ elif self.in_channels > self.out_channels:
251
+ x_shortcut = x[:, :self.out_channels, :, :]
252
+ else:
253
+ x_shortcut = x
254
+ if not self.resample_first and self.resample is not None:
255
+ x_shortcut = self.resample(x_shortcut)
256
+
257
+ output = x_shortcut + dx
258
+ return self.output_scale * output
259
+
260
+ def extra_repr(self):
261
+ s = 'output_scale={output_scale}'
262
+ return s.format(**self.__dict__)
263
+
264
+
265
+ class DeepRes2dBlock(_BaseDeepResBlock):
266
+ r"""Residual block for 2D input.
267
+
268
+ Args:
269
+ in_channels (int) : Number of channels in the input tensor.
270
+ out_channels (int) : Number of channels in the output tensor.
271
+ kernel_size (int, optional, default=3): Kernel size for the
272
+ convolutional filters in the residual link.
273
+ padding (int, optional, default=1): Padding size.
274
+ dilation (int, optional, default=1): Dilation factor.
275
+ groups (int, optional, default=1): Number of convolutional/linear
276
+ groups.
277
+ padding_mode (string, optional, default='zeros'): Type of padding:
278
+ ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``.
279
+ weight_norm_type (str, optional, default='none'):
280
+ Type of weight normalization.
281
+ ``'none'``, ``'spectral'``, ``'weight'``
282
+ or ``'weight_demod'``.
283
+ weight_norm_params (obj, optional, default=None):
284
+ Parameters of weight normalization.
285
+ If not ``None``, ``weight_norm_params.__dict__`` will be used as
286
+ keyword arguments when initializing weight normalization.
287
+ activation_norm_type (str, optional, default='none'):
288
+ Type of activation normalization.
289
+ ``'none'``, ``'instance'``, ``'batch'``, ``'sync_batch'``,
290
+ ``'layer'``, ``'layer_2d'``, ``'group'``, ``'adaptive'``,
291
+ ``'spatially_adaptive'`` or ``'hyper_spatially_adaptive'``.
292
+ activation_norm_params (obj, optional, default=None):
293
+ Parameters of activation normalization.
294
+ If not ``None``, ``activation_norm_params.__dict__`` will be used as
295
+ keyword arguments when initializing activation normalization.
296
+ skip_activation_norm (bool, optional, default=True): If ``True`` and
297
+ ``learn_shortcut`` is also ``True``, applies activation norm to the
298
+ learned shortcut connection.
299
+ skip_nonlinearity (bool, optional, default=True): If ``True`` and
300
+ ``learn_shortcut`` is also ``True``, applies nonlinearity to the
301
+ learned shortcut connection.
302
+ nonlinearity (str, optional, default='none'):
303
+ Type of nonlinear activation function in the residual link.
304
+ ``'none'``, ``'relu'``, ``'leakyrelu'``, ``'prelu'``,
305
+ ``'tanh'`` , ``'sigmoid'`` or ``'softmax'``.
306
+ inplace_nonlinearity (bool, optional, default=False): If ``True``,
307
+ set ``inplace=True`` when initializing the nonlinearity layers.
308
+ apply_noise (bool, optional, default=False): If ``True``, adds
309
+ Gaussian noise with learnable magnitude to the convolution output.
310
+ hidden_channels_equal_out_channels (bool, optional, default=False):
311
+ If ``True``, set the hidden channel number to be equal to the
312
+ output channel number. If ``False``, the hidden channel number
313
+ equals to the smaller of the input channel number and the
314
+ output channel number.
315
+ order (str, optional, default='CNACNA'): Order of operations
316
+ in the residual link.
317
+ ``'C'``: convolution,
318
+ ``'N'``: normalization,
319
+ ``'A'``: nonlinear activation.
320
+ learn_shortcut (bool, optional, default=False): If ``True``, always use
321
+ a convolutional shortcut instead of an identity one, otherwise only
322
+ use a convolutional one if input and output have different number of
323
+ channels.
324
+ """
325
+
326
+ def __init__(self, in_channels, out_channels, kernel_size=3,
327
+ stride=1, padding=1, dilation=1, groups=1, bias=True,
328
+ padding_mode='zeros',
329
+ weight_norm_type='none', weight_norm_params=None,
330
+ activation_norm_type='none', activation_norm_params=None,
331
+ skip_activation_norm=True, skip_nonlinearity=False,
332
+ skip_weight_norm=True,
333
+ nonlinearity='leakyrelu', inplace_nonlinearity=False,
334
+ apply_noise=False, hidden_channels_equal_out_channels=False,
335
+ order='CNACNA', learn_shortcut=False, output_scale=1,
336
+ blur=True, resample_first=True, border_free=False):
337
+ super().__init__(in_channels, out_channels, kernel_size, stride,
338
+ padding, dilation, groups, bias, padding_mode,
339
+ weight_norm_type, weight_norm_params,
340
+ activation_norm_type, activation_norm_params,
341
+ skip_activation_norm, skip_nonlinearity, nonlinearity,
342
+ inplace_nonlinearity, apply_noise,
343
+ hidden_channels_equal_out_channels, order, Conv2dBlock,
344
+ learn_shortcut, output_scale, blur=blur,
345
+ resample_first=resample_first, border_free=border_free,
346
+ skip_weight_norm=skip_weight_norm)
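A minimal usage sketch for the DeepRes2dBlock above, assuming the bundled imaginaire package (including its third_party upfirdn2d ops) is importable; channel counts and shapes are illustrative:

import torch
from imaginaire.layers.residual_deep import DeepRes2dBlock

# Bottleneck residual block: 1x1 -> 3x3 -> 3x3 -> 1x1 on a hidden width of
# in_channels // 4, with an identity shortcut (in_channels == out_channels).
block = DeepRes2dBlock(256, 256, kernel_size=3,
                       weight_norm_type='spectral',
                       activation_norm_type='instance')
x = torch.randn(2, 256, 64, 64)
y = block(x)            # spatial size preserved (stride=1, no resampling)
print(y.shape)          # torch.Size([2, 256, 64, 64])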
imaginaire/layers/vit.py ADDED
@@ -0,0 +1,204 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ from types import SimpleNamespace
6
+
7
+ import torch
8
+ from torch import nn
9
+
10
+ from .misc import ApplyNoise
11
+ from imaginaire.third_party.upfirdn2d.upfirdn2d import Blur
12
+
13
+
14
+ class ViT2dBlock(nn.Module):
15
+ r"""An abstract wrapper class that wraps a torch convolution or linear layer
16
+ with normalization and nonlinearity.
17
+ """
18
+
19
+ def __init__(self, in_channels, out_channels, kernel_size, stride,
20
+ padding, dilation, groups, bias, padding_mode,
21
+ weight_norm_type, weight_norm_params,
22
+ activation_norm_type, activation_norm_params,
23
+ nonlinearity, inplace_nonlinearity,
24
+ apply_noise, blur, order, input_dim, clamp,
25
+ blur_kernel=(1, 3, 3, 1), output_scale=None,
26
+ init_gain=1.0):
27
+ super().__init__()
28
+ from .nonlinearity import get_nonlinearity_layer
29
+ from .weight_norm import get_weight_norm_layer
30
+ from .activation_norm import get_activation_norm_layer
31
+ self.weight_norm_type = weight_norm_type
32
+ self.stride = stride
33
+ self.clamp = clamp
34
+ self.init_gain = init_gain
35
+
36
+ # Nonlinearity layer.
37
+ if 'fused' in nonlinearity:
38
+ # Fusing nonlinearity with bias.
39
+ lr_mul = getattr(weight_norm_params, 'lr_mul', 1)
40
+ conv_before_nonlinearity = order.find('C') < order.find('A')
41
+ if conv_before_nonlinearity:
42
+ assert bias
43
+ bias = False
44
+ channel = out_channels if conv_before_nonlinearity else in_channels
45
+ nonlinearity_layer = get_nonlinearity_layer(
46
+ nonlinearity, inplace=inplace_nonlinearity,
47
+ num_channels=channel, lr_mul=lr_mul)
48
+ else:
49
+ nonlinearity_layer = get_nonlinearity_layer(
50
+ nonlinearity, inplace=inplace_nonlinearity)
51
+
52
+ # Noise injection layer.
53
+ if apply_noise:
54
+ order = order.replace('C', 'CG')
55
+ noise_layer = ApplyNoise()
56
+ else:
57
+ noise_layer = None
58
+
59
+ # Convolutional layer.
60
+ if blur:
61
+ if stride == 2:
62
+ # Blur - Conv - Noise - Activate
63
+ p = (len(blur_kernel) - 2) + (kernel_size - 1)
64
+ pad0, pad1 = (p + 1) // 2, p // 2
65
+ padding = 0
66
+ blur_layer = Blur(
67
+ blur_kernel, pad=(pad0, pad1), padding_mode=padding_mode
68
+ )
69
+ order = order.replace('C', 'BC')
70
+ elif stride == 0.5:
71
+ # Conv - Blur - Noise - Activate
72
+ padding = 0
73
+ p = (len(blur_kernel) - 2) - (kernel_size - 1)
74
+ pad0, pad1 = (p + 1) // 2 + 1, p // 2 + 1
75
+ blur_layer = Blur(
76
+ blur_kernel, pad=(pad0, pad1), padding_mode=padding_mode
77
+ )
78
+ order = order.replace('C', 'CB')
79
+ elif stride == 1:
80
+ # No blur for now
81
+ blur_layer = nn.Identity()
82
+ else:
83
+ raise NotImplementedError
84
+ else:
85
+ blur_layer = nn.Identity()
86
+
87
+ if weight_norm_params is None:
88
+ weight_norm_params = SimpleNamespace()
89
+ weight_norm = get_weight_norm_layer(
90
+ weight_norm_type, **vars(weight_norm_params))
91
+ conv_layer = weight_norm(self._get_conv_layer(
92
+ in_channels, out_channels, kernel_size, stride, padding, dilation,
93
+ groups, bias, padding_mode, input_dim))
94
+
95
+ # Normalization layer.
96
+ conv_before_norm = order.find('C') < order.find('N')
97
+ norm_channels = out_channels if conv_before_norm else in_channels
98
+ if activation_norm_params is None:
99
+ activation_norm_params = SimpleNamespace()
100
+ activation_norm_layer = get_activation_norm_layer(
101
+ norm_channels,
102
+ activation_norm_type,
103
+ input_dim,
104
+ **vars(activation_norm_params))
105
+
106
+ # Mapping from operation names to layers.
107
+ mappings = {'C': {'conv': conv_layer},
108
+ 'N': {'norm': activation_norm_layer},
109
+ 'A': {'nonlinearity': nonlinearity_layer}}
110
+ mappings.update({'B': {'blur': blur_layer}})
111
+ mappings.update({'G': {'noise': noise_layer}})
112
+
113
+ # All layers in order.
114
+ self.layers = nn.ModuleDict()
115
+ for op in order:
116
+ if list(mappings[op].values())[0] is not None:
117
+ self.layers.update(mappings[op])
118
+
119
+ # Whether this block expects conditional inputs.
120
+ self.conditional = \
121
+ getattr(conv_layer, 'conditional', False) or \
122
+ getattr(activation_norm_layer, 'conditional', False)
123
+
124
+ if output_scale is not None:
125
+ self.output_scale = nn.Parameter(torch.tensor(output_scale))
126
+ else:
127
+ self.register_parameter("output_scale", None)
128
+
129
+ def forward(self, x, *cond_inputs, **kw_cond_inputs):
130
+ r"""
131
+
132
+ Args:
133
+ x (tensor): Input tensor.
134
+ cond_inputs (list of tensors) : Conditional input tensors.
135
+ kw_cond_inputs (dict) : Keyword conditional inputs.
136
+ """
137
+ for key, layer in self.layers.items():
138
+ if getattr(layer, 'conditional', False):
139
+ # Layers that require conditional inputs.
140
+ x = layer(x, *cond_inputs, **kw_cond_inputs)
141
+ else:
142
+ x = layer(x)
143
+ if self.clamp is not None and isinstance(layer, nn.Conv2d):
144
+ x.clamp_(max=self.clamp)
145
+ if key == 'conv':
146
+ if self.output_scale is not None:
147
+ x = x * self.output_scale
148
+ return x
149
+
150
+ def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
151
+ padding, dilation, groups, bias, padding_mode,
152
+ input_dim):
153
+ # Returns the convolutional layer.
154
+ if input_dim == 0:
155
+ layer = nn.Linear(in_channels, out_channels, bias)
156
+ else:
157
+ if stride < 1: # Fractionally-strided convolution.
158
+ padding_mode = 'zeros'
159
+ assert padding == 0
160
+ layer_type = getattr(nn, f'ConvTranspose{input_dim}d')
161
+ stride = round(1 / stride)
162
+ else:
163
+ layer_type = getattr(nn, f'Conv{input_dim}d')
164
+ layer = layer_type(
165
+ in_channels, out_channels, kernel_size, stride, padding,
166
+ dilation=dilation, groups=groups, bias=bias,
167
+ padding_mode=padding_mode
168
+ )
169
+
170
+ return layer
171
+
172
+ def __repr__(self):
173
+ main_str = self._get_name() + '('
174
+ child_lines = []
175
+ for name, layer in self.layers.items():
176
+ mod_str = repr(layer)
177
+ if name == 'conv' and self.weight_norm_type != 'none' and \
178
+ self.weight_norm_type != '':
179
+ mod_str = mod_str[:-1] + \
180
+ ', weight_norm={}'.format(self.weight_norm_type) + ')'
181
+ if name == 'conv' and getattr(layer, 'base_lr_mul', 1) != 1:
182
+ mod_str = mod_str[:-1] + \
183
+ ', lr_mul={}'.format(layer.base_lr_mul) + ')'
184
+ mod_str = self._addindent(mod_str, 2)
185
+ child_lines.append(mod_str)
186
+ if len(child_lines) == 1:
187
+ main_str += child_lines[0]
188
+ else:
189
+ main_str += '\n ' + '\n '.join(child_lines) + '\n'
190
+
191
+ main_str += ')'
192
+ return main_str
193
+
194
+ @staticmethod
195
+ def _addindent(s_, numSpaces):
196
+ s = s_.split('\n')
197
+ # don't do anything for single-line stuff
198
+ if len(s) == 1:
199
+ return s_
200
+ first = s.pop(0)
201
+ s = [(numSpaces * ' ') + line for line in s]
202
+ s = '\n'.join(s)
203
+ s = first + '\n' + s
204
+ return s
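A usage sketch for ViT2dBlock, again assuming the in-repo imports are available; the argument values are illustrative, and the ``'CNA'`` order string expands to convolution, then normalization, then activation:

import torch
from imaginaire.layers.vit import ViT2dBlock

# Conv -> (no norm) -> ReLU, assembled from the order string 'CNA'.
block = ViT2dBlock(
    in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1,
    dilation=1, groups=1, bias=True, padding_mode='zeros',
    weight_norm_type='none', weight_norm_params=None,
    activation_norm_type='none', activation_norm_params=None,
    nonlinearity='relu', inplace_nonlinearity=False,
    apply_noise=False, blur=False, order='CNA', input_dim=2, clamp=None)
y = block(torch.randn(2, 64, 32, 32))   # -> (2, 128, 32, 32)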
imaginaire/layers/weight_norm.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import collections
6
+ import functools
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn.utils import spectral_norm, weight_norm
11
+ from torch.nn.utils.spectral_norm import SpectralNorm, \
12
+ SpectralNormStateDictHook, SpectralNormLoadStateDictPreHook
13
+
14
+ from .conv import LinearBlock
15
+
16
+
17
+ class WeightDemodulation(nn.Module):
18
+ r"""Weight demodulation in
19
+ "Analyzing and Improving the Image Quality of StyleGAN", Karras et al.
20
+
21
+ Args:
22
+ conv (torch.nn.Modules): Convolutional layer.
23
+ cond_dims (int): The number of channels in the conditional input.
24
+ eps (float, optional, default=1e-8): a value added to the
25
+ denominator for numerical stability.
26
+ adaptive_bias (bool, optional, default=False): If ``True``, adaptively
27
+ predicts bias from the conditional input.
28
+ demod (bool, optional, default=False): If ``True``, performs
29
+ weight demodulation.
30
+ """
31
+
32
+ def __init__(self, conv, cond_dims, eps=1e-8,
33
+ adaptive_bias=False, demod=True):
34
+ super().__init__()
35
+ self.conv = conv
36
+ self.adaptive_bias = adaptive_bias
37
+ if adaptive_bias:
38
+ self.conv.register_parameter('bias', None)
39
+ self.fc_beta = LinearBlock(cond_dims, self.conv.out_channels)
40
+ self.fc_gamma = LinearBlock(cond_dims, self.conv.in_channels)
41
+ self.eps = eps
42
+ self.demod = demod
43
+ self.conditional = True
44
+
45
+ def forward(self, x, y, **_kwargs):
46
+ r"""Weight demodulation forward"""
47
+ b, c, h, w = x.size()
48
+ self.conv.groups = b
49
+ gamma = self.fc_gamma(y)
50
+ gamma = gamma[:, None, :, None, None]
51
+ weight = self.conv.weight[None, :, :, :, :] * gamma
52
+
53
+ if self.demod:
54
+ d = torch.rsqrt(
55
+ (weight ** 2).sum(
56
+ dim=(2, 3, 4), keepdim=True) + self.eps)
57
+ weight = weight * d
58
+
59
+ x = x.reshape(1, -1, h, w)
60
+ _, _, *ws = weight.shape
61
+ weight = weight.reshape(b * self.conv.out_channels, *ws)
62
+ x = self.conv._conv_forward(x, weight)
63
+
64
+ x = x.reshape(-1, self.conv.out_channels, h, w)
65
+ if self.adaptive_bias:
66
+ x += self.fc_beta(y)[:, :, None, None]
67
+ return x
68
+
69
+
70
+ def weight_demod(
71
+ conv, cond_dims=256, eps=1e-8, adaptive_bias=False, demod=True):
72
+ r"""Weight demodulation."""
73
+ return WeightDemodulation(conv, cond_dims, eps, adaptive_bias, demod)
74
+
75
+
76
+ class ScaledLR(object):
77
+ def __init__(self, weight_name, bias_name):
78
+ self.weight_name = weight_name
79
+ self.bias_name = bias_name
80
+
81
+ def compute_weight(self, module):
82
+ weight = getattr(module, self.weight_name + '_ori')
83
+ return weight * module.weight_scale
84
+
85
+ def compute_bias(self, module):
86
+ bias = getattr(module, self.bias_name + '_ori')
87
+ if bias is not None:
88
+ return bias * module.bias_scale
89
+ else:
90
+ return None
91
+
92
+ @staticmethod
93
+ def apply(module, weight_name, bias_name, lr_mul, equalized):
94
+ assert weight_name == 'weight'
95
+ assert bias_name == 'bias'
96
+ fn = ScaledLR(weight_name, bias_name)
97
+ module.register_forward_pre_hook(fn)
98
+
99
+ if hasattr(module, bias_name):
100
+ # module.bias is a parameter (can be None).
101
+ bias = getattr(module, bias_name)
102
+ delattr(module, bias_name)
103
+ module.register_parameter(bias_name + '_ori', bias)
104
+ else:
105
+ # module.bias does not exist.
106
+ bias = None
107
+ setattr(module, bias_name + '_ori', bias)
108
+ if bias is not None:
109
+ setattr(module, bias_name, bias.data)
110
+ else:
111
+ setattr(module, bias_name, None)
112
+ module.register_buffer('bias_scale', torch.tensor(lr_mul))
113
+
114
+ if hasattr(module, weight_name + '_orig'):
115
+ # The module has been wrapped with spectral normalization.
116
+ # We only want to keep a single weight parameter.
117
+ weight = getattr(module, weight_name + '_orig')
118
+ delattr(module, weight_name + '_orig')
119
+ module.register_parameter(weight_name + '_ori', weight)
120
+ setattr(module, weight_name + '_orig', weight.data)
121
+ # Put this hook before the spectral norm hook.
122
+ module._forward_pre_hooks = collections.OrderedDict(
123
+ reversed(list(module._forward_pre_hooks.items()))
124
+ )
125
+ module.use_sn = True
126
+ else:
127
+ weight = getattr(module, weight_name)
128
+ delattr(module, weight_name)
129
+ module.register_parameter(weight_name + '_ori', weight)
130
+ setattr(module, weight_name, weight.data)
131
+ module.use_sn = False
132
+
133
+ # assert weight.dim() == 4 or weight.dim() == 2
134
+ if equalized:
135
+ fan_in = weight.data.size(1) * weight.data[0][0].numel()
136
+ # Theoretically, the gain should be sqrt(2) instead of 1.
137
+ # The official StyleGAN2 uses 1 for some reason.
138
+ module.register_buffer(
139
+ 'weight_scale', torch.tensor(lr_mul * ((1 / fan_in) ** 0.5))
140
+ )
141
+ else:
142
+ module.register_buffer('weight_scale', torch.tensor(lr_mul))
143
+
144
+ module.lr_mul = module.weight_scale
145
+ module.base_lr_mul = lr_mul
146
+
147
+ return fn
148
+
149
+ def remove(self, module):
150
+ with torch.no_grad():
151
+ weight = self.compute_weight(module)
152
+ delattr(module, self.weight_name + '_ori')
153
+
154
+ if module.use_sn:
155
+ setattr(module, self.weight_name + '_orig', weight.detach())
156
+ else:
157
+ delattr(module, self.weight_name)
158
+ module.register_parameter(self.weight_name,
159
+ torch.nn.Parameter(weight.detach()))
160
+
161
+ with torch.no_grad():
162
+ bias = self.compute_bias(module)
163
+ delattr(module, self.bias_name)
164
+ delattr(module, self.bias_name + '_ori')
165
+ if bias is not None:
166
+ module.register_parameter(self.bias_name,
167
+ torch.nn.Parameter(bias.detach()))
168
+ else:
169
+ module.register_parameter(self.bias_name, None)
170
+
171
+ module.lr_mul = 1.0
172
+ module.base_lr_mul = 1.0
173
+
174
+ def __call__(self, module, input):
175
+ weight = self.compute_weight(module)
176
+ if module.use_sn:
177
+ # The following spectral norm hook will compute the SN of
178
+ # "module.weight_orig" and store the normalized weight in
179
+ # "module.weight".
180
+ setattr(module, self.weight_name + '_orig', weight)
181
+ else:
182
+ setattr(module, self.weight_name, weight)
183
+ bias = self.compute_bias(module)
184
+ setattr(module, self.bias_name, bias)
185
+
186
+
187
+ def remove_weight_norms(module, weight_name='weight', bias_name='bias'):
188
+ if hasattr(module, 'weight_ori') or hasattr(module, 'weight_orig'):
189
+ for k in list(module._forward_pre_hooks.keys()):
190
+ hook = module._forward_pre_hooks[k]
191
+ if (isinstance(hook, ScaledLR) or isinstance(hook, SpectralNorm)):
192
+ hook.remove(module)
193
+ del module._forward_pre_hooks[k]
194
+
195
+ for k, hook in module._state_dict_hooks.items():
196
+ if isinstance(hook, SpectralNormStateDictHook) and \
197
+ hook.fn.name == weight_name:
198
+ del module._state_dict_hooks[k]
199
+ break
200
+
201
+ for k, hook in module._load_state_dict_pre_hooks.items():
202
+ if isinstance(hook, SpectralNormLoadStateDictPreHook) and \
203
+ hook.fn.name == weight_name:
204
+ del module._load_state_dict_pre_hooks[k]
205
+ break
206
+
207
+ return module
208
+
209
+
210
+ def remove_equalized_lr(module, weight_name='weight', bias_name='bias'):
211
+ for k, hook in module._forward_pre_hooks.items():
212
+ if isinstance(hook, ScaledLR) and hook.weight_name == weight_name:
213
+ hook.remove(module)
214
+ del module._forward_pre_hooks[k]
215
+ break
216
+ else:
217
+ raise ValueError("Equalized learning rate not found")
218
+
219
+ return module
220
+
221
+
222
+ def scaled_lr(
223
+ module, weight_name='weight', bias_name='bias', lr_mul=1.,
224
+ equalized=False,
225
+ ):
226
+ ScaledLR.apply(module, weight_name, bias_name, lr_mul, equalized)
227
+ return module
228
+
229
+
230
+ def get_weight_norm_layer(norm_type, **norm_params):
231
+ r"""Return weight normalization.
232
+
233
+ Args:
234
+ norm_type (str):
235
+ Type of weight normalization.
236
+ ``'none'``, ``'spectral'``, ``'weight'``
237
+ or ``'weight_demod'``.
238
+ norm_params: Arbitrary keyword arguments that will be used to
239
+ initialize the weight normalization.
240
+ """
241
+ if norm_type == 'none' or norm_type == '': # no normalization
242
+ return lambda x: x
243
+ elif norm_type == 'spectral': # spectral normalization
244
+ return functools.partial(spectral_norm, **norm_params)
245
+ elif norm_type == 'weight': # weight normalization
246
+ return functools.partial(weight_norm, **norm_params)
247
+ elif norm_type == 'weight_demod': # weight demodulation
248
+ return functools.partial(weight_demod, **norm_params)
249
+ elif norm_type == 'equalized_lr': # equalized learning rate
250
+ return functools.partial(scaled_lr, equalized=True, **norm_params)
251
+ elif norm_type == 'scaled_lr':  # scaled learning rate
252
+ return functools.partial(scaled_lr, **norm_params)
253
+ elif norm_type == 'equalized_lr_spectral':
254
+ lr_mul = norm_params.pop('lr_mul', 1.0)
255
+ return lambda x: functools.partial(
256
+ scaled_lr, equalized=True, lr_mul=lr_mul)(
257
+ functools.partial(spectral_norm, **norm_params)(x)
258
+ )
259
+ elif norm_type == 'scaled_lr_spectral':
260
+ lr_mul = norm_params.pop('lr_mul', 1.0)
261
+ return lambda x: functools.partial(
262
+ scaled_lr, lr_mul=lr_mul)(
263
+ functools.partial(spectral_norm, **norm_params)(x)
264
+ )
265
+ else:
266
+ raise ValueError(
267
+ 'Weight norm layer %s is not recognized' % norm_type)
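A sketch of how get_weight_norm_layer can be used as a factory over plain PyTorch modules; the module sizes and lr_mul value are illustrative:

import torch
from torch import nn
from imaginaire.layers.weight_norm import get_weight_norm_layer

# Spectral normalization via the factory.
sn = get_weight_norm_layer('spectral')
conv = sn(nn.Conv2d(3, 64, 3, padding=1))

# StyleGAN-style equalized learning rate with a 0.01 multiplier: the stored
# weight is rescaled by lr_mul / sqrt(fan_in) inside a forward pre-hook.
eq = get_weight_norm_layer('equalized_lr', lr_mul=0.01)
fc = eq(nn.Linear(512, 512))
out = fc(torch.randn(4, 512))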
imaginaire/losses/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ from .gan import GANLoss
6
+ from .perceptual import PerceptualLoss
7
+ from .feature_matching import FeatureMatchingLoss
8
+ from .kl import GaussianKLLoss
9
+
10
+ __all__ = ['GANLoss', 'PerceptualLoss', 'FeatureMatchingLoss', 'GaussianKLLoss',
11
+ 'MaskedL1Loss', 'FlowLoss', 'DictLoss',
12
+ 'WeightedMSELoss']
13
+
14
+ try:
15
+ from .gradient_penalty import GradientPenaltyLoss
16
+ __all__.extend(['GradientPenaltyLoss'])
17
+ except: # noqa
18
+ pass
imaginaire/losses/feature_matching.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch.nn as nn
6
+
7
+
8
+ class FeatureMatchingLoss(nn.Module):
9
+ r"""Compute feature matching loss"""
10
+ def __init__(self, criterion='l1'):
11
+ super(FeatureMatchingLoss, self).__init__()
12
+ if criterion == 'l1':
13
+ self.criterion = nn.L1Loss()
14
+ elif criterion == 'l2' or criterion == 'mse':
15
+ self.criterion = nn.MSELoss()
16
+ else:
17
+ raise ValueError('Criterion %s is not recognized' % criterion)
18
+
19
+ def forward(self, fake_features, real_features):
20
+ r"""Compute the feature matching loss between fake and real
21
+ discriminator features.
22
+
23
+ Args:
24
+ fake_features (list of lists): Discriminator features of fake images.
25
+ real_features (list of lists): Discriminator features of real images.
26
+
27
+ Returns:
28
+ (tensor): Loss value.
29
+ """
30
+ num_d = len(fake_features)
31
+ dis_weight = 1.0 / num_d
32
+ loss = fake_features[0][0].new_tensor(0)
33
+ for i in range(num_d):
34
+ for j in range(len(fake_features[i])):
35
+ tmp_loss = self.criterion(fake_features[i][j],
36
+ real_features[i][j].detach())
37
+ loss += dis_weight * tmp_loss
38
+ return loss
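A small sketch of the expected input structure (a list of per-discriminator feature lists), with random tensors standing in for discriminator activations:

import torch
from imaginaire.losses.feature_matching import FeatureMatchingLoss

criterion = FeatureMatchingLoss(criterion='l1')
# Two discriminator scales, each exposing three intermediate feature maps.
fake_feats = [[torch.randn(4, c, 32, 32) for c in (64, 128, 256)] for _ in range(2)]
real_feats = [[torch.randn(4, c, 32, 32) for c in (64, 128, 256)] for _ in range(2)]
loss = criterion(fake_feats, real_feats)   # scalar tensor, averaged over discriminators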
imaginaire/losses/gan.py ADDED
@@ -0,0 +1,173 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import math
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from imaginaire.utils.distributed import master_only_print as print
11
+
12
+
13
+ @torch.jit.script
14
+ def fuse_math_min_mean_pos(x):
15
+ r"""Fuse operation min mean for hinge loss computation of positive
16
+ samples"""
17
+ minval = torch.min(x - 1, x * 0)
18
+ loss = -torch.mean(minval)
19
+ return loss
20
+
21
+
22
+ @torch.jit.script
23
+ def fuse_math_min_mean_neg(x):
24
+ r"""Fuse operation min mean for hinge loss computation of negative
25
+ samples"""
26
+ minval = torch.min(-x - 1, x * 0)
27
+ loss = -torch.mean(minval)
28
+ return loss
29
+
30
+
31
+ class GANLoss(nn.Module):
32
+ r"""GAN loss constructor.
33
+
34
+ Args:
35
+ gan_mode (str): Type of GAN loss. ``'hinge'``, ``'least_square'``,
36
+ ``'non_saturated'``, ``'wasserstein'``.
37
+ target_real_label (float): The desired output label for real images.
38
+ target_fake_label (float): The desired output label for fake images.
39
+ decay_k (float): The decay factor per epoch for top-k training.
40
+ min_k (float): The minimum percentage of samples to select.
41
+ separate_topk (bool): If ``True``, selects top-k for each sample
42
+ separately, otherwise selects top-k among all samples.
43
+ """
44
+ def __init__(self, gan_mode, target_real_label=1.0, target_fake_label=0.0,
45
+ decay_k=1., min_k=1., separate_topk=False):
46
+ super(GANLoss, self).__init__()
47
+ self.real_label = target_real_label
48
+ self.fake_label = target_fake_label
49
+ self.real_label_tensor = None
50
+ self.fake_label_tensor = None
51
+ self.gan_mode = gan_mode
52
+ self.decay_k = decay_k
53
+ self.min_k = min_k
54
+ self.separate_topk = separate_topk
55
+ self.register_buffer('k', torch.tensor(1.0))
56
+ print('GAN mode: %s' % gan_mode)
57
+
58
+ def forward(self, dis_output, t_real, dis_update=True, reduce=True):
59
+ r"""GAN loss computation.
60
+
61
+ Args:
62
+ dis_output (tensor or list of tensors): Discriminator outputs.
63
+ t_real (bool): If ``True``, uses the real label as target, otherwise uses the fake label as target.
64
+ dis_update (bool): If ``True``, the loss will be used to update the discriminator, otherwise the generator.
65
+ reduce (bool): If ``True``, when a list of discriminator outputs are provided, it will return the average
66
+ of all losses, otherwise it will return a list of losses.
67
+ Returns:
68
+ loss (tensor): Loss value.
69
+ """
70
+ if isinstance(dis_output, list):
71
+ # For multi-scale discriminators.
72
+ # In this implementation, the loss is first averaged for each scale
73
+ # (batch size and number of locations) then averaged across scales,
74
+ # so that the gradient is not dominated by the discriminator that
75
+ # has the most output values (highest resolution).
76
+ losses = []
77
+ for dis_output_i in dis_output:
78
+ assert isinstance(dis_output_i, torch.Tensor)
79
+ losses.append(self.loss(dis_output_i, t_real, dis_update))
80
+ if reduce:
81
+ return torch.mean(torch.stack(losses))
82
+ else:
83
+ return losses
84
+ else:
85
+ return self.loss(dis_output, t_real, dis_update)
86
+
87
+ def loss(self, dis_output, t_real, dis_update=True):
88
+ r"""GAN loss computation.
89
+
90
+ Args:
91
+ dis_output (tensor): Discriminator outputs.
92
+ t_real (bool): If ``True``, uses the real label as target, otherwise
93
+ uses the fake label as target.
94
+ dis_update (bool): Updating the discriminator or the generator.
95
+ Returns:
96
+ loss (tensor): Loss value.
97
+ """
98
+ if not dis_update:
99
+ assert t_real, \
100
+ "The target should be real when updating the generator."
101
+
102
+ if not dis_update and self.k < 1:
103
+ r"""
104
+ Use top-k training:
105
+ "Top-k Training of GANs: Improving GAN Performance by Throwing
106
+ Away Bad Samples"
107
+ Here, each sample may have multiple discriminator output values
108
+ (patch discriminator). We could either select top-k for each sample
109
+ separately (when ``self.separate_topk=True``), or collect values
110
+ from all samples and then select top-k (default, when
111
+ ``self.separate_topk=False``).
112
+ """
113
+ if self.separate_topk:
114
+ dis_output = dis_output.view(dis_output.size(0), -1)
115
+ else:
116
+ dis_output = dis_output.view(-1)
117
+ k = math.ceil(self.k * dis_output.size(-1))
118
+ dis_output, _ = torch.topk(dis_output, k)
119
+
120
+ if self.gan_mode == 'non_saturated':
121
+ target_tensor = self.get_target_tensor(dis_output, t_real)
122
+ loss = F.binary_cross_entropy_with_logits(dis_output,
123
+ target_tensor)
124
+ elif self.gan_mode == 'least_square':
125
+ target_tensor = self.get_target_tensor(dis_output, t_real)
126
+ loss = 0.5 * F.mse_loss(dis_output, target_tensor)
127
+ elif self.gan_mode == 'hinge':
128
+ if dis_update:
129
+ if t_real:
130
+ loss = fuse_math_min_mean_pos(dis_output)
131
+ else:
132
+ loss = fuse_math_min_mean_neg(dis_output)
133
+ else:
134
+ loss = -torch.mean(dis_output)
135
+ elif self.gan_mode == 'wasserstein':
136
+ if t_real:
137
+ loss = -torch.mean(dis_output)
138
+ else:
139
+ loss = torch.mean(dis_output)
140
+ elif self.gan_mode == 'softplus':
141
+ target_tensor = self.get_target_tensor(dis_output, t_real)
142
+ loss = F.binary_cross_entropy_with_logits(dis_output,
143
+ target_tensor)
144
+ else:
145
+ raise ValueError('Unexpected gan_mode {}'.format(self.gan_mode))
146
+ return loss
147
+
148
+ def get_target_tensor(self, dis_output, t_real):
149
+ r"""Return the target vector for the binary cross entropy loss
150
+ computation.
151
+
152
+ Args:
153
+ dis_output (tensor): Discriminator outputs.
154
+ t_real (bool): If ``True``, uses the real label as target, otherwise
155
+ uses the fake label as target.
156
+ Returns:
157
+ target (tensor): Target tensor vector.
158
+ """
159
+ if t_real:
160
+ if self.real_label_tensor is None:
161
+ self.real_label_tensor = dis_output.new_tensor(self.real_label)
162
+ return self.real_label_tensor.expand_as(dis_output)
163
+ else:
164
+ if self.fake_label_tensor is None:
165
+ self.fake_label_tensor = dis_output.new_tensor(self.fake_label)
166
+ return self.fake_label_tensor.expand_as(dis_output)
167
+
168
+ def topk_anneal(self):
169
+ r"""Anneal k after each epoch."""
170
+ if self.decay_k < 1:
171
+ # noinspection PyAttributeOutsideInit
172
+ self.k.fill_(max(self.decay_k * self.k, self.min_k))
173
+ print("Top-k training: update k to {}.".format(self.k))
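A sketch of the hinge-mode usage for both discriminator and generator updates, with random logits standing in for a patch discriminator's output:

import torch
from imaginaire.losses.gan import GANLoss

gan_loss = GANLoss('hinge')
dis_out_real = torch.randn(8, 1, 30, 30)   # patch logits on real images
dis_out_fake = torch.randn(8, 1, 30, 30)   # patch logits on fake images

# Discriminator step: push real logits above +1 and fake logits below -1.
d_loss = gan_loss(dis_out_real, t_real=True, dis_update=True) + \
         gan_loss(dis_out_fake, t_real=False, dis_update=True)

# Generator step: raise the discriminator score on fakes (target must be real).
g_loss = gan_loss(dis_out_fake, t_real=True, dis_update=False)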
imaginaire/losses/info_nce.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import math
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torch.distributed as dist
10
+
11
+ from imaginaire.utils.distributed import get_world_size, get_rank, \
12
+ dist_all_reduce_tensor
13
+
14
+
15
+ class GatherLayer(torch.autograd.Function):
16
+ @staticmethod
17
+ def forward(ctx, input):
18
+ ctx.save_for_backward(input)
19
+ output = [torch.zeros_like(input) for _ in range(dist.get_world_size())]
20
+ dist.all_gather(output, input)
21
+ return tuple(output)
22
+
23
+ @staticmethod
24
+ def backward(ctx, *grads):
25
+ input, = ctx.saved_tensors
26
+ grad_out = torch.zeros_like(input)
27
+ all_grads = torch.stack(grads)
28
+ all_grads = dist_all_reduce_tensor(all_grads, reduce='sum')
29
+ grad_out[:] = all_grads[get_rank()]
30
+ return grad_out
31
+
32
+
33
+ class InfoNCELoss(nn.Module):
34
+ def __init__(self,
35
+ temperature=0.07,
36
+ gather_distributed=True,
37
+ learn_temperature=True,
38
+ single_direction=False,
39
+ flatten=True):
40
+ super(InfoNCELoss, self).__init__()
41
+ self.logit_scale = nn.Parameter(torch.tensor([math.log(1/temperature)]))
42
+ self.logit_scale.requires_grad = learn_temperature
43
+ self.gather_distributed = gather_distributed
44
+ self.single_direction = single_direction
45
+ self.flatten = flatten
46
+
47
+ def forward(self, features_a, features_b, gather_distributed=None, eps=1e-8):
48
+ if gather_distributed is None:
49
+ gather_distributed = self.gather_distributed
50
+
51
+ if features_a is None or features_b is None:
52
+ return torch.tensor(0, device='cuda'), torch.tensor(0, device='cuda')
53
+
54
+ bs_a, bs_b = features_a.size(0), features_b.size(0)
55
+ if self.flatten:
56
+ features_a, features_b = features_a.reshape(bs_a, -1), features_b.reshape(bs_b, -1)
57
+ else:
58
+ features_a = features_a.reshape(bs_a, features_a.size(1), -1).mean(-1)
59
+ features_b = features_b.reshape(bs_b, features_b.size(1), -1).mean(-1)
60
+
61
+ # Temperature clipping.
62
+ self.logit_scale.data = torch.clamp(self.logit_scale.data, 0, 4.6052)
63
+
64
+ # normalized features
65
+ features_a = features_a / (features_a.norm(dim=1, keepdim=True) + eps)
66
+ features_b = features_b / (features_b.norm(dim=1, keepdim=True) + eps)
67
+
68
+ loss_a = self._forward_single_direction(features_a, features_b, gather_distributed)
69
+ if self.single_direction:
70
+ return loss_a
71
+ else:
72
+ loss_b = self._forward_single_direction(features_b, features_a, gather_distributed)
73
+ return loss_a + loss_b
74
+
75
+ def _forward_single_direction(
76
+ self, features_a, features_b, gather_distributed):
77
+ bs_a = features_a.shape[0]
78
+ logit_scale = self.logit_scale.exp()
79
+ if get_world_size() > 1 and gather_distributed:
80
+ gather_features_b = torch.cat(GatherLayer.apply(features_b))
81
+ gather_labels_a = torch.arange(bs_a, device='cuda') + get_rank() * bs_a
82
+ logits_a = logit_scale * features_a @ gather_features_b.t()
83
+ else:
84
+ gather_labels_a = torch.arange(bs_a, device='cuda')
85
+ logits_a = logit_scale * features_a @ features_b.t()
86
+ loss_a = F.cross_entropy(logits_a, gather_labels_a)
87
+ return loss_a
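A single-GPU sketch of InfoNCELoss; the class creates its target labels on CUDA, so a GPU is assumed, and the feature shapes are illustrative:

import torch
from imaginaire.losses.info_nce import InfoNCELoss

nce = InfoNCELoss(temperature=0.07, gather_distributed=False).cuda()
feat_a = torch.randn(16, 256, device='cuda')   # view A of 16 samples
feat_b = torch.randn(16, 256, device='cuda')   # view B of the same samples
loss = nce(feat_a, feat_b)   # symmetric InfoNCE: a->b plus b->a cross-entropy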
imaginaire/losses/kl.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
+ class GaussianKLLoss(nn.Module):
10
+ r"""Compute KL loss in VAE for Gaussian distributions"""
11
+ def __init__(self):
12
+ super(GaussianKLLoss, self).__init__()
13
+
14
+ def forward(self, mu, logvar=None):
15
+ r"""Compute loss
16
+
17
+ Args:
18
+ mu (tensor): mean
19
+ logvar (tensor): logarithm of variance
20
+ """
21
+ if logvar is None:
22
+ logvar = torch.zeros_like(mu)
23
+ return -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
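This is the closed-form KL divergence KL(N(mu, exp(logvar)) || N(0, I)) summed over all elements; a minimal call, with illustrative shapes, looks like:

import torch
from imaginaire.losses.kl import GaussianKLLoss

kl = GaussianKLLoss()
mu, logvar = torch.randn(4, 128), torch.randn(4, 128)
loss = kl(mu, logvar)   # scalar, summed over batch and latent dimensions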
imaginaire/losses/perceptual.py ADDED
@@ -0,0 +1,395 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchvision
8
+ from torch import nn, distributed as dist
9
+
10
+ from imaginaire.losses.info_nce import InfoNCELoss
11
+ from imaginaire.utils.distributed import master_only_print as print, \
12
+ is_local_master
13
+ from imaginaire.utils.misc import apply_imagenet_normalization, to_float
14
+
15
+
16
+ class PerceptualLoss(nn.Module):
17
+ r"""Perceptual loss initialization.
18
+
19
+ Args:
20
+ network (str) : The name of the loss network: 'vgg16' | 'vgg19' | 'alexnet' | 'inception_v3' | 'resnet50' | 'robust_resnet50' | 'vgg_face_dag'.
21
+ layers (str or list of str) : The layers used to compute the loss.
22
+ weights (float or list of float): The loss weights of each layer.
23
+ criterion (str): The type of distance function: 'l1' | 'l2'.
24
+ resize (bool) : If ``True``, resize the input images to 224x224.
25
+ resize_mode (str): Algorithm used for resizing.
26
+ num_scales (int): The loss will be evaluated at original size and
27
+ this many times downsampled sizes.
28
+ per_sample_weight (bool): Output loss for individual samples in the
29
+ batch instead of mean loss.
30
+ """
31
+
32
+ def __init__(self, network='vgg19', layers='relu_4_1', weights=None,
33
+ criterion='l1', resize=False, resize_mode='bilinear',
34
+ num_scales=1, per_sample_weight=False,
35
+ info_nce_temperature=0.07,
36
+ info_nce_gather_distributed=True,
37
+ info_nce_learn_temperature=True,
38
+ info_nce_flatten=True):
39
+ super().__init__()
40
+ if isinstance(layers, str):
41
+ layers = [layers]
42
+ if weights is None:
43
+ weights = [1.] * len(layers)
44
+ elif isinstance(weights, float) or isinstance(weights, int):
45
+ weights = [weights]
46
+
47
+ if dist.is_initialized() and not is_local_master():
48
+ # Make sure only the first process in distributed training downloads
49
+ # the model, and the others will use the cache
50
+ # noinspection PyUnresolvedReferences
51
+ torch.distributed.barrier()
52
+
53
+ assert len(layers) == len(weights), \
54
+ 'The number of layers (%s) must be equal to ' \
55
+ 'the number of weights (%s).' % (len(layers), len(weights))
56
+ if network == 'vgg19':
57
+ self.model = _vgg19(layers)
58
+ elif network == 'vgg16':
59
+ self.model = _vgg16(layers)
60
+ elif network == 'alexnet':
61
+ self.model = _alexnet(layers)
62
+ elif network == 'inception_v3':
63
+ self.model = _inception_v3(layers)
64
+ elif network == 'resnet50':
65
+ self.model = _resnet50(layers)
66
+ elif network == 'robust_resnet50':
67
+ self.model = _robust_resnet50(layers)
68
+ elif network == 'vgg_face_dag':
69
+ self.model = _vgg_face_dag(layers)
70
+ else:
71
+ raise ValueError('Network %s is not recognized' % network)
72
+
73
+ if dist.is_initialized() and is_local_master():
74
+ # Make sure only the first process in distributed training downloads
75
+ # the model, and the others will use the cache
76
+ # noinspection PyUnresolvedReferences
77
+ torch.distributed.barrier()
78
+
79
+ self.num_scales = num_scales
80
+ self.layers = layers
81
+ self.weights = weights
82
+ reduction = 'mean' if not per_sample_weight else 'none'
83
+ if criterion == 'l1':
84
+ self.criterion = nn.L1Loss(reduction=reduction)
85
+ elif criterion == 'l2' or criterion == 'mse':
86
+ self.criterion = nn.MSELoss(reduction=reduction)
87
+ elif criterion == 'info_nce':
88
+ self.criterion = InfoNCELoss(
89
+ temperature=info_nce_temperature,
90
+ gather_distributed=info_nce_gather_distributed,
91
+ learn_temperature=info_nce_learn_temperature,
92
+ flatten=info_nce_flatten,
93
+ single_direction=True
94
+ )
95
+ else:
96
+ raise ValueError('Criterion %s is not recognized' % criterion)
97
+ self.resize = resize
98
+ self.resize_mode = resize_mode
99
+ print('Perceptual loss:')
100
+ print('\tMode: {}'.format(network))
101
+
102
+ def forward(self, inp, target, per_sample_weights=None):
103
+ r"""Perceptual loss forward.
104
+
105
+ Args:
106
+ inp (4D tensor) : Input tensor.
107
+ target (4D tensor) : Ground truth tensor, same shape as the input.
108
+ per_sample_weights: If not ``None``, returns the loss for individual samples
109
+ in the batch instead of the mean loss.
110
+ Returns:
111
+ (scalar tensor) : The perceptual loss.
112
+ """
113
+ if not torch.is_autocast_enabled():
114
+ inp, target = to_float([inp, target])
115
+
116
+ # Perceptual loss should operate in eval mode by default.
117
+ self.model.eval()
118
+ inp, target = apply_imagenet_normalization(inp), apply_imagenet_normalization(target)
119
+ if self.resize:
120
+ inp = F.interpolate(inp, mode=self.resize_mode, size=(224, 224), align_corners=False)
121
+ target = F.interpolate(target, mode=self.resize_mode, size=(224, 224), align_corners=False)
122
+
123
+ # Evaluate perceptual loss at each scale.
124
+ loss = 0
125
+ for scale in range(self.num_scales):
126
+ input_features, target_features = self.model(inp), self.model(target)
127
+
128
+ for layer, weight in zip(self.layers, self.weights):
129
+ # Example per-layer VGG19 loss values after applying
130
+ # [0.03125, 0.0625, 0.125, 0.25, 1.0] weighting.
131
+ # relu_1_1, 0.014698
132
+ # relu_2_1, 0.085817
133
+ # relu_3_1, 0.349977
134
+ # relu_4_1, 0.544188
135
+ # relu_5_1, 0.906261
136
+ # print('%s, %f' % (
137
+ # layer,
138
+ # weight * self.criterion(
139
+ # input_features[layer],
140
+ # target_features[
141
+ # layer].detach()).item()))
142
+ l_tmp = self.criterion(input_features[layer], target_features[layer].detach())
143
+ if per_sample_weights is not None:
144
+ l_tmp = l_tmp.mean(1).mean(1).mean(1)
145
+ loss += weight * l_tmp
146
+ # Downsample the input and target.
147
+ if scale != self.num_scales - 1:
148
+ inp = F.interpolate(
149
+ inp, mode=self.resize_mode, scale_factor=0.5,
150
+ align_corners=False, recompute_scale_factor=True)
151
+ target = F.interpolate(
152
+ target, mode=self.resize_mode, scale_factor=0.5,
153
+ align_corners=False, recompute_scale_factor=True)
154
+
155
+ return loss.float()
156
+
157
+
158
+ class _PerceptualNetwork(nn.Module):
159
+ r"""The network that extracts features to compute the perceptual loss.
160
+
161
+ Args:
162
+ network (nn.Sequential) : The network that extracts features.
163
+ layer_name_mapping (dict) : The dictionary that
164
+ maps a layer's index to its name.
165
+ layers (list of str): The list of layer names that we are using.
166
+ """
167
+
168
+ def __init__(self, network, layer_name_mapping, layers):
169
+ super().__init__()
170
+ assert isinstance(network, nn.Sequential), \
171
+ 'The network needs to be of type "nn.Sequential".'
172
+ self.network = network
173
+ self.layer_name_mapping = layer_name_mapping
174
+ self.layers = layers
175
+ for param in self.parameters():
176
+ param.requires_grad = False
177
+
178
+ def forward(self, x):
179
+ r"""Extract perceptual features."""
180
+ output = {}
181
+ for i, layer in enumerate(self.network):
182
+ x = layer(x)
183
+ layer_name = self.layer_name_mapping.get(i, None)
184
+ if layer_name in self.layers:
185
+ # If the current layer is used by the perceptual loss.
186
+ output[layer_name] = x
187
+ return output
188
+
189
+
190
+ def _vgg19(layers):
191
+ r"""Get vgg19 layers"""
192
+ vgg = torchvision.models.vgg19(pretrained=True)
193
+ # network = vgg.features
194
+ network = torch.nn.Sequential(*(list(vgg.features) + [vgg.avgpool] + [nn.Flatten()] + list(vgg.classifier)))
195
+ layer_name_mapping = {1: 'relu_1_1',
196
+ 3: 'relu_1_2',
197
+ 6: 'relu_2_1',
198
+ 8: 'relu_2_2',
199
+ 11: 'relu_3_1',
200
+ 13: 'relu_3_2',
201
+ 15: 'relu_3_3',
202
+ 17: 'relu_3_4',
203
+ 20: 'relu_4_1',
204
+ 22: 'relu_4_2',
205
+ 24: 'relu_4_3',
206
+ 26: 'relu_4_4',
207
+ 29: 'relu_5_1',
208
+ 31: 'relu_5_2',
209
+ 33: 'relu_5_3',
210
+ 35: 'relu_5_4',
211
+ 36: 'pool_5',
212
+ 42: 'fc_2'}
213
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
214
+
215
+
216
+ def _vgg16(layers):
217
+ r"""Get vgg16 layers"""
218
+ network = torchvision.models.vgg16(pretrained=True).features
219
+ layer_name_mapping = {1: 'relu_1_1',
220
+ 3: 'relu_1_2',
221
+ 6: 'relu_2_1',
222
+ 8: 'relu_2_2',
223
+ 11: 'relu_3_1',
224
+ 13: 'relu_3_2',
225
+ 15: 'relu_3_3',
226
+ 18: 'relu_4_1',
227
+ 20: 'relu_4_2',
228
+ 22: 'relu_4_3',
229
+ 25: 'relu_5_1'}
230
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
231
+
232
+
233
+ def _alexnet(layers):
234
+ r"""Get alexnet layers"""
235
+ network = torchvision.models.alexnet(pretrained=True).features
236
+ layer_name_mapping = {0: 'conv_1',
237
+ 1: 'relu_1',
238
+ 3: 'conv_2',
239
+ 4: 'relu_2',
240
+ 6: 'conv_3',
241
+ 7: 'relu_3',
242
+ 8: 'conv_4',
243
+ 9: 'relu_4',
244
+ 10: 'conv_5',
245
+ 11: 'relu_5'}
246
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
247
+
248
+
249
+ def _inception_v3(layers):
250
+ r"""Get inception v3 layers"""
251
+ inception = torchvision.models.inception_v3(pretrained=True)
252
+ network = nn.Sequential(inception.Conv2d_1a_3x3,
253
+ inception.Conv2d_2a_3x3,
254
+ inception.Conv2d_2b_3x3,
255
+ nn.MaxPool2d(kernel_size=3, stride=2),
256
+ inception.Conv2d_3b_1x1,
257
+ inception.Conv2d_4a_3x3,
258
+ nn.MaxPool2d(kernel_size=3, stride=2),
259
+ inception.Mixed_5b,
260
+ inception.Mixed_5c,
261
+ inception.Mixed_5d,
262
+ inception.Mixed_6a,
263
+ inception.Mixed_6b,
264
+ inception.Mixed_6c,
265
+ inception.Mixed_6d,
266
+ inception.Mixed_6e,
267
+ inception.Mixed_7a,
268
+ inception.Mixed_7b,
269
+ inception.Mixed_7c,
270
+ nn.AdaptiveAvgPool2d(output_size=(1, 1)))
271
+ layer_name_mapping = {3: 'pool_1',
272
+ 6: 'pool_2',
273
+ 14: 'mixed_6e',
274
+ 18: 'pool_3'}
275
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
276
+
277
+
278
+ def _resnet50(layers):
279
+ r"""Get resnet50 layers"""
280
+ resnet50 = torchvision.models.resnet50(pretrained=True)
281
+ network = nn.Sequential(resnet50.conv1,
282
+ resnet50.bn1,
283
+ resnet50.relu,
284
+ resnet50.maxpool,
285
+ resnet50.layer1,
286
+ resnet50.layer2,
287
+ resnet50.layer3,
288
+ resnet50.layer4,
289
+ resnet50.avgpool)
290
+ layer_name_mapping = {4: 'layer_1',
291
+ 5: 'layer_2',
292
+ 6: 'layer_3',
293
+ 7: 'layer_4'}
294
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
295
+
296
+
297
+ def _robust_resnet50(layers):
298
+ r"""Get robust resnet50 layers"""
299
+ resnet50 = torchvision.models.resnet50(pretrained=False)
300
+ state_dict = torch.utils.model_zoo.load_url(
301
+ 'http://andrewilyas.com/ImageNet.pt')
302
+ new_state_dict = {}
303
+ for k, v in state_dict['model'].items():
304
+ if k.startswith('module.model.'):
305
+ new_state_dict[k[13:]] = v
306
+ resnet50.load_state_dict(new_state_dict)
307
+ network = nn.Sequential(resnet50.conv1,
308
+ resnet50.bn1,
309
+ resnet50.relu,
310
+ resnet50.maxpool,
311
+ resnet50.layer1,
312
+ resnet50.layer2,
313
+ resnet50.layer3,
314
+ resnet50.layer4,
315
+ resnet50.avgpool)
316
+ layer_name_mapping = {4: 'layer_1',
317
+ 5: 'layer_2',
318
+ 6: 'layer_3',
319
+ 7: 'layer_4'}
320
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
321
+
322
+
323
+ def _vgg_face_dag(layers):
324
+ network = torchvision.models.vgg16(num_classes=2622)
325
+ state_dict = torch.utils.model_zoo.load_url(
326
+ 'http://www.robots.ox.ac.uk/~albanie/models/pytorch-mcn/'
327
+ 'vgg_face_dag.pth')
328
+ feature_layer_name_mapping = {
329
+ 0: 'conv1_1',
330
+ 2: 'conv1_2',
331
+ 5: 'conv2_1',
332
+ 7: 'conv2_2',
333
+ 10: 'conv3_1',
334
+ 12: 'conv3_2',
335
+ 14: 'conv3_3',
336
+ 17: 'conv4_1',
337
+ 19: 'conv4_2',
338
+ 21: 'conv4_3',
339
+ 24: 'conv5_1',
340
+ 26: 'conv5_2',
341
+ 28: 'conv5_3'}
342
+ new_state_dict = {}
343
+ for k, v in feature_layer_name_mapping.items():
344
+ new_state_dict['features.' + str(k) + '.weight'] = \
345
+ state_dict[v + '.weight']
346
+ new_state_dict['features.' + str(k) + '.bias'] = \
347
+ state_dict[v + '.bias']
348
+
349
+ classifier_layer_name_mapping = {
350
+ 0: 'fc6',
351
+ 3: 'fc7',
352
+ 6: 'fc8'}
353
+ for k, v in classifier_layer_name_mapping.items():
354
+ new_state_dict['classifier.' + str(k) + '.weight'] = \
355
+ state_dict[v + '.weight']
356
+ new_state_dict['classifier.' + str(k) + '.bias'] = \
357
+ state_dict[v + '.bias']
358
+
359
+ network.load_state_dict(new_state_dict)
360
+
361
+ class Flatten(nn.Module):
362
+ def forward(self, x):
363
+ return x.view(x.shape[0], -1)
364
+
365
+ layer_name_mapping = {
366
+ 0: 'conv_1_1',
367
+ 1: 'relu_1_1',
368
+ 2: 'conv_1_2',
369
+ 5: 'conv_2_1', # 1/2
370
+ 6: 'relu_2_1',
371
+ 7: 'conv_2_2',
372
+ 10: 'conv_3_1', # 1/4
373
+ 11: 'relu_3_1',
374
+ 12: 'conv_3_2',
375
+ 14: 'conv_3_3',
376
+ 17: 'conv_4_1', # 1/8
377
+ 18: 'relu_4_1',
378
+ 19: 'conv_4_2',
379
+ 21: 'conv_4_3',
380
+ 24: 'conv_5_1', # 1/16
381
+ 25: 'relu_5_1',
382
+ 26: 'conv_5_2',
383
+ 28: 'conv_5_3',
384
+ 33: 'fc6',
385
+ 36: 'fc7',
386
+ 39: 'fc8'
387
+ }
388
+ seq_layers = []
389
+ for feature in network.features:
390
+ seq_layers += [feature]
391
+ seq_layers += [network.avgpool, Flatten()]
392
+ for classifier in network.classifier:
393
+ seq_layers += [classifier]
394
+ network = nn.Sequential(*seq_layers)
395
+ return _PerceptualNetwork(network, layer_name_mapping, layers)
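The builders above each wrap a torchvision backbone in _PerceptualNetwork, whose forward pass returns a dict mapping the requested layer names to activations. A minimal usage sketch, assuming the pretrained weights can be downloaded and that the requested names appear in the corresponding layer_name_mapping (the input range and shapes are illustrative, not enforced by the code above):

    import torch

    # Build a VGG-19 feature extractor that only exposes the requested layers.
    extractor = _vgg19(layers=['relu_3_1', 'relu_4_1']).eval()

    # Dummy batch of two 224x224 RGB images.
    images = torch.rand(2, 3, 224, 224)

    with torch.no_grad():
        features = extractor(images)  # dict: layer name -> activation tensor

    for name, feat in features.items():
        print(name, tuple(feat.shape))  # relu_3_1 -> (2, 256, 56, 56), relu_4_1 -> (2, 512, 28, 28)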
imaginaire/losses/weighted_mse.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
+ class WeightedMSELoss(nn.Module):
10
+ r"""Compute Weighted MSE loss"""
11
+ def __init__(self, reduction='mean'):
12
+ super(WeightedMSELoss, self).__init__()
13
+ self.reduction = reduction
14
+
15
+ def forward(self, input, target, weight):
16
+ r"""Return weighted MSE Loss.
17
+ Args:
18
+ input (tensor): Input (prediction) tensor.
19
+ target (tensor): Target tensor.
20
+ weight (tensor): Elementwise weight, broadcastable to the input.
21
+ Returns:
22
+ (tensor): Loss value.
23
+ """
24
+ if self.reduction == 'mean':
25
+ loss = torch.mean(weight * (input - target) ** 2)
26
+ else:
27
+ loss = torch.sum(weight * (input - target) ** 2)
28
+ return loss
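A short sketch of the intended call pattern for WeightedMSELoss; the shapes are illustrative only, and the weight tensor merely needs to broadcast against input and target:

    import torch

    criterion = WeightedMSELoss(reduction='mean')

    pred = torch.randn(4, 3, 64, 64)
    target = torch.randn(4, 3, 64, 64)

    # Per-pixel weights (broadcast over the channel dimension), e.g. to emphasise a region.
    weight = torch.ones(4, 1, 64, 64)
    weight[:, :, 16:48, 16:48] = 2.0

    loss = criterion(pred, target, weight)
    print(loss.item())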
imaginaire/model_utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
imaginaire/model_utils/gancraft/camctl.py ADDED
@@ -0,0 +1,679 @@
1
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # This work is made available under the Nvidia Source Code License-NC.
4
+ # To view a copy of this license, check out LICENSE.md
5
+ import numpy as np
6
+ import torch
7
+
8
+
9
+ class EvalCameraController:
10
+ def __init__(self, voxel, maxstep=128, pattern=0, cam_ang=73, smooth_decay_multiplier=1.0):
11
+ self.voxel = voxel
12
+ self.maxstep = maxstep
13
+ self.camera_poses = [] # ori, dir, up, f
14
+ circle = torch.linspace(0, 2*np.pi, steps=maxstep)
15
+ size = min(voxel.voxel_t.size(1), voxel.voxel_t.size(2)) / 2
16
+ # Shrink the circle a bit.
17
+ shift = size * 0.2
18
+ size = size * 0.8
19
+
20
+ if pattern == 0:
21
+ height_history = []
22
+ # Calculate smooth height.
23
+ for i in range(maxstep):
24
+ farpoint = torch.tensor([
25
+ 70,
26
+ torch.sin(circle[i])*size + voxel.voxel_t.size(1)/2 + shift,
27
+ torch.cos(circle[i])*size + voxel.voxel_t.size(2)/2 + shift])
28
+ height_history.append(self._get_height(farpoint[1], farpoint[2], farpoint[0]))
29
+
30
+ # Filtfilt
31
+ height_history = self.filtfilt(height_history, decay=0.2*smooth_decay_multiplier)
32
+
33
+ for i in range(maxstep):
34
+ farpoint = torch.tensor([
35
+ 70,
36
+ torch.sin(circle[i])*size + voxel.voxel_t.size(1)/2 + shift,
37
+ torch.cos(circle[i])*size + voxel.voxel_t.size(2)/2 + shift])
38
+
39
+ farpoint[0] = height_history[i]
40
+
41
+ nearpoint = torch.tensor([
42
+ 60,
43
+ torch.sin(circle[i]+0.5*np.pi)*size*0.5 + voxel.voxel_t.size(1)/2 + shift,
44
+ torch.cos(circle[i]+0.5*np.pi)*size*0.5 + voxel.voxel_t.size(2)/2 + shift])
45
+ cam_ori = self.voxel.world2local(farpoint)
46
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
47
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
48
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
49
+
50
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
51
+
52
+ elif pattern == 1:
53
+ zoom = torch.linspace(1.0, 0.25, steps=maxstep)
54
+ height_history = []
55
+ for i in range(maxstep):
56
+ farpoint = torch.tensor([
57
+ 90,
58
+ torch.sin(circle[i])*size + voxel.voxel_t.size(1)/2 + shift,
59
+ torch.cos(circle[i])*size + voxel.voxel_t.size(2)/2 + shift])
60
+
61
+ height_history.append(self._get_height(farpoint[1], farpoint[2], farpoint[0]))
62
+
63
+ height_history = self.filtfilt(height_history, decay=0.2*smooth_decay_multiplier)
64
+
65
+ for i in range(maxstep):
66
+ farpoint = torch.tensor([
67
+ 90,
68
+ torch.sin(circle[i])*size + voxel.voxel_t.size(1)/2 + shift,
69
+ torch.cos(circle[i])*size + voxel.voxel_t.size(2)/2 + shift])
70
+
71
+ farpoint[0] = height_history[i]
72
+
73
+ nearpoint = torch.tensor([
74
+ 60,
75
+ torch.sin(circle[i]-0.3*np.pi)*size*0.3 + voxel.voxel_t.size(1)/2 + shift,
76
+ torch.cos(circle[i]-0.3*np.pi)*size*0.3 + voxel.voxel_t.size(2)/2 + shift])
77
+ cam_ori = self.voxel.world2local(farpoint)
78
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
79
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
80
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)*zoom[i]) # about 24mm fov
81
+
82
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
83
+
84
+ elif pattern == 2:
85
+ move = torch.linspace(1.0, 0.2, steps=maxstep)
86
+ height_history = []
87
+ for i in range(maxstep):
88
+ farpoint = torch.tensor([
89
+ 90,
90
+ torch.sin(circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
91
+ torch.cos(circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
92
+
93
+ height_history.append(self._get_height(farpoint[1], farpoint[2], farpoint[0]))
94
+
95
+ height_history = self.filtfilt(height_history, decay=0.2*smooth_decay_multiplier)
96
+
97
+ for i in range(maxstep):
98
+ farpoint = torch.tensor([
99
+ 90,
100
+ torch.sin(circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
101
+ torch.cos(circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
102
+
103
+ farpoint[0] = height_history[i]
104
+
105
+ nearpoint = torch.tensor([
106
+ 60,
107
+ torch.sin(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(1)/2 + shift,
108
+ torch.cos(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(2)/2 + shift])
109
+ cam_ori = self.voxel.world2local(farpoint)
110
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
111
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
112
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
113
+
114
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
115
+
116
+ elif pattern == 3:
117
+ move = torch.linspace(0.75, 0.2, steps=maxstep)
118
+ height_history = []
119
+ for i in range(maxstep):
120
+ farpoint = torch.tensor([
121
+ 70,
122
+ torch.sin(-circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
123
+ torch.cos(-circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
124
+
125
+ height_history.append(self._get_height(farpoint[1], farpoint[2], farpoint[0]))
126
+
127
+ height_history = self.filtfilt(height_history, decay=0.2*smooth_decay_multiplier)
128
+
129
+ for i in range(maxstep):
130
+ farpoint = torch.tensor([
131
+ 70,
132
+ torch.sin(-circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
133
+ torch.cos(-circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
134
+
135
+ farpoint[0] = height_history[i]
136
+
137
+ nearpoint = torch.tensor([
138
+ 60,
139
+ torch.sin(-circle[i]-0.4*np.pi)*size*0.9*move[i] + voxel.voxel_t.size(1)/2 + shift,
140
+ torch.cos(-circle[i]-0.4*np.pi)*size*0.9*move[i] + voxel.voxel_t.size(2)/2 + shift])
141
+ cam_ori = self.voxel.world2local(farpoint)
142
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
143
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
144
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
145
+
146
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
147
+
148
+ elif pattern == 4:
149
+ move = torch.linspace(1.0, 0.5, steps=maxstep)
150
+ height_history = []
151
+ for i in range(maxstep):
152
+ farpoint = torch.tensor([
153
+ 90,
154
+ torch.sin(circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
155
+ torch.cos(circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
156
+
157
+ height_history.append(self._get_height(farpoint[1], farpoint[2], farpoint[0]))
158
+
159
+ height_history = self.filtfilt(height_history, decay=0.2*smooth_decay_multiplier)
160
+
161
+ for i in range(maxstep):
162
+ farpoint = torch.tensor([
163
+ 90,
164
+ torch.sin(circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
165
+ torch.cos(circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
166
+
167
+ farpoint[0] = height_history[i]
168
+
169
+ nearpoint = torch.tensor([
170
+ 60,
171
+ torch.sin(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(1)/2 + shift,
172
+ torch.cos(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(2)/2 + shift])
173
+ cam_ori = self.voxel.world2local(farpoint)
174
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
175
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
176
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
177
+
178
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
179
+
180
+ # look outward
181
+ elif pattern == 5:
182
+ move = torch.linspace(1.0, 0.5, steps=maxstep)
183
+ height_history = []
184
+ for i in range(maxstep):
185
+ nearpoint = torch.tensor([
186
+ 60,
187
+ torch.sin(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(1)/2 + shift,
188
+ torch.cos(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(2)/2 + shift])
189
+
190
+ height_history.append(self._get_height(nearpoint[1], nearpoint[2], nearpoint[0]))
191
+
192
+ height_history = self.filtfilt(height_history, decay=0.2*smooth_decay_multiplier)
193
+
194
+ for i in range(maxstep):
195
+ nearpoint = torch.tensor([
196
+ 60,
197
+ torch.sin(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(1)/2 + shift,
198
+ torch.cos(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(2)/2 + shift])
199
+
200
+ nearpoint[0] = height_history[i]
201
+
202
+ farpoint = torch.tensor([
203
+ 60,
204
+ torch.sin(circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
205
+ torch.cos(circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
206
+
207
+ cam_ori = self.voxel.world2local(nearpoint)
208
+ cam_dir = self.voxel.world2local(farpoint - nearpoint, is_vec=True)
209
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
210
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
211
+
212
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
213
+ # Rise
214
+ elif pattern == 6:
215
+ shift = 0
216
+ lift = torch.linspace(0.0, 200.0, steps=maxstep)
217
+ zoom = torch.linspace(0.8, 1.6, steps=maxstep)
218
+ for i in range(maxstep):
219
+ farpoint = torch.tensor([
220
+ 80+lift[i],
221
+ torch.sin(circle[i]/4)*size*0.2 + voxel.voxel_t.size(1)/2 + shift,
222
+ torch.cos(circle[i]/4)*size*0.2 + voxel.voxel_t.size(2)/2 + shift])
223
+
224
+ farpoint[0] = self._get_height(farpoint[1], farpoint[2], farpoint[0])
225
+
226
+ nearpoint = torch.tensor([
227
+ 65,
228
+ torch.sin(circle[i]/4+0.5*np.pi)*size*0.1 + voxel.voxel_t.size(1)/2 + shift,
229
+ torch.cos(circle[i]/4+0.5*np.pi)*size*0.1 + voxel.voxel_t.size(2)/2 + shift])
230
+ cam_ori = self.voxel.world2local(farpoint)
231
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
232
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
233
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)*zoom[i]) # about 24mm fov
234
+
235
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
236
+ # 45deg
237
+ elif pattern == 7:
238
+ rad = torch.tensor([np.deg2rad(45).astype(np.float32)])
239
+ size = 1536
240
+ for i in range(maxstep):
241
+ farpoint = torch.tensor([
242
+ 61+size,
243
+ torch.sin(rad)*size + voxel.voxel_t.size(1)/2,
244
+ torch.cos(rad)*size + voxel.voxel_t.size(2)/2])
245
+
246
+ nearpoint = torch.tensor([
247
+ 61,
248
+ voxel.voxel_t.size(1)/2,
249
+ voxel.voxel_t.size(2)/2])
250
+ cam_ori = self.voxel.world2local(farpoint)
251
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
252
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
253
+ cam_f = 0.5/np.tan(np.deg2rad(19.5/2)) # about 50mm fov
254
+
255
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
256
+
257
+ elif pattern == 8:
258
+ size = self.voxel.voxel_t.size(1) // 2
259
+ for i in range(maxstep):
260
+ farpoint = torch.tensor([
261
+ 300,
262
+ 0*size + voxel.voxel_t.size(1)//2,
263
+ -1*size + voxel.voxel_t.size(2)/2 + size // maxstep * (i - maxstep // 4)])
264
+ nearpoint = torch.tensor([
265
+ 120,
266
+ 0*size*0.5 + voxel.voxel_t.size(1)//2,
267
+ -1*size*0.5 + voxel.voxel_t.size(2)/2 + size // maxstep * (i - maxstep // 4)])
268
+ cam_ori = self.voxel.world2local(farpoint)
269
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
270
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
271
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
272
+
273
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
274
+
275
+ elif pattern == 9:
276
+ size = self.voxel.voxel_t.size(2) // 2
277
+ for i in range(maxstep):
278
+ farpoint = torch.tensor([
279
+ 140,
280
+ voxel.voxel_t.size(1)//2,
281
+ -size // 4 + size * 8 // maxstep * i]
282
+ , dtype=torch.float32)
283
+ nearpoint = torch.tensor([
284
+ 100,
285
+ voxel.voxel_t.size(1)//2,
286
+ size * 8 // maxstep * i]
287
+ , dtype=torch.float32)
288
+ cam_ori = self.voxel.world2local(farpoint)
289
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
290
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
291
+ cam_f = 0.5/np.tan(np.deg2rad(cam_ang/2)) # about 24mm fov
292
+
293
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
294
+
295
+
296
+ def _get_height(self, loc0, loc1, minheight):
297
+ loc0 = int(loc0)
298
+ loc1 = int(loc1)
299
+ height = minheight
300
+ for dx in range(-3, 4):
301
+ for dy in range(-3, 4):
302
+ if (loc0+dx) < 0 or (loc0+dx) >= self.voxel.heightmap.shape[0] or (loc1+dy) < 0 or \
303
+ (loc1+dy) >= self.voxel.heightmap.shape[1]:
304
+ height = max(height, minheight)
305
+ else:
306
+ height = max(height, self.voxel.heightmap[loc0+dx, loc1+dy] + 2)
307
+ return height
308
+
309
+ def filtfilt(self, height_history, decay=0.2):
310
+ # Filtfilt
311
+ height_history2 = []
312
+ maxstep = len(height_history)
313
+ prev_height = height_history[0]
314
+ for i in range(maxstep):
315
+ prev_height = prev_height - decay
316
+ if prev_height < height_history[i]:
317
+ prev_height = height_history[i]
318
+ height_history2.append(prev_height)
319
+ prev_height = height_history[-1]
320
+ for i in range(maxstep-1, -1, -1):
321
+ prev_height = prev_height - decay
322
+ if prev_height < height_history[i]:
323
+ prev_height = height_history[i]
324
+ height_history2[i] = max(prev_height, height_history2[i])
325
+ return height_history2
326
+
327
+ def __len__(self):
328
+ return len(self.camera_poses)
329
+
330
+ def __getitem__(self, idx):
331
+ return self.camera_poses[idx]
332
+
333
+
334
+ class TourCameraController:
335
+ def __init__(self, voxel, maxstep=128):
336
+ self.voxel = voxel
337
+ self.maxstep = maxstep
338
+ self.camera_poses = [] # ori, dir, up, f
339
+ circle = torch.linspace(0, 2*np.pi, steps=maxstep//4)
340
+ size = min(voxel.voxel_t.size(1), voxel.voxel_t.size(2)) / 2
341
+ # Shrink the circle a bit
342
+ shift = size * 0.2
343
+ size = size * 0.8
344
+
345
+ for i in range(maxstep//4):
346
+ farpoint = torch.tensor([
347
+ 70,
348
+ torch.sin(circle[i])*size + voxel.voxel_t.size(1)/2 + shift,
349
+ torch.cos(circle[i])*size + voxel.voxel_t.size(2)/2 + shift])
350
+
351
+ farpoint[0] = self._get_height(farpoint[1], farpoint[2], farpoint[0])
352
+
353
+ nearpoint = torch.tensor([
354
+ 60,
355
+ torch.sin(circle[i]+0.5*np.pi)*size*0.5 + voxel.voxel_t.size(1)/2 + shift,
356
+ torch.cos(circle[i]+0.5*np.pi)*size*0.5 + voxel.voxel_t.size(2)/2 + shift])
357
+ cam_ori = self.voxel.world2local(farpoint)
358
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
359
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
360
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)) # about 24mm fov
361
+
362
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
363
+
364
+ zoom = torch.linspace(1.0, 0.25, steps=maxstep//4)
365
+ for i in range(maxstep//4):
366
+ farpoint = torch.tensor([
367
+ 90,
368
+ torch.sin(circle[i])*size + voxel.voxel_t.size(1)/2 + shift,
369
+ torch.cos(circle[i])*size + voxel.voxel_t.size(2)/2 + shift])
370
+
371
+ farpoint[0] = self._get_height(farpoint[1], farpoint[2], farpoint[0])
372
+
373
+ nearpoint = torch.tensor([
374
+ 60,
375
+ torch.sin(circle[i]-0.3*np.pi)*size*0.3 + voxel.voxel_t.size(1)/2 + shift,
376
+ torch.cos(circle[i]-0.3*np.pi)*size*0.3 + voxel.voxel_t.size(2)/2 + shift])
377
+ cam_ori = self.voxel.world2local(farpoint)
378
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
379
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
380
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)*zoom[i]) # about 24mm fov
381
+
382
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
383
+
384
+ move = torch.linspace(1.0, 0.2, steps=maxstep//4)
385
+ for i in range(maxstep//4):
386
+ farpoint = torch.tensor([
387
+ 90,
388
+ torch.sin(circle[i])*size*move[i] + voxel.voxel_t.size(1)/2 + shift,
389
+ torch.cos(circle[i])*size*move[i] + voxel.voxel_t.size(2)/2 + shift])
390
+
391
+ farpoint[0] = self._get_height(farpoint[1], farpoint[2], farpoint[0])
392
+
393
+ nearpoint = torch.tensor([
394
+ 60,
395
+ torch.sin(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(1)/2 + shift,
396
+ torch.cos(circle[i]+0.5*np.pi)*size*0.3*move[i] + voxel.voxel_t.size(2)/2 + shift])
397
+ cam_ori = self.voxel.world2local(farpoint)
398
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
399
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
400
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)) # about 24mm fov
401
+
402
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
403
+
404
+ lift = torch.linspace(0.0, 200.0, steps=maxstep//4)
405
+ zoom = torch.linspace(0.6, 1.2, steps=maxstep//4)
406
+ for i in range(maxstep//4):
407
+ farpoint = torch.tensor([
408
+ 80+lift[i],
409
+ torch.sin(circle[i])*size*0.2 + voxel.voxel_t.size(1)/2 + shift,
410
+ torch.cos(circle[i])*size*0.2 + voxel.voxel_t.size(2)/2 + shift])
411
+
412
+ farpoint[0] = self._get_height(farpoint[1], farpoint[2], farpoint[0])
413
+
414
+ nearpoint = torch.tensor([
415
+ 60,
416
+ torch.sin(circle[i]+0.5*np.pi)*size*0.1 + voxel.voxel_t.size(1)/2 + shift,
417
+ torch.cos(circle[i]+0.5*np.pi)*size*0.1 + voxel.voxel_t.size(2)/2 + shift])
418
+ cam_ori = self.voxel.world2local(farpoint)
419
+ cam_dir = self.voxel.world2local(nearpoint - farpoint, is_vec=True)
420
+ cam_up = self.voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
421
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)*zoom[i]) # about 24mm fov
422
+
423
+ self.camera_poses.append((cam_ori, cam_dir, cam_up, cam_f))
424
+
425
+ def _get_height(self, loc0, loc1, minheight):
426
+ loc0 = int(loc0)
427
+ loc1 = int(loc1)
428
+ height = minheight
429
+ for dx in range(-3, 4):
430
+ for dy in range(-3, 4):
431
+ if (loc0+dx) < 0 or (loc0+dx) >= self.voxel.heightmap.shape[0] or (loc1+dy) < 0 or \
432
+ (loc1+dy) >= self.voxel.heightmap.shape[1]:
433
+ height = max(height, minheight)
434
+ else:
435
+ height = max(height, self.voxel.heightmap[loc0+dx, loc1+dy] + 2)
436
+ return height
437
+
438
+ def __len__(self):
439
+ return len(self.camera_poses)
440
+
441
+ def __getitem__(self, idx):
442
+ return self.camera_poses[idx]
443
+
444
+
445
+ def rand_camera_pose_birdseye(voxel, border=128):
446
+ r"""Generate a random camera pose on the upper hemisphere, in origin-direction-up format.
447
+ Assumes a [Y X Z] coordinate system, where Y is the negative gravity direction.
448
+ The camera pose is converted into the voxel coordinate system so that it can be used directly for rendering.
449
+ 1. Uniformly sample a point on the upper hemisphere of a unit sphere, as cam_ori.
450
+ 2. Set cam_dir to point from cam_ori towards the origin.
451
+ 3. cam_up always points towards the sky.
452
+ 4. Move cam_ori to a random place according to the voxel size.
453
+ """
454
+ cam_dir = torch.randn(3, dtype=torch.float32)
455
+ cam_dir = cam_dir / torch.sqrt(torch.sum(cam_dir*cam_dir))
456
+ cam_dir[0] = -torch.abs(cam_dir[0])
457
+ cam_up = torch.tensor([1, 0, 0], dtype=torch.float32)
458
+
459
+ # generate camera lookat target
460
+ r = np.random.rand(2)
461
+ r[0] *= voxel.voxel_t.size(1)-border-border
462
+ r[1] *= voxel.voxel_t.size(2)-border-border
463
+ r = r + border
464
+ y = voxel.heightmap[int(r[0]+0.5), int(r[1]+0.5)] + (np.random.rand(1)-0.5) * 5
465
+ cam_target = torch.tensor([y, r[0], r[1]], dtype=torch.float32)
466
+ cam_ori = cam_target - cam_dir * (np.random.rand(1).item() * 100)
467
+ cam_ori[0] = max(voxel.heightmap[int(cam_ori[1]+0.5), int(cam_ori[2]+0.5)]+2, cam_ori[0])
468
+ # Translate to voxel coordinate
469
+ cam_ori = voxel.world2local(cam_ori)
470
+ cam_dir = voxel.world2local(cam_dir, is_vec=True)
471
+ cam_up = voxel.world2local(cam_up, is_vec=True)
472
+
473
+ return cam_ori, cam_dir, cam_up
474
+
475
+
476
+ def get_neighbor_height(heightmap, loc0, loc1, minheight, neighbor_size=7):
477
+ loc0 = int(loc0)
478
+ loc1 = int(loc1)
479
+ height = 0
480
+ for dx in range(-neighbor_size//2, neighbor_size//2+1):
481
+ for dy in range(-neighbor_size//2, neighbor_size//2+1):
482
+ if (loc0+dx) < 0 or (loc0+dx) >= heightmap.shape[0] or (loc1+dy) < 0 or (loc1+dy) >= heightmap.shape[1]:
483
+ height = max(height, minheight)
484
+ else:
485
+ height = max(minheight, heightmap[loc0+dx, loc1+dy] + 2)
486
+ return height
487
+
488
+
489
+ def rand_camera_pose_firstperson(voxel, border=128):
490
+ r"""Generate a random camera pose in the upper hemisphere, in origin-direction-up format.
491
+ """
492
+ r = np.random.rand(5)
493
+ r[0] *= voxel.voxel_t.size(1)-border-border
494
+ r[1] *= voxel.voxel_t.size(2)-border-border
495
+ r[0] = r[0] + border
496
+ r[1] = r[1] + border
497
+
498
+ y = get_neighbor_height(voxel.heightmap, r[0], r[1], 0) + np.random.rand(1) * 15
499
+
500
+ cam_ori = torch.tensor([y, r[0], r[1]], dtype=torch.float32)
501
+
502
+ rand_ang_h = r[2] * 2 * np.pi
503
+ cam_target = torch.tensor([0, cam_ori[1]+np.sin(rand_ang_h)*border*r[4], cam_ori[2] +
504
+ np.cos(rand_ang_h)*border*r[4]], dtype=torch.float32)
505
+ cam_target[0] = get_neighbor_height(voxel.heightmap, cam_target[1],
506
+ cam_target[2], 0, neighbor_size=1) - 2 + r[3] * 10
507
+
508
+ cam_dir = cam_target - cam_ori
509
+
510
+ cam_up = torch.tensor([1, 0, 0], dtype=torch.float32)
511
+
512
+ cam_ori = voxel.world2local(cam_ori)
513
+ cam_dir = voxel.world2local(cam_dir, is_vec=True)
514
+ cam_up = voxel.world2local(cam_up, is_vec=True)
515
+
516
+ return cam_ori, cam_dir, cam_up
517
+
518
+
519
+ def rand_camera_pose_thridperson(voxel, border=96):
520
+ r = torch.rand(2)
521
+ r[0] *= voxel.voxel_t.size(1)
522
+ r[1] *= voxel.voxel_t.size(2)
523
+ rand_height = 60 + torch.rand(1) * 40
524
+ rand_height = get_neighbor_height(voxel.heightmap, r[0], r[1], rand_height, neighbor_size=5)
525
+ farpoint = torch.tensor([rand_height, r[0], r[1]], dtype=torch.float32)
526
+
527
+ r = torch.rand(2)
528
+ r[0] *= voxel.voxel_t.size(1) - border - border
529
+ r[1] *= voxel.voxel_t.size(2) - border - border
530
+ r[0] = r[0] + border
531
+ r[1] = r[1] + border
532
+ rand_height = get_neighbor_height(voxel.heightmap, r[0], r[1], 65, neighbor_size=1) - 5
533
+ nearpoint = torch.tensor([rand_height, r[0], r[1]], dtype=torch.float32)
534
+
535
+ cam_ori = voxel.world2local(farpoint)
536
+ cam_dir = voxel.world2local(nearpoint - farpoint, is_vec=True)
537
+ cam_up = voxel.world2local(torch.tensor([1, 0, 0], dtype=torch.float32), is_vec=True)
538
+
539
+ return cam_ori, cam_dir, cam_up
540
+
541
+
542
+ def rand_camera_pose_thridperson2(voxel, border=48):
543
+ r = torch.rand(2)
544
+ r[0] *= voxel.voxel_t.size(1) - border - border
545
+ r[1] *= voxel.voxel_t.size(2) - border - border
546
+ r[0] = r[0] + border
547
+ r[1] = r[1] + border
548
+ rand_height = 60 + torch.rand(1) * 40
549
+ rand_height = get_neighbor_height(voxel.heightmap, r[0], r[1], rand_height, neighbor_size=5)
550
+ farpoint = torch.tensor([rand_height, r[0], r[1]], dtype=torch.float32)
551
+
552
+ r = torch.rand(2)
553
+ r[0] *= voxel.voxel_t.size(1) - border - border
554
+ r[1] *= voxel.voxel_t.size(2) - border - border
555
+ r[0] = r[0] + border
556
+ r[1] = r[1] + border
557
+ rand_height = get_neighbor_height(voxel.heightmap, r[0], r[1], 65, neighbor_size=1) - 5
558
+ nearpoint = torch.tensor([rand_height, r[0], r[1]], dtype=torch.float32)
559
+
560
+ # Random Up vector (tilt a little bit)
561
+ # up = torch.randn(3) * 0.05 # cutoff +-0.1, Tan(10deg) = 0.176
562
+ up = torch.randn(3) * 0.02
563
+ up[0] = 1.0
564
+ up = up / up.norm(p=2)
565
+ cam_ori = voxel.world2local(farpoint)
566
+ cam_dir = voxel.world2local(nearpoint - farpoint, is_vec=True)
567
+ cam_up = voxel.world2local(up, is_vec=True)
568
+
569
+ return cam_ori, cam_dir, cam_up
570
+
571
+
572
+ def rand_camera_pose_thridperson3(voxel, border=64):
573
+ r"""Attempts to avoid cameras that end up too close to walls and to include more aerial poses."""
574
+ r = torch.rand(2)
575
+ r[0] *= voxel.voxel_t.size(1) - border - border
576
+ r[1] *= voxel.voxel_t.size(2) - border - border
577
+ r[0] = r[0] + border
578
+ r[1] = r[1] + border
579
+ rand_height = 60 + torch.rand(1) * 40
580
+ if torch.rand(1) > 0.8:
581
+ rand_height = 60 + torch.rand(1) * 60
582
+ rand_height = get_neighbor_height(voxel.heightmap, r[0], r[1], rand_height, neighbor_size=7)
583
+ farpoint = torch.tensor([rand_height, r[0], r[1]], dtype=torch.float32)
584
+
585
+ r = torch.rand(2)
586
+ r[0] *= voxel.voxel_t.size(1) - border - border
587
+ r[1] *= voxel.voxel_t.size(2) - border - border
588
+ r[0] = r[0] + border
589
+ r[1] = r[1] + border
590
+ rand_height = get_neighbor_height(voxel.heightmap, r[0], r[1], 65, neighbor_size=3) - 5
591
+ nearpoint = torch.tensor([rand_height, r[0], r[1]], dtype=torch.float32)
592
+
593
+ # Random Up vector (tilt a little bit)
594
+ # up = torch.randn(3) * 0.05 # cutoff +-0.1, Tan(10deg) = 0.176
595
+ up = torch.randn(3) * 0.02
596
+ up[0] = 1.0
597
+ up = up / up.norm(p=2)
598
+ # print(up)
599
+ cam_ori = voxel.world2local(farpoint)
600
+ cam_dir = voxel.world2local(nearpoint - farpoint, is_vec=True)
601
+ cam_up = voxel.world2local(up, is_vec=True)
602
+
603
+ return cam_ori, cam_dir, cam_up
604
+
605
+
606
+ def rand_camera_pose_tour(voxel):
607
+ size = min(voxel.voxel_t.size(1), voxel.voxel_t.size(2)) / 2
608
+ center = [voxel.voxel_t.size(1)/2, voxel.voxel_t.size(2)/2]
609
+
610
+ rnd = torch.rand(8)
611
+
612
+ rnd_deg = torch.rand(1) * 2 * np.pi
613
+ far_radius = rnd[0]*0.8+0.2
614
+ far_height = rnd[1]*30 + 60
615
+ farpoint = torch.tensor([
616
+ far_height,
617
+ torch.sin(rnd_deg)*size*far_radius + center[0],
618
+ torch.cos(rnd_deg)*size*far_radius + center[1]])
619
+
620
+ farpoint[0] = get_neighbor_height(voxel.heightmap, farpoint[1], farpoint[2], farpoint[0], neighbor_size=7)
621
+
622
+ near_radius = far_radius * rnd[2]
623
+ near_shift_rad = np.pi*(rnd[3]-0.5)
624
+ near_height = 60 + rnd[4] * 10
625
+ nearpoint = torch.tensor([
626
+ near_height,
627
+ torch.sin(rnd_deg+near_shift_rad)*size*near_radius + center[0],
628
+ torch.cos(rnd_deg+near_shift_rad)*size*near_radius + center[1]])
629
+
630
+ # Random Up vector (tilt a little bit)
631
+ # up = torch.randn(3) * 0.05 # cutoff +-0.1, Tan(10deg) = 0.176
632
+ up = torch.randn(3) * 0.02
633
+ up[0] = 1.0
634
+ up = up / up.norm(p=2)
635
+ cam_ori = voxel.world2local(farpoint)
636
+ cam_dir = voxel.world2local(nearpoint - farpoint, is_vec=True)
637
+ cam_up = voxel.world2local(up, is_vec=True)
638
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)*(rnd[5]*0.75+0.25)) # about 24mm fov
639
+
640
+ return cam_ori, cam_dir, cam_up, cam_f
641
+
642
+ # Look from center to outward
643
+
644
+
645
+ def rand_camera_pose_insideout(voxel):
646
+ size = min(voxel.voxel_t.size(1), voxel.voxel_t.size(2)) / 2
647
+ center = [voxel.voxel_t.size(1)/2, voxel.voxel_t.size(2)/2]
648
+
649
+ rnd = torch.rand(8)
650
+
651
+ rnd_deg = torch.rand(1) * 2 * np.pi
652
+ far_radius = rnd[0]*0.8+0.2
653
+ far_height = rnd[1]*10 + 60
654
+ farpoint = torch.tensor([
655
+ far_height,
656
+ torch.sin(rnd_deg)*size*far_radius + center[0],
657
+ torch.cos(rnd_deg)*size*far_radius + center[1]])
658
+
659
+ near_radius = far_radius * rnd[2]
660
+ near_shift_rad = np.pi*(rnd[3]-0.5)
661
+ near_height = 60 + rnd[4] * 30
662
+ nearpoint = torch.tensor([
663
+ near_height,
664
+ torch.sin(rnd_deg+near_shift_rad)*size*near_radius + center[0],
665
+ torch.cos(rnd_deg+near_shift_rad)*size*near_radius + center[1]])
666
+
667
+ nearpoint[0] = get_neighbor_height(voxel.heightmap, nearpoint[1], nearpoint[2], nearpoint[0], neighbor_size=7)
668
+
669
+ # Random Up vector (tilt a little bit)
670
+ # up = torch.randn(3) * 0.05 # cutoff +-0.1, Tan(10deg) = 0.176
671
+ up = torch.randn(3) * 0.02
672
+ up[0] = 1.0
673
+ up = up / up.norm(p=2)
674
+ cam_ori = voxel.world2local(nearpoint)
675
+ cam_dir = voxel.world2local(farpoint-nearpoint, is_vec=True)
676
+ cam_up = voxel.world2local(up, is_vec=True)
677
+ cam_f = 0.5/np.tan(np.deg2rad(73/2)*(rnd[5]*0.75+0.25)) # about 24mm fov
678
+
679
+ return cam_ori, cam_dir, cam_up, cam_f
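All of the helpers in camctl.py return poses already mapped into the voxel frame through voxel.world2local, so a renderer can consume them directly. A sketch of how a trajectory from EvalCameraController might be consumed; the voxel object is assumed to expose voxel_t, heightmap and world2local() as used above, and render_frame is a hypothetical placeholder for the project's actual rendering call:

    # `voxel` is assumed to provide voxel_t, heightmap and world2local(), as used above.
    cam_ctl = EvalCameraController(voxel, maxstep=128, pattern=0, cam_ang=73)

    frames = []
    for cam_ori, cam_dir, cam_up, cam_f in cam_ctl:
        # render_frame stands in for the repository's renderer, not a real API here.
        frames.append(render_frame(cam_ori, cam_dir, cam_up, cam_f))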
imaginaire/model_utils/gancraft/gaugan_lbl2col.csv ADDED
@@ -0,0 +1,182 @@
1
+ person,#00AC0D
2
+ bicycle,#012F47
3
+ car,#0275B8
4
+ motorcycle,#03C098
5
+ airplane,#04434F
6
+ bus,#05FB29
7
+ train,#06C312
8
+ truck,#076728
9
+ boat,#0809B6
10
+ traffic-light,#09D3CF
11
+ fire-hydrant,#0A150B
12
+ street-sign,#0BF2A6
13
+ stop-sign,#0C246F
14
+ parking-meter,#0D575D
15
+ bench,#0E46F9
16
+ bird,#0FD881
17
+ cat,#1058DF
18
+ dog,#118C76
19
+ horse,#123A2C
20
+ sheep,#13C1D8
21
+ cow,#14E67D
22
+ elephant,#152718
23
+ bear,#165743
24
+ zebra,#17AED2
25
+ giraffe,#1858EF
26
+ hat,#195103
27
+ backpack,#1AA5EA
28
+ umbrella,#1B19CC
29
+ shoe,#1C4DE6
30
+ eye-glasses,#1D4823
31
+ handbag,#1E09D6
32
+ tie,#1F94FE
33
+ suitcase,#2073BD
34
+ frisbee,#21D0C5
35
+ skis,#22F3D7
36
+ snowboard,#23C52B
37
+ sports-ball,#24FE20
38
+ kite,#254F0B
39
+ baseball-bat,#26AF68
40
+ baseball-glove,#27C0D4
41
+ skateboard,#28528A
42
+ surfboard,#2963B6
43
+ tennis-racket,#2AD8EB
44
+ bottle,#2BB1A5
45
+ plate,#2CF37D
46
+ wine-glass,#2D1D9C
47
+ cup,#2E936F
48
+ fork,#2F93E8
49
+ knife,#308E02
50
+ spoon,#31A71B
51
+ bowl,#3220D3
52
+ banana,#33C1D9
53
+ apple,#340997
54
+ sandwich,#35B935
55
+ orange,#367F33
56
+ broccoli,#3720AE
57
+ carrot,#381F94
58
+ hot-dog,#39CAB5
59
+ pizza,#3AF41D
60
+ donut,#3B9743
61
+ cake,#3CA323
62
+ chair,#3DFE27
63
+ couch,#3ECB89
64
+ potted-plant,#3F7249
65
+ bed,#40B729
66
+ mirror,#411C97
67
+ dining-table,#422283
68
+ window,#43802E
69
+ desk,#4480DA
70
+ toilet,#45A4B2
71
+ door,#46356C
72
+ tv,#478503
73
+ laptop,#48261F
74
+ mouse,#49E809
75
+ remote,#4AF48A
76
+ keyboard,#4B111B
77
+ cell-phone,#4C4FAD
78
+ microwave,#4D84C7
79
+ oven,#4E69A7
80
+ toaster,#4F2A3D
81
+ sink,#50BA55
82
+ refrigerator,#511F61
83
+ blender,#52782C
84
+ book,#530122
85
+ clock,#5441A2
86
+ vase,#55E758
87
+ scissors,#56A921
88
+ teddy-bear,#573985
89
+ hair-drier,#5823E8
90
+ toothbrush,#5966FF
91
+ hair-brush,#5A7724
92
+ banner,#5B0B00
93
+ blanket,#5CAECB
94
+ branch,#5D5222
95
+ bridge,#5E5BC5
96
+ building-other,#5F807E
97
+ bush,#606E32
98
+ cabinet,#6163FE
99
+ cage,#623550
100
+ cardboard,#638CBE
101
+ carpet,#647988
102
+ ceiling-other,#65AABD
103
+ ceiling-tile,#665481
104
+ cloth,#67CBD1
105
+ clothes,#684470
106
+ clouds,#696969
107
+ counter,#6AC478
108
+ cupboard,#6B2F5B
109
+ curtain,#6C7FA8
110
+ desk-stuff,#6DF474
111
+ dirt,#6E6E28
112
+ door-stuff,#6FCCB0
113
+ fence,#706419
114
+ floor-marble,#71B443
115
+ floor-other,#72E867
116
+ floor-stone,#734EFC
117
+ floor-tile,#748F23
118
+ floor-wood,#759472
119
+ flower,#760000
120
+ fog,#77BA1D
121
+ food-other,#7817F1
122
+ fruit,#79CF21
123
+ furniture-other,#7A8D92
124
+ grass,#7BC800
125
+ gravel,#7C32C8
126
+ ground-other,#7D3054
127
+ hill,#7EC864
128
+ house,#7F4502
129
+ leaves,#80A945
130
+ light,#81A365
131
+ mat,#82C08C
132
+ metal,#835F2C
133
+ mirror-stuff,#84C575
134
+ moss,#855EFD
135
+ mountain,#869664
136
+ mud,#87716F
137
+ napkin,#88B25B
138
+ net,#892455
139
+ paper,#8AA2A7
140
+ pavement,#8B3027
141
+ pillow,#8C5DCB
142
+ plant,#8DE61E
143
+ plastic,#8E629E
144
+ platform,#8F2A91
145
+ playingfield,#90CDC6
146
+ railing,#9170C7
147
+ railroad,#92E712
148
+ river,#9364C8
149
+ road,#946E28
150
+ rock,#956432
151
+ roof,#9600B1
152
+ rug,#978A29
153
+ salad,#98725D
154
+ sand,#999900
155
+ sea,#9AC6DA
156
+ shelf,#9B7FC9
157
+ sky,#9CEEDD
158
+ skyscraper,#9DBBF2
159
+ snow,#9E9EAA
160
+ solid-other,#9F79DB
161
+ stairs,#A06249
162
+ stone,#A1A164
163
+ straw,#A2A3EB
164
+ structural,#A3DED1
165
+ table,#A47B69
166
+ tent,#A5C3BA
167
+ textile-other,#A65280
168
+ towel,#A7AED6
169
+ tree,#A8C832
170
+ vegetable,#A99410
171
+ wall-brick,#AAD16A
172
+ wall-concrete,#AB32A4
173
+ wall-other,#AC9B5E
174
+ wall-panel,#AD0E18
175
+ wall-stone,#AE2974
176
+ wall-tile,#AF3ABF
177
+ wall-wood,#B0C1C3
178
+ water,#B1C8FF
179
+ waterdrops,#B20A88
180
+ window-blind,#B356B8
181
+ window-other,#B42B5B
182
+ wood,#B57B00
imaginaire/model_utils/gancraft/gaugan_reduction.csv ADDED
@@ -0,0 +1,182 @@
1
+ person,ignore
2
+ bicycle,ignore
3
+ car,ignore
4
+ motorcycle,ignore
5
+ airplane,ignore
6
+ bus,ignore
7
+ train,ignore
8
+ truck,ignore
9
+ boat,ignore
10
+ traffic-light,ignore
11
+ fire-hydrant,ignore
12
+ street-sign,ignore
13
+ stop-sign,ignore
14
+ parking-meter,ignore
15
+ bench,ignore
16
+ bird,ignore
17
+ cat,ignore
18
+ dog,ignore
19
+ horse,ignore
20
+ sheep,ignore
21
+ cow,ignore
22
+ elephant,ignore
23
+ bear,ignore
24
+ zebra,ignore
25
+ giraffe,ignore
26
+ hat,ignore
27
+ backpack,ignore
28
+ umbrella,ignore
29
+ shoe,ignore
30
+ eye-glasses,ignore
31
+ handbag,ignore
32
+ tie,ignore
33
+ suitcase,ignore
34
+ frisbee,ignore
35
+ skis,ignore
36
+ snowboard,ignore
37
+ sports-ball,ignore
38
+ kite,ignore
39
+ baseball-bat,ignore
40
+ baseball-glove,ignore
41
+ skateboard,ignore
42
+ surfboard,ignore
43
+ tennis-racket,ignore
44
+ bottle,ignore
45
+ plate,ignore
46
+ wine-glass,ignore
47
+ cup,ignore
48
+ fork,ignore
49
+ knife,ignore
50
+ spoon,ignore
51
+ bowl,ignore
52
+ banana,ignore
53
+ apple,ignore
54
+ sandwich,ignore
55
+ orange,ignore
56
+ broccoli,ignore
57
+ carrot,ignore
58
+ hot-dog,ignore
59
+ pizza,ignore
60
+ donut,ignore
61
+ cake,ignore
62
+ chair,ignore
63
+ couch,ignore
64
+ potted-plant,ignore
65
+ bed,ignore
66
+ mirror,ignore
67
+ dining-table,ignore
68
+ window,ignore
69
+ desk,ignore
70
+ toilet,ignore
71
+ door,ignore
72
+ tv,ignore
73
+ laptop,ignore
74
+ mouse,ignore
75
+ remote,ignore
76
+ keyboard,ignore
77
+ cell-phone,ignore
78
+ microwave,ignore
79
+ oven,ignore
80
+ toaster,ignore
81
+ sink,ignore
82
+ refrigerator,ignore
83
+ blender,ignore
84
+ book,ignore
85
+ clock,ignore
86
+ vase,ignore
87
+ scissors,ignore
88
+ teddy-bear,ignore
89
+ hair-drier,ignore
90
+ toothbrush,ignore
91
+ hair-brush,ignore
92
+ banner,ignore
93
+ blanket,ignore
94
+ branch,tree
95
+ bridge,ignore
96
+ building-other,ignore
97
+ bush,tree
98
+ cabinet,ignore
99
+ cage,ignore
100
+ cardboard,ignore
101
+ carpet,ignore
102
+ ceiling-other,ignore
103
+ ceiling-tile,ignore
104
+ cloth,ignore
105
+ clothes,ignore
106
+ clouds,sky
107
+ counter,ignore
108
+ cupboard,ignore
109
+ curtain,ignore
110
+ desk-stuff,ignore
111
+ dirt,dirt
112
+ door-stuff,ignore
113
+ fence,ignore
114
+ floor-marble,ignore
115
+ floor-other,ignore
116
+ floor-stone,ignore
117
+ floor-tile,ignore
118
+ floor-wood,ignore
119
+ flower,flower
120
+ fog,sky
121
+ food-other,ignore
122
+ fruit,ignore
123
+ furniture-other,ignore
124
+ grass,grass
125
+ gravel,gravel
126
+ ground-other,ignore
127
+ hill,grass
128
+ house,ignore
129
+ leaves,tree
130
+ light,ignore
131
+ mat,ignore
132
+ metal,ignore
133
+ mirror-stuff,ignore
134
+ moss,grass
135
+ mountain,grass
136
+ mud,dirt
137
+ napkin,ignore
138
+ net,ignore
139
+ paper,ignore
140
+ pavement,ignore
141
+ pillow,ignore
142
+ plant,flower
143
+ plastic,ignore
144
+ platform,ignore
145
+ playingfield,ignore
146
+ railing,ignore
147
+ railroad,ignore
148
+ river,water
149
+ road,ignore
150
+ rock,rock
151
+ roof,ignore
152
+ rug,ignore
153
+ salad,ignore
154
+ sand,sand
155
+ sea,water
156
+ shelf,ignore
157
+ sky,sky
158
+ skyscraper,ignore
159
+ snow,snow
160
+ solid-other,ignore
161
+ stairs,ignore
162
+ stone,stone
163
+ straw,grass
164
+ structural,ignore
165
+ table,ignore
166
+ tent,ignore
167
+ textile-other,ignore
168
+ towel,ignore
169
+ tree,tree
170
+ vegetable,ignore
171
+ wall-brick,ignore
172
+ wall-concrete,ignore
173
+ wall-other,ignore
174
+ wall-panel,ignore
175
+ wall-stone,ignore
176
+ wall-tile,ignore
177
+ wall-wood,ignore
178
+ water,water
179
+ waterdrops,ignore
180
+ window-blind,ignore
181
+ window-other,ignore
182
+ wood,ignore
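gaugan_lbl2col.csv maps each GauGAN class name to its palette colour, and gaugan_reduction.csv maps the same names onto the reduced label set used here ('ignore' drops a class). A small sketch of how the two files could be joined into a reduced-label to colour lookup; the paths and the exact consuming code in the repository are assumptions:

    import csv

    def load_reduced_palette(lbl2col_path, reduction_path):
        # name -> hex colour, straight from gaugan_lbl2col.csv (no header row).
        with open(lbl2col_path) as f:
            lbl2col = {name: col for name, col in csv.reader(f)}
        # reduced label -> list of member-class colours, skipping ignored classes.
        palette = {}
        with open(reduction_path) as f:
            for name, reduced in csv.reader(f):
                if reduced != 'ignore':
                    palette.setdefault(reduced, []).append(lbl2col[name])
        return palette

    palette = load_reduced_palette('gaugan_lbl2col.csv', 'gaugan_reduction.csv')
    print(palette['water'])  # ['#9364C8', '#9AC6DA', '#B1C8FF'] (river, sea, water)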
imaginaire/model_utils/gancraft/id2name_gg.csv ADDED
@@ -0,0 +1,680 @@
1
+ 0,air,0,sky
2
+ 1,stone,7368816,stone
3
+ 2,granite,7368816,rock
4
+ 3,polished_granite,7368816,rock
5
+ 4,diorite,7368816,rock
6
+ 5,polished_diorite,7368816,rock
7
+ 6,andesite,7368816,rock
8
+ 7,polished_andesite,7368816,rock
9
+ 8,grass_block,8368696,grass
10
+ 9,dirt,9923917,dirt
11
+ 10,coarse_dirt,9923917,dirt
12
+ 11,podzol,9923917,dirt
13
+ 12,cobblestone,7368816,stone
14
+ 13,oak_planks,9402184,wood
15
+ 14,spruce_planks,9402184,wood
16
+ 15,birch_planks,9402184,wood
17
+ 16,jungle_planks,9402184,wood
18
+ 17,acacia_planks,9402184,wood
19
+ 18,dark_oak_planks,9402184,wood
20
+ 19,oak_sapling,31744,plant
21
+ 20,spruce_sapling,31744,plant
22
+ 21,birch_sapling,31744,plant
23
+ 22,jungle_sapling,31744,plant
24
+ 23,acacia_sapling,31744,plant
25
+ 24,dark_oak_sapling,31744,plant
26
+ 25,bedrock,7368816,rock
27
+ 26,water,4210943,water
28
+ 27,lava,16711680,
29
+ 28,sand,16247203,sand
30
+ 29,red_sand,16247203,sand
31
+ 30,gravel,16247203,gravel
32
+ 31,gold_ore,7368816,rock
33
+ 32,iron_ore,7368816,rock
34
+ 33,coal_ore,7368816,rock
35
+ 34,oak_log,9402184,tree
36
+ 35,spruce_log,9402184,tree
37
+ 36,birch_log,9402184,tree
38
+ 37,jungle_log,9402184,tree
39
+ 38,acacia_log,9402184,tree
40
+ 39,dark_oak_log,9402184,tree
41
+ 40,stripped_spruce_log,9402184,wood
42
+ 41,stripped_birch_log,9402184,wood
43
+ 42,stripped_jungle_log,9402184,wood
44
+ 43,stripped_acacia_log,9402184,wood
45
+ 44,stripped_dark_oak_log,9402184,wood
46
+ 45,stripped_oak_log,9402184,wood
47
+ 46,oak_wood,9402184,wood
48
+ 47,spruce_wood,9402184,wood
49
+ 48,birch_wood,9402184,wood
50
+ 49,jungle_wood,9402184,wood
51
+ 50,acacia_wood,9402184,wood
52
+ 51,dark_oak_wood,9402184,wood
53
+ 52,stripped_oak_wood,9402184,wood
54
+ 53,stripped_spruce_wood,9402184,wood
55
+ 54,stripped_birch_wood,9402184,wood
56
+ 55,stripped_jungle_wood,9402184,wood
57
+ 56,stripped_acacia_wood,9402184,wood
58
+ 57,stripped_dark_oak_wood,9402184,wood
59
+ 58,oak_leaves,31744,tree
60
+ 59,spruce_leaves,31744,tree
61
+ 60,birch_leaves,31744,tree
62
+ 61,jungle_leaves,31744,tree
63
+ 62,acacia_leaves,31744,tree
64
+ 63,dark_oak_leaves,31744,tree
65
+ 64,sponge,15066419,
66
+ 65,wet_sponge,15066419,
67
+ 66,glass,0,
68
+ 67,lapis_ore,7368816,
69
+ 68,lapis_block,10987431,
70
+ 69,dispenser,7368816,
71
+ 70,sandstone,7368816,sand
72
+ 71,chiseled_sandstone,7368816,sand
73
+ 72,cut_sandstone,7368816,sand
74
+ 73,note_block,9402184,
75
+ 74,white_bed,13092807,
76
+ 75,orange_bed,13092807,
77
+ 76,magenta_bed,13092807,
78
+ 77,light_blue_bed,13092807,
79
+ 78,yellow_bed,13092807,
80
+ 79,lime_bed,13092807,
81
+ 80,pink_bed,13092807,
82
+ 81,gray_bed,13092807,
83
+ 82,light_gray_bed,13092807,
84
+ 83,cyan_bed,13092807,
85
+ 84,purple_bed,13092807,
86
+ 85,blue_bed,13092807,
87
+ 86,brown_bed,13092807,
88
+ 87,green_bed,13092807,
89
+ 88,red_bed,13092807,
90
+ 89,black_bed,13092807,
91
+ 90,powered_rail,0,
92
+ 91,detector_rail,0,
93
+ 92,sticky_piston,7368816,
94
+ 93,cobweb,13092807,
95
+ 94,grass,31744,grass
96
+ 95,fern,31744,grass
97
+ 96,dead_bush,31744,grass
98
+ 97,seagrass,4210943,water
99
+ 98,tall_seagrass,4210943,water
100
+ 99,piston,7368816,
101
+ 100,piston_head,7368816,
102
+ 101,white_wool,13092807,
103
+ 102,orange_wool,13092807,
104
+ 103,magenta_wool,13092807,
105
+ 104,light_blue_wool,13092807,
106
+ 105,yellow_wool,13092807,
107
+ 106,lime_wool,13092807,
108
+ 107,pink_wool,13092807,
109
+ 108,gray_wool,13092807,
110
+ 109,light_gray_wool,13092807,
111
+ 110,cyan_wool,13092807,
112
+ 111,purple_wool,13092807,
113
+ 112,blue_wool,13092807,
114
+ 113,brown_wool,13092807,
115
+ 114,green_wool,13092807,
116
+ 115,red_wool,13092807,
117
+ 116,black_wool,13092807,
118
+ 117,moving_piston,7368816,
119
+ 118,dandelion,31744,flower
120
+ 119,poppy,31744,flower
121
+ 120,blue_orchid,31744,flower
122
+ 121,allium,31744,flower
123
+ 122,azure_bluet,31744,flower
124
+ 123,red_tulip,31744,flower
125
+ 124,orange_tulip,31744,flower
126
+ 125,white_tulip,31744,flower
127
+ 126,pink_tulip,31744,flower
128
+ 127,oxeye_daisy,31744,flower
129
+ 128,cornflower,31744,flower
130
+ 129,wither_rose,31744,flower
131
+ 130,lily_of_the_valley,31744,flower
132
+ 131,brown_mushroom,31744,flower
133
+ 132,red_mushroom,31744,flower
134
+ 133,gold_block,10987431,
135
+ 134,iron_block,10987431,
136
+ 135,bricks,7368816,
137
+ 136,tnt,16711680,
138
+ 137,bookshelf,9402184,
139
+ 138,mossy_cobblestone,7368816,
140
+ 139,obsidian,7368816,
141
+ 140,torch,0,
142
+ 141,wall_torch,0,
143
+ 142,fire,0,
144
+ 143,spawner,7368816,
145
+ 144,oak_stairs,9402184,
146
+ 145,chest,9402184,
147
+ 146,redstone_wire,0,
148
+ 147,diamond_ore,7368816,
149
+ 148,diamond_block,10987431,
150
+ 149,crafting_table,9402184,
151
+ 150,wheat,31744,
152
+ 151,farmland,9923917,
153
+ 152,furnace,7368816,
154
+ 153,oak_sign,9402184,
155
+ 154,spruce_sign,9402184,
156
+ 155,birch_sign,9402184,
157
+ 156,acacia_sign,9402184,
158
+ 157,jungle_sign,9402184,
159
+ 158,dark_oak_sign,9402184,
160
+ 159,oak_door,9402184,
161
+ 160,ladder,0,
162
+ 161,rail,0,
163
+ 162,cobblestone_stairs,7368816,
164
+ 163,oak_wall_sign,9402184,
165
+ 164,spruce_wall_sign,9402184,
166
+ 165,birch_wall_sign,9402184,
167
+ 166,acacia_wall_sign,9402184,
168
+ 167,jungle_wall_sign,9402184,
169
+ 168,dark_oak_wall_sign,9402184,
170
+ 169,lever,0,
171
+ 170,stone_pressure_plate,7368816,
172
+ 171,iron_door,10987431,
173
+ 172,oak_pressure_plate,9402184,
174
+ 173,spruce_pressure_plate,9402184,
175
+ 174,birch_pressure_plate,9402184,
176
+ 175,jungle_pressure_plate,9402184,
177
+ 176,acacia_pressure_plate,9402184,
178
+ 177,dark_oak_pressure_plate,9402184,
179
+ 178,redstone_ore,7368816,
180
+ 179,redstone_torch,0,
181
+ 180,redstone_wall_torch,0,
182
+ 181,stone_button,0,
183
+ 182,snow,16777215,snow
184
+ 183,ice,10526975,snow
185
+ 184,snow_block,16777215,snow
186
+ 185,cactus,31744,plant
187
+ 186,clay,10791096,
188
+ 187,sugar_cane,31744,plant
189
+ 188,jukebox,9402184,
190
+ 189,oak_fence,9402184,
191
+ 190,pumpkin,31744,
192
+ 191,netherrack,7368816,
193
+ 192,soul_sand,16247203,
194
+ 193,glowstone,0,
195
+ 194,nether_portal,0,
196
+ 195,carved_pumpkin,31744,
197
+ 196,jack_o_lantern,31744,
198
+ 197,cake,0,
199
+ 198,repeater,0,
200
+ 199,white_stained_glass,0,
201
+ 200,orange_stained_glass,0,
202
+ 201,magenta_stained_glass,0,
203
+ 202,light_blue_stained_glass,0,
204
+ 203,yellow_stained_glass,0,
205
+ 204,lime_stained_glass,0,
206
+ 205,pink_stained_glass,0,
207
+ 206,gray_stained_glass,0,
208
+ 207,light_gray_stained_glass,0,
209
+ 208,cyan_stained_glass,0,
210
+ 209,purple_stained_glass,0,
211
+ 210,blue_stained_glass,0,
212
+ 211,brown_stained_glass,0,
213
+ 212,green_stained_glass,0,
214
+ 213,red_stained_glass,0,
215
+ 214,black_stained_glass,0,
216
+ 215,oak_trapdoor,9402184,
217
+ 216,spruce_trapdoor,9402184,
218
+ 217,birch_trapdoor,9402184,
219
+ 218,jungle_trapdoor,9402184,
220
+ 219,acacia_trapdoor,9402184,
221
+ 220,dark_oak_trapdoor,9402184,
222
+ 221,stone_bricks,7368816,
223
+ 222,mossy_stone_bricks,7368816,
224
+ 223,cracked_stone_bricks,7368816,
225
+ 224,chiseled_stone_bricks,7368816,
226
+ 225,infested_stone,10791096,
227
+ 226,infested_cobblestone,10791096,
228
+ 227,infested_stone_bricks,10791096,
229
+ 228,infested_mossy_stone_bricks,10791096,
230
+ 229,infested_cracked_stone_bricks,10791096,
231
+ 230,infested_chiseled_stone_bricks,10791096,
232
+ 231,brown_mushroom_block,9402184,tree
233
+ 232,red_mushroom_block,9402184,tree
234
+ 233,mushroom_stem,9402184,tree
235
+ 234,iron_bars,10987431,
236
+ 235,glass_pane,0,
237
+ 236,melon,31744,
238
+ 237,attached_pumpkin_stem,31744,
239
+ 238,attached_melon_stem,31744,
240
+ 239,pumpkin_stem,31744,
241
+ 240,melon_stem,31744,
242
+ 241,vine,31744,plant
243
+ 242,oak_fence_gate,9402184,
244
+ 243,brick_stairs,7368816,
245
+ 244,stone_brick_stairs,7368816,
246
+ 245,mycelium,8368696,
247
+ 246,lily_pad,31744,grass
248
+ 247,nether_bricks,7368816,
249
+ 248,nether_brick_fence,7368816,
250
+ 249,nether_brick_stairs,7368816,
251
+ 250,nether_wart,31744,
252
+ 251,enchanting_table,7368816,
253
+ 252,brewing_stand,10987431,
254
+ 253,cauldron,10987431,
255
+ 254,end_portal,0,
256
+ 255,end_portal_frame,7368816,
257
+ 256,end_stone,7368816,
258
+ 257,dragon_egg,31744,
259
+ 258,redstone_lamp,0,
260
+ 259,cocoa,31744,
261
+ 260,sandstone_stairs,7368816,
262
+ 261,emerald_ore,7368816,
263
+ 262,ender_chest,7368816,
264
+ 263,tripwire_hook,0,
265
+ 264,tripwire,0,
266
+ 265,emerald_block,10987431,
267
+ 266,spruce_stairs,9402184,
268
+ 267,birch_stairs,9402184,
269
+ 268,jungle_stairs,9402184,
270
+ 269,command_block,10987431,
271
+ 270,beacon,0,
272
+ 271,cobblestone_wall,7368816,
273
+ 272,mossy_cobblestone_wall,7368816,
274
+ 273,flower_pot,0,
275
+ 274,potted_oak_sapling,0,
276
+ 275,potted_spruce_sapling,0,
277
+ 276,potted_birch_sapling,0,
278
+ 277,potted_jungle_sapling,0,
279
+ 278,potted_acacia_sapling,0,
280
+ 279,potted_dark_oak_sapling,0,
281
+ 280,potted_fern,0,
282
+ 281,potted_dandelion,0,
283
+ 282,potted_poppy,0,
284
+ 283,potted_blue_orchid,0,
285
+ 284,potted_allium,0,
286
+ 285,potted_azure_bluet,0,
287
+ 286,potted_red_tulip,0,
288
+ 287,potted_orange_tulip,0,
289
+ 288,potted_white_tulip,0,
290
+ 289,potted_pink_tulip,0,
291
+ 290,potted_oxeye_daisy,0,
292
+ 291,potted_cornflower,0,
293
+ 292,potted_lily_of_the_valley,0,
294
+ 293,potted_wither_rose,0,
295
+ 294,potted_red_mushroom,0,
296
+ 295,potted_brown_mushroom,0,
297
+ 296,potted_dead_bush,0,
298
+ 297,potted_cactus,0,
299
+ 298,carrots,31744,
300
+ 299,potatoes,31744,
301
+ 300,oak_button,0,
302
+ 301,spruce_button,0,
303
+ 302,birch_button,0,
304
+ 303,jungle_button,0,
305
+ 304,acacia_button,0,
+ 305,dark_oak_button,0,
+ 306,skeleton_skull,0,
+ 307,skeleton_wall_skull,0,
+ 308,wither_skeleton_skull,0,
+ 309,wither_skeleton_wall_skull,0,
+ 310,zombie_head,0,
+ 311,zombie_wall_head,0,
+ 312,player_head,0,
+ 313,player_wall_head,0,
+ 314,creeper_head,0,
+ 315,creeper_wall_head,0,
+ 316,dragon_head,0,
+ 317,dragon_wall_head,0,
+ 318,anvil,10987431,
+ 319,chipped_anvil,10987431,
+ 320,damaged_anvil,10987431,
+ 321,trapped_chest,9402184,
+ 322,light_weighted_pressure_plate,10987431,
+ 323,heavy_weighted_pressure_plate,10987431,
+ 324,comparator,0,
+ 325,daylight_detector,9402184,
+ 326,redstone_block,10987431,
+ 327,nether_quartz_ore,7368816,
+ 328,hopper,10987431,
+ 329,quartz_block,7368816,
+ 330,chiseled_quartz_block,7368816,
+ 331,quartz_pillar,7368816,
+ 332,quartz_stairs,7368816,
+ 333,activator_rail,0,
+ 334,dropper,7368816,
+ 335,white_terracotta,7368816,
+ 336,orange_terracotta,7368816,
+ 337,magenta_terracotta,7368816,
+ 338,light_blue_terracotta,7368816,
+ 339,yellow_terracotta,7368816,
+ 340,lime_terracotta,7368816,
+ 341,pink_terracotta,7368816,
+ 342,gray_terracotta,7368816,
+ 343,light_gray_terracotta,7368816,
+ 344,cyan_terracotta,7368816,
+ 345,purple_terracotta,7368816,
+ 346,blue_terracotta,7368816,
+ 347,brown_terracotta,7368816,
+ 348,green_terracotta,7368816,
+ 349,red_terracotta,7368816,
+ 350,black_terracotta,7368816,
+ 351,white_stained_glass_pane,0,
+ 352,orange_stained_glass_pane,0,
+ 353,magenta_stained_glass_pane,0,
+ 354,light_blue_stained_glass_pane,0,
+ 355,yellow_stained_glass_pane,0,
+ 356,lime_stained_glass_pane,0,
+ 357,pink_stained_glass_pane,0,
+ 358,gray_stained_glass_pane,0,
+ 359,light_gray_stained_glass_pane,0,
+ 360,cyan_stained_glass_pane,0,
+ 361,purple_stained_glass_pane,0,
+ 362,blue_stained_glass_pane,0,
+ 363,brown_stained_glass_pane,0,
+ 364,green_stained_glass_pane,0,
+ 365,red_stained_glass_pane,0,
+ 366,black_stained_glass_pane,0,
+ 367,acacia_stairs,9402184,
+ 368,dark_oak_stairs,9402184,
+ 369,slime_block,10791096,
+ 370,barrier,0,
+ 371,iron_trapdoor,10987431,
+ 372,prismarine,7368816,
+ 373,prismarine_bricks,7368816,
+ 374,dark_prismarine,7368816,
+ 375,prismarine_stairs,7368816,
+ 376,prismarine_brick_stairs,7368816,
+ 377,dark_prismarine_stairs,7368816,
+ 378,prismarine_slab,7368816,
+ 379,prismarine_brick_slab,7368816,
+ 380,dark_prismarine_slab,7368816,
+ 381,sea_lantern,0,
+ 382,hay_block,8368696,
+ 383,white_carpet,13092807,
+ 384,orange_carpet,13092807,
+ 385,magenta_carpet,13092807,
+ 386,light_blue_carpet,13092807,
+ 387,yellow_carpet,13092807,
+ 388,lime_carpet,13092807,
+ 389,pink_carpet,13092807,
+ 390,gray_carpet,13092807,
+ 391,light_gray_carpet,13092807,
+ 392,cyan_carpet,13092807,
+ 393,purple_carpet,13092807,
+ 394,blue_carpet,13092807,
+ 395,brown_carpet,13092807,
+ 396,green_carpet,13092807,
+ 397,red_carpet,13092807,
+ 398,black_carpet,13092807,
+ 399,terracotta,7368816,
+ 400,coal_block,7368816,
+ 401,packed_ice,10526975,
+ 402,sunflower,31744,flower
+ 403,lilac,31744,flower
+ 404,rose_bush,31744,flower
+ 405,peony,31744,flower
+ 406,tall_grass,31744,plant
+ 407,large_fern,31744,plant
+ 408,white_banner,9402184,
+ 409,orange_banner,9402184,
+ 410,magenta_banner,9402184,
+ 411,light_blue_banner,9402184,
+ 412,yellow_banner,9402184,
+ 413,lime_banner,9402184,
+ 414,pink_banner,9402184,
+ 415,gray_banner,9402184,
+ 416,light_gray_banner,9402184,
+ 417,cyan_banner,9402184,
+ 418,purple_banner,9402184,
+ 419,blue_banner,9402184,
+ 420,brown_banner,9402184,
+ 421,green_banner,9402184,
+ 422,red_banner,9402184,
+ 423,black_banner,9402184,
+ 424,white_wall_banner,9402184,
+ 425,orange_wall_banner,9402184,
+ 426,magenta_wall_banner,9402184,
+ 427,light_blue_wall_banner,9402184,
+ 428,yellow_wall_banner,9402184,
+ 429,lime_wall_banner,9402184,
+ 430,pink_wall_banner,9402184,
+ 431,gray_wall_banner,9402184,
+ 432,light_gray_wall_banner,9402184,
+ 433,cyan_wall_banner,9402184,
+ 434,purple_wall_banner,9402184,
+ 435,blue_wall_banner,9402184,
+ 436,brown_wall_banner,9402184,
+ 437,green_wall_banner,9402184,
+ 438,red_wall_banner,9402184,
+ 439,black_wall_banner,9402184,
+ 440,red_sandstone,7368816,
+ 441,chiseled_red_sandstone,7368816,
+ 442,cut_red_sandstone,7368816,
+ 443,red_sandstone_stairs,7368816,
+ 444,oak_slab,9402184,
+ 445,spruce_slab,9402184,
+ 446,birch_slab,9402184,
+ 447,jungle_slab,9402184,
+ 448,acacia_slab,9402184,
+ 449,dark_oak_slab,9402184,
+ 450,stone_slab,7368816,
+ 451,smooth_stone_slab,7368816,
+ 452,sandstone_slab,7368816,
+ 453,cut_sandstone_slab,7368816,
+ 454,petrified_oak_slab,7368816,
+ 455,cobblestone_slab,7368816,
+ 456,brick_slab,7368816,
+ 457,stone_brick_slab,7368816,
+ 458,nether_brick_slab,7368816,
+ 459,quartz_slab,7368816,
+ 460,red_sandstone_slab,7368816,
+ 461,cut_red_sandstone_slab,7368816,
+ 462,purpur_slab,7368816,
+ 463,smooth_stone,7368816,
+ 464,smooth_sandstone,7368816,
+ 465,smooth_quartz,7368816,
+ 466,smooth_red_sandstone,7368816,
+ 467,spruce_fence_gate,9402184,
+ 468,birch_fence_gate,9402184,
+ 469,jungle_fence_gate,9402184,
+ 470,acacia_fence_gate,9402184,
+ 471,dark_oak_fence_gate,9402184,
+ 472,spruce_fence,9402184,
+ 473,birch_fence,9402184,
+ 474,jungle_fence,9402184,
+ 475,acacia_fence,9402184,
+ 476,dark_oak_fence,9402184,
+ 477,spruce_door,9402184,
+ 478,birch_door,9402184,
+ 479,jungle_door,9402184,
+ 480,acacia_door,9402184,
+ 481,dark_oak_door,9402184,
+ 482,end_rod,0,
+ 483,chorus_plant,31744,
+ 484,chorus_flower,31744,
+ 485,purpur_block,7368816,
+ 486,purpur_pillar,7368816,
+ 487,purpur_stairs,7368816,
+ 488,end_stone_bricks,7368816,
+ 489,beetroots,31744,
+ 490,grass_path,9923917,
+ 491,end_gateway,0,
+ 492,repeating_command_block,10987431,
+ 493,chain_command_block,10987431,
+ 494,frosted_ice,10526975,
+ 495,magma_block,7368816,
+ 496,nether_wart_block,8368696,
+ 497,red_nether_bricks,7368816,
+ 498,bone_block,7368816,
+ 499,structure_void,0,
+ 500,observer,7368816,
+ 501,shulker_box,8339378,
+ 502,white_shulker_box,8339378,
+ 503,orange_shulker_box,8339378,
+ 504,magenta_shulker_box,8339378,
+ 505,light_blue_shulker_box,8339378,
+ 506,yellow_shulker_box,8339378,
+ 507,lime_shulker_box,8339378,
+ 508,pink_shulker_box,8339378,
+ 509,gray_shulker_box,8339378,
+ 510,light_gray_shulker_box,8339378,
+ 511,cyan_shulker_box,8339378,
+ 512,purple_shulker_box,8339378,
+ 513,blue_shulker_box,8339378,
+ 514,brown_shulker_box,8339378,
+ 515,green_shulker_box,8339378,
+ 516,red_shulker_box,8339378,
+ 517,black_shulker_box,8339378,
+ 518,white_glazed_terracotta,7368816,
+ 519,orange_glazed_terracotta,7368816,
+ 520,magenta_glazed_terracotta,7368816,
+ 521,light_blue_glazed_terracotta,7368816,
+ 522,yellow_glazed_terracotta,7368816,
+ 523,lime_glazed_terracotta,7368816,
+ 524,pink_glazed_terracotta,7368816,
+ 525,gray_glazed_terracotta,7368816,
+ 526,light_gray_glazed_terracotta,7368816,
+ 527,cyan_glazed_terracotta,7368816,
+ 528,purple_glazed_terracotta,7368816,
+ 529,blue_glazed_terracotta,7368816,
+ 530,brown_glazed_terracotta,7368816,
+ 531,green_glazed_terracotta,7368816,
+ 532,red_glazed_terracotta,7368816,
+ 533,black_glazed_terracotta,7368816,
+ 534,white_concrete,7368816,
+ 535,orange_concrete,7368816,
+ 536,magenta_concrete,7368816,
+ 537,light_blue_concrete,7368816,
+ 538,yellow_concrete,7368816,
+ 539,lime_concrete,7368816,
+ 540,pink_concrete,7368816,
+ 541,gray_concrete,7368816,
+ 542,light_gray_concrete,7368816,
+ 543,cyan_concrete,7368816,
+ 544,purple_concrete,7368816,
+ 545,blue_concrete,7368816,
+ 546,brown_concrete,7368816,
+ 547,green_concrete,7368816,
+ 548,red_concrete,7368816,
+ 549,black_concrete,7368816,
+ 550,white_concrete_powder,16247203,
+ 551,orange_concrete_powder,16247203,
+ 552,magenta_concrete_powder,16247203,
+ 553,light_blue_concrete_powder,16247203,
+ 554,yellow_concrete_powder,16247203,
+ 555,lime_concrete_powder,16247203,
+ 556,pink_concrete_powder,16247203,
+ 557,gray_concrete_powder,16247203,
+ 558,light_gray_concrete_powder,16247203,
+ 559,cyan_concrete_powder,16247203,
+ 560,purple_concrete_powder,16247203,
+ 561,blue_concrete_powder,16247203,
+ 562,brown_concrete_powder,16247203,
+ 563,green_concrete_powder,16247203,
+ 564,red_concrete_powder,16247203,
+ 565,black_concrete_powder,16247203,
+ 566,kelp,4210943,
+ 567,kelp_plant,4210943,
+ 568,dried_kelp_block,8368696,
+ 569,turtle_egg,31744,
+ 570,dead_tube_coral_block,7368816,
+ 571,dead_brain_coral_block,7368816,
+ 572,dead_bubble_coral_block,7368816,
+ 573,dead_fire_coral_block,7368816,
+ 574,dead_horn_coral_block,7368816,
+ 575,tube_coral_block,7368816,
+ 576,brain_coral_block,7368816,
+ 577,bubble_coral_block,7368816,
+ 578,fire_coral_block,7368816,
+ 579,horn_coral_block,7368816,
+ 580,dead_tube_coral,7368816,
+ 581,dead_brain_coral,7368816,
+ 582,dead_bubble_coral,7368816,
+ 583,dead_fire_coral,7368816,
+ 584,dead_horn_coral,7368816,
+ 585,tube_coral,4210943,
+ 586,brain_coral,4210943,
+ 587,bubble_coral,4210943,
+ 588,fire_coral,4210943,
+ 589,horn_coral,4210943,
+ 590,dead_tube_coral_fan,7368816,
+ 591,dead_brain_coral_fan,7368816,
+ 592,dead_bubble_coral_fan,7368816,
+ 593,dead_fire_coral_fan,7368816,
+ 594,dead_horn_coral_fan,7368816,
+ 595,tube_coral_fan,4210943,
+ 596,brain_coral_fan,4210943,
+ 597,bubble_coral_fan,4210943,
+ 598,fire_coral_fan,4210943,
+ 599,horn_coral_fan,4210943,
+ 600,dead_tube_coral_wall_fan,7368816,
+ 601,dead_brain_coral_wall_fan,7368816,
+ 602,dead_bubble_coral_wall_fan,7368816,
+ 603,dead_fire_coral_wall_fan,7368816,
+ 604,dead_horn_coral_wall_fan,7368816,
+ 605,tube_coral_wall_fan,4210943,
+ 606,brain_coral_wall_fan,4210943,
+ 607,bubble_coral_wall_fan,4210943,
+ 608,fire_coral_wall_fan,4210943,
+ 609,horn_coral_wall_fan,4210943,
+ 610,sea_pickle,4210943,
+ 611,blue_ice,10526975,
+ 612,conduit,0,
+ 613,bamboo_sapling,9402184,plant
+ 614,bamboo,9402184,plant
+ 615,potted_bamboo,0,
+ 616,void_air,0,dirt
+ 617,cave_air,0,dirt
+ 618,bubble_column,4210943,
+ 619,polished_granite_stairs,7368816,
+ 620,smooth_red_sandstone_stairs,7368816,
+ 621,mossy_stone_brick_stairs,7368816,
+ 622,polished_diorite_stairs,7368816,
+ 623,mossy_cobblestone_stairs,7368816,
+ 624,end_stone_brick_stairs,7368816,
+ 625,stone_stairs,7368816,
+ 626,smooth_sandstone_stairs,7368816,
+ 627,smooth_quartz_stairs,7368816,
+ 628,granite_stairs,7368816,
+ 629,andesite_stairs,7368816,
+ 630,red_nether_brick_stairs,7368816,
+ 631,polished_andesite_stairs,7368816,
+ 632,diorite_stairs,7368816,
+ 633,polished_granite_slab,7368816,
+ 634,smooth_red_sandstone_slab,7368816,
+ 635,mossy_stone_brick_slab,7368816,
+ 636,polished_diorite_slab,7368816,
+ 637,mossy_cobblestone_slab,7368816,
+ 638,end_stone_brick_slab,7368816,
+ 639,smooth_sandstone_slab,7368816,
+ 640,smooth_quartz_slab,7368816,
+ 641,granite_slab,7368816,
+ 642,andesite_slab,7368816,
+ 643,red_nether_brick_slab,7368816,
+ 644,polished_andesite_slab,7368816,
+ 645,diorite_slab,7368816,
+ 646,brick_wall,7368816,
+ 647,prismarine_wall,7368816,
+ 648,red_sandstone_wall,7368816,
+ 649,mossy_stone_brick_wall,7368816,
+ 650,granite_wall,7368816,
+ 651,stone_brick_wall,7368816,
+ 652,nether_brick_wall,7368816,
+ 653,andesite_wall,7368816,
+ 654,red_nether_brick_wall,7368816,
+ 655,sandstone_wall,7368816,
+ 656,end_stone_brick_wall,7368816,
+ 657,diorite_wall,7368816,
+ 658,scaffolding,0,
+ 659,loom,9402184,
+ 660,barrel,9402184,
+ 661,smoker,7368816,
+ 662,blast_furnace,7368816,
+ 663,cartography_table,9402184,
+ 664,fletching_table,9402184,
+ 665,grindstone,10987431,
+ 666,lectern,9402184,
+ 667,smithing_table,9402184,
+ 668,stonecutter,7368816,
+ 669,bell,10987431,
+ 670,lantern,10987431,
+ 671,campfire,9402184,
+ 672,sweet_berry_bush,31744,
+ 673,structure_block,10987431,
+ 674,jigsaw,10987431,
+ 675,composter,9402184,
+ 676,bee_nest,9402184,
+ 677,beehive,9402184,
+ 678,honey_block,10791096,
+ 679,honeycomb_block,10791096,
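
The rows above are the tail of imaginaire/model_utils/gancraft/id2name_gg.csv: each entry is id,name,color[,category], where the third column packs an RGB color into one 24-bit integer (for example 7368816 decodes to the gray (112, 112, 112)) and the optional fourth column tags vegetation/terrain classes such as flower, plant, or dirt. A minimal loader sketch follows; the helper name and the column handling are illustrative assumptions, not code from this repository.

import csv

def load_id2color(csv_path='imaginaire/model_utils/gancraft/id2name_gg.csv'):
    # Hypothetical helper: parse 'id,name,color[,category]' rows into {id: (r, g, b)}.
    id2color = {}
    with open(csv_path, newline='') as f:
        for row in csv.reader(f):
            if not row or not row[0].isdigit():
                continue  # skip a header row or blank line, if present
            packed = int(row[2])
            # Unpack the 24-bit integer into an (R, G, B) triple.
            id2color[int(row[0])] = ((packed >> 16) & 255, (packed >> 8) & 255, packed & 255)
    return id2color
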
imaginaire/model_utils/gancraft/loss.py ADDED
@@ -0,0 +1,96 @@
+ # Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # This work is made available under the Nvidia Source Code License-NC.
+ # To view a copy of this license, check out LICENSE.md
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class GANLoss(nn.Module):
+     def __init__(self, target_real_label=1.0, target_fake_label=0.0):
+         r"""GAN loss constructor.
+
+         Args:
+             target_real_label (float): Desired output label for the real images.
+             target_fake_label (float): Desired output label for the fake images.
+         """
+         super(GANLoss, self).__init__()
+         self.real_label = target_real_label
+         self.fake_label = target_fake_label
+         self.real_label_tensor = None
+         self.fake_label_tensor = None
+
+     def forward(self, input_x, t_real, weight=None,
+                 reduce_dim=True, dis_update=True):
+         r"""GAN loss computation.
+
+         Args:
+             input_x (tensor or list of tensors): Output values.
+             t_real (boolean): Is this output value for real images.
+             reduce_dim (boolean): Whether we reduce the dimensions first. This makes a difference when we use
+                 multi-resolution discriminators.
+             weight (float): Weight to scale the loss value.
+             dis_update (boolean): Updating the discriminator or the generator.
+         Returns:
+             loss (tensor): Loss value.
+         """
+         if isinstance(input_x, list):
+             loss = 0
+             for pred_i in input_x:
+                 if isinstance(pred_i, list):
+                     pred_i = pred_i[-1]
+                 loss_tensor = self.loss(pred_i, t_real, weight,
+                                         reduce_dim, dis_update)
+                 bs = 1 if len(loss_tensor.size()) == 0 else loss_tensor.size(0)
+                 new_loss = torch.mean(loss_tensor.view(bs, -1), dim=1)
+                 loss += new_loss
+             return loss / len(input_x)
+         else:
+             return self.loss(input_x, t_real, weight, reduce_dim, dis_update)
+
+     def loss(self, input_x, t_real, weight=None,
+              reduce_dim=True, dis_update=True):
+         r"""N+1 label GAN loss computation.
+
+         Args:
+             input_x (tensor): Output values.
+             t_real (boolean): Is this output value for real images.
+             reduce_dim (boolean): Whether we reduce the dimensions first. This makes a difference when we use
+                 multi-resolution discriminators.
+             weight (float): Weight to scale the loss value.
+             dis_update (boolean): Updating the discriminator or the generator.
+         Returns:
+             loss (tensor): Loss value.
+         """
+         assert reduce_dim is True
+         pred = input_x['pred'].clone()
+         label = input_x['label'].clone()
+         batch_size = pred.size(0)
+
+         # ignore label 0
+         label[:, 0, ...] = 0
+         pred[:, 0, ...] = 0
+         pred = F.log_softmax(pred, dim=1)
+         assert pred.size(1) == (label.size(1) + 1)
+         if dis_update:
+             if t_real:
+                 pred_real = pred[:, :-1, :, :]
+                 loss = - label * pred_real
+                 loss = torch.sum(loss, dim=1, keepdim=True)
+             else:
+                 pred_fake = pred[:, -1, None, :, :]  # N plus 1
+                 loss = - pred_fake
+         else:
+             assert t_real, "GAN loss must be aiming for real."
+             pred_real = pred[:, :-1, :, :]
+             loss = - label * pred_real
+             loss = torch.sum(loss, dim=1, keepdim=True)
+
+         if weight is not None:
+             loss = loss * weight
+         if reduce_dim:
+             loss = torch.mean(loss)
+         else:
+             loss = loss.view(batch_size, -1).mean(dim=1)
+         return loss
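
For reference, a short usage sketch of the N+1-label GANLoss defined above: the loss expects a dict whose 'pred' logits carry one channel per semantic class plus a trailing fake channel, and whose 'label' carries the per-class maps (channel 0 is zeroed out by the loss). The shapes and dummy tensors below are illustrative assumptions, not taken from the training code.

import torch
from imaginaire.model_utils.gancraft.loss import GANLoss

criterion = GANLoss()
pred = torch.randn(2, 6, 8, 8)   # discriminator logits: 5 semantic classes + 1 fake channel
label = torch.zeros(2, 5, 8, 8)  # per-class label maps; channel 0 is ignored by the loss
label[:, 1, ...] = 1.0

# Discriminator update: real samples pushed toward their class, fakes toward the last channel.
d_real = criterion({'pred': pred, 'label': label}, t_real=True, dis_update=True)
d_fake = criterion({'pred': pred, 'label': label}, t_real=False, dis_update=True)

# Generator update: only the real-directed term is valid here (t_real must be True).
g_loss = criterion({'pred': pred, 'label': label}, t_real=True, dis_update=False)
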