blumenstiel committed on
Commit
2b43c93
·
1 Parent(s): 78711ff

Add inference code

assets/model_architecture.png ADDED

Git LFS Details

  • SHA256: 30d14e91bfaf1ec39a182254bb7cbdf3b98ae87d941b846c96d2042269c46cdb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.84 MB
config.json ADDED
@@ -0,0 +1,26 @@
{
  "architecture": "prithvi_eo_v2_tiny",
  "num_features": 192,
  "pretrained_cfg": {
    "img_size": 224,
    "num_frames": 4,
    "patch_size": [1, 16, 16],
    "in_chans": 6,
    "embed_dim": 192,
    "depth": 12,
    "num_heads": 3,
    "decoder_embed_dim": 512,
    "decoder_depth": 8,
    "decoder_num_heads": 16,
    "mlp_ratio": 4,
    "coords_encoding": ["time", "location"],
    "coords_scale_learn": true,
    "mask_ratio": 0.75,
    "norm_pix_loss": false,
    "bands": ["B02", "B03", "B04", "B05", "B06", "B07"],
    "mean": [1087.0, 1342.0, 1433.0, 2734.0, 1958.0, 1363.0],
    "std": [2248.0, 2179.0, 2178.0, 1850.0, 1242.0, 1049.0],
    "origin_url": "https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-tiny",
    "paper_ids": "arXiv:X.X"
  }
}
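The "pretrained_cfg" block mirrors the keyword arguments of PrithviMAE in prithvi_mae.py, so the config can be unpacked directly into the model constructor (this is what inference.py does via yaml.safe_load). A minimal sketch, assuming the config.json added above sits in the working directory; extra keys such as "bands", "mean" and "std" are absorbed by the constructor's **kwargs:

import json
from prithvi_mae import PrithviMAE

# Load the pretrained_cfg from this commit and build the tiny model from it.
with open("config.json") as f:
    cfg = json.load(f)["pretrained_cfg"]

model = PrithviMAE(**cfg)
print(sum(p.numel() for p in model.parameters()))  # parameter count of the tiny variant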
examples/Mexico_HLS.S30.T13REM.2018026T173609.v2.0_cropped.tif ADDED

Git LFS Details

  • SHA256: e34c1e8f6b69092bbf16f87da1a0c2337e8e53f28d172d8076e2efab292b795d
  • Pointer size: 132 Bytes
  • Size of remote file: 3.01 MB
examples/Mexico_HLS.S30.T13REM.2018106T172859.v2.0_cropped.tif ADDED

Git LFS Details

  • SHA256: a4b24a34d83d25cac7dbcb7742db3f5b1e4849e5773172c1f0fc43c541bcd3fd
  • Pointer size: 132 Bytes
  • Size of remote file: 3.01 MB
examples/Mexico_HLS.S30.T13REM.2018201T172901.v2.0_cropped.tif ADDED

Git LFS Details

  • SHA256: fce050cc821ebec2974e85cfe702c0f093d74caf12196adb7ee88c8a30773d4f
  • Pointer size: 132 Bytes
  • Size of remote file: 3.01 MB
examples/Mexico_HLS.S30.T13REM.2018266T173029.v2.0_cropped.tif ADDED

Git LFS Details

  • SHA256: f7f8c67c32027cd663f48226a5932c6c8119a55fb6e80a02636dea57f4733963
  • Pointer size: 132 Bytes
  • Size of remote file: 3.01 MB
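The four example GeoTIFFs are consecutive HLS S30 acquisitions of the same tile (T13REM) from 2018, which inference.py below treats as four time steps of one location. A small sketch for sanity-checking one of the files, assuming rasterio is installed; the config above lists six bands (B02–B07), so six bands should be reported:

import rasterio

with rasterio.open("examples/Mexico_HLS.S30.T13REM.2018026T173609.v2.0_cropped.tif") as src:
    print(src.count, src.width, src.height, src.dtypes[0])
    print(src.lnglat())  # approximate (lon, lat) of the chip center, used as location coords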
inference.py ADDED
@@ -0,0 +1,528 @@
import argparse
import os
import re
import datetime
from typing import List, Union

import numpy as np
import rasterio
import torch
import yaml
from einops import rearrange

from prithvi_mae import PrithviMAE

NO_DATA = -9999
NO_DATA_FLOAT = 0.0001
OFFSET = 0
PERCENTILE = 99.9


def process_channel_group(orig_img, new_img, channels, mean, std):
    """Process *orig_img* and *new_img* for RGB visualization. Each band is rescaled back to the
    original range using *mean* and *std*, then the lowest and highest percentiles are
    removed to enhance contrast. Data is rescaled to the (0, 1) range and stacked channels-first.

    Args:
        orig_img: torch.Tensor representing the original image (reference) with shape (bands, H, W).
        new_img: torch.Tensor representing the image with shape (bands, H, W).
        channels: list of indices representing the RGB channels.
        mean: list of mean values for each band.
        std: list of std values for each band.

    Returns:
        torch.Tensor with shape (num_channels, height, width) for the original image
        torch.Tensor with shape (num_channels, height, width) for the other image
    """

    mean = torch.tensor(np.asarray(mean)[:, None, None])  # C H W
    std = torch.tensor(np.asarray(std)[:, None, None])
    orig_img = orig_img[channels, ...]
    valid_mask = torch.ones_like(orig_img, dtype=torch.bool)
    valid_mask[orig_img == NO_DATA_FLOAT] = False

    # Back to original data range
    orig_img = (orig_img * std[channels]) + mean[channels]
    new_img = (new_img[channels, ...] * std[channels]) + mean[channels]

    # Rescale (enhancing contrast)
    max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
    min_value = OFFSET

    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)
    new_img = torch.clamp((new_img - min_value) / (max_value - min_value), 0, 1)

    # No data as zeros
    orig_img[~valid_mask] = 0
    new_img[~valid_mask] = 0

    return orig_img, new_img


def read_geotiff(file_path: str):
    """Read all bands from *file_path* and return the image, meta info and center coordinates.

    Args:
        file_path: path to image file.

    Returns:
        np.ndarray with shape (bands, height, width)
        meta info dict
        (lon, lat) tuple, or None if the coordinates cannot be read
    """

    with rasterio.open(file_path) as src:
        img = src.read()
        meta = src.meta
        try:
            coords = src.lnglat()
        except Exception:
            # Cannot read coords
            coords = None

    return img, meta, coords


def save_geotiff(image, output_path: str, meta: dict):
    """Save a multi-band image as a GeoTIFF file.

    Args:
        image: np.ndarray with shape (bands, height, width)
        output_path: path where to save the image
        meta: dict with meta info.
    """

    with rasterio.open(output_path, "w", **meta) as dest:
        for i in range(image.shape[0]):
            dest.write(image[i, :, :], i + 1)

    return


def _convert_np_uint8(float_image: torch.Tensor):
    image = float_image.numpy() * 255.0
    image = image.astype(dtype=np.uint8)

    return image


def load_example(
    file_paths: List[str],
    mean: List[float],
    std: List[float],
    indices: Union[list[int], None] = None,
):
    """Build an input example by loading the images in *file_paths*.

    Args:
        file_paths: list of file paths.
        mean: list containing the mean value for each band in the images in *file_paths*.
        std: list containing the std value for each band in the images in *file_paths*.
        indices: optional list of band indices to select from the input images.

    Returns:
        np.array containing the created example
        list of [year, day-of-year] temporal coordinates per image
        list of (lon, lat) location coordinates per image
        list of meta info for each image in *file_paths*
    """

    imgs = []
    metas = []
    temporal_coords = []
    location_coords = []

    for file in file_paths:
        img, meta, coords = read_geotiff(file)

        # Rescaling (don't normalize on nodata)
        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
        if indices is not None:
            img = img[..., indices]
        img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)

        imgs.append(img)
        metas.append(meta)
        if coords is not None:
            location_coords.append(coords)

        try:
            match = re.search(r'(\d{7,8}T\d{6})', file)
            if match:
                year = int(match.group(1)[:4])
                julian_day = match.group(1).split('T')[0][4:]
                if len(julian_day) == 3:
                    julian_day = int(julian_day)
                else:
                    julian_day = datetime.datetime.strptime(julian_day, '%m%d').timetuple().tm_yday
                temporal_coords.append([year, julian_day])
        except Exception as e:
            print(f'Could not extract timestamp for {file} ({e})')

    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch dim

    return imgs, temporal_coords, location_coords, metas


def run_model(
    model: torch.nn.Module,
    input_data: torch.Tensor,
    temporal_coords: None | torch.Tensor,
    location_coords: None | torch.Tensor,
    mask_ratio: float,
    device: torch.device,
):
    """Run *model* with *input_data* and create images from the output tokens (mask, reconstructed + visible).

    Args:
        model: MAE model to run.
        input_data: torch.Tensor with shape (B, C, T, H, W).
        temporal_coords: temporal coordinates tensor or None.
        location_coords: location coordinates tensor or None.
        mask_ratio: mask ratio to use.
        device: device where the model should run.

    Returns:
        2 torch.Tensors with shape (B, C, T, H, W): the reconstruction and the mask image.
    """

    with torch.no_grad():
        x = input_data.to(device)

        _, pred, mask = model(x, temporal_coords, location_coords, mask_ratio)

    # Create mask and prediction images (un-patchify)
    mask_img = (
        model.unpatchify(mask.unsqueeze(-1).repeat(1, 1, pred.shape[-1])).detach().cpu()
    )
    pred_img = model.unpatchify(pred).detach().cpu()

    # Mix visible and predicted patches
    rec_img = input_data.clone()
    rec_img[mask_img == 1] = pred_img[mask_img == 1]  # binary mask: 0 is keep, 1 is remove

    # Switch zeros/ones in mask images so masked patches appear darker in plots (better visualization)
    mask_img = (~(mask_img.to(torch.bool))).to(torch.float)

    return rec_img, mask_img


def save_rgb_imgs(
    input_img, rec_img, mask_img, channels, mean, std, output_dir, meta_data
):
    """Wrapper function to save GeoTIFF images (original, reconstructed, masked) per timestamp.

    Args:
        input_img: input torch.Tensor with shape (C, T, H, W).
        rec_img: reconstructed torch.Tensor with shape (C, T, H, W).
        mask_img: mask torch.Tensor with shape (C, T, H, W).
        channels: list of indices representing the RGB channels.
        mean: list of mean values for each band.
        std: list of std values for each band.
        output_dir: directory where to save outputs.
        meta_data: list of dicts with geotiff meta info.
    """

    for t in range(input_img.shape[1]):
        rgb_orig, rgb_pred = process_channel_group(
            orig_img=input_img[:, t, :, :],
            new_img=rec_img[:, t, :, :],
            channels=channels,
            mean=mean,
            std=std,
        )

        rgb_mask = mask_img[channels, t, :, :] * rgb_orig

        # Saving images

        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
            output_path=os.path.join(output_dir, f"original_rgb_t{t}.tiff"),
            meta=meta_data[t],
        )

        save_geotiff(
            image=_convert_np_uint8(rgb_pred),
            output_path=os.path.join(output_dir, f"predicted_rgb_t{t}.tiff"),
            meta=meta_data[t],
        )

        save_geotiff(
            image=_convert_np_uint8(rgb_mask),
            output_path=os.path.join(output_dir, f"masked_rgb_t{t}.tiff"),
            meta=meta_data[t],
        )


def save_imgs(rec_img, mask_img, mean, std, output_dir, meta_data):
    """Wrapper function to save GeoTIFF images (reconstructed, mask) per timestamp.

    Args:
        rec_img: reconstructed torch.Tensor with shape (C, T, H, W).
        mask_img: mask torch.Tensor with shape (C, T, H, W).
        mean: list of mean values for each band.
        std: list of std values for each band.
        output_dir: directory where to save outputs.
        meta_data: list of dicts with geotiff meta info.
    """

    mean = torch.tensor(np.asarray(mean)[:, None, None])  # C H W
    std = torch.tensor(np.asarray(std)[:, None, None])

    for t in range(rec_img.shape[1]):
        # Back to original data range
        rec_img_t = ((rec_img[:, t, :, :] * std) + mean).to(torch.int16)

        mask_img_t = mask_img[:, t, :, :].to(torch.int16)

        # Saving images

        save_geotiff(
            image=rec_img_t,
            output_path=os.path.join(output_dir, f"predicted_t{t}.tiff"),
            meta=meta_data[t],
        )

        save_geotiff(
            image=mask_img_t,
            output_path=os.path.join(output_dir, f"mask_t{t}.tiff"),
            meta=meta_data[t],
        )


def main(
    data_files: List[str],
    config_path: str,
    checkpoint: str,
    output_dir: str,
    rgb_outputs: bool,
    mask_ratio: float = None,
    input_indices: list[int] = None,
):
    os.makedirs(output_dir, exist_ok=True)

    # Get parameters --------

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)['pretrained_cfg']

    batch_size = 1
    bands = config['bands']
    num_frames = len(data_files)
    mean = config['mean']
    std = config['std']
    coords_encoding = config['coords_encoding']
    img_size = config['img_size']
    mask_ratio = mask_ratio or config['mask_ratio']

    print(
        f"\nTreating {len(data_files)} files as {len(data_files)} time steps from the same location\n"
    )
    if len(data_files) != 4:
        print(
            "The original model was trained for four time steps.\nResults with a different number of time steps may vary."
        )

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    print(f"Using {device} device.\n")

    # Loading data ---------------------------------------------------------------------------------

    input_data, temporal_coords, location_coords, meta_data = load_example(
        file_paths=data_files, indices=input_indices, mean=mean, std=std
    )

    if len(temporal_coords) != num_frames and 'time' in coords_encoding:
        coords_encoding.remove('time')
    if not len(location_coords) and 'location' in coords_encoding:
        coords_encoding.remove('location')

    # Create model and load checkpoint -------------------------------------------------------------

    config.update(
        coords_encoding=coords_encoding,
        num_frames=num_frames,
        in_chans=len(bands),
    )

    model = PrithviMAE(**config)

    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\n--> Model has {total_params:,} parameters.\n")

    model.to(device)

    state_dict = torch.load(checkpoint, map_location=device, weights_only=True)
    # discard fixed pos_embedding weights
    for k in list(state_dict.keys()):
        if k == 'encoder.pos_embed':
            state_dict[k] = model.encoder.pos_embed
        elif k == 'decoder.decoder_pos_embed':
            state_dict[k] = model.decoder.decoder_pos_embed
    model.load_state_dict(state_dict, strict=True)
    print(f"Loaded checkpoint from {checkpoint}")

    # Running model --------------------------------------------------------------------------------

    model.eval()
    channels = [bands.index(b) for b in ["B04", "B03", "B02"]]  # BGR -> RGB

    # Reflect pad if not divisible by img_size
    original_h, original_w = input_data.shape[-2:]
    pad_h = img_size - (original_h % img_size)
    pad_w = img_size - (original_w % img_size)
    input_data = np.pad(
        input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect"
    )

    # Build sliding window
    batch = torch.tensor(input_data, device="cpu")
    windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = windows.shape[3:5]
    windows = rearrange(
        windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
    )

    # Split into batches if number of windows > batch_size
    num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
    windows = torch.tensor_split(windows, num_batches, dim=0)

    temporal_coords = torch.tensor(temporal_coords, dtype=torch.float, device=device).unsqueeze(0)
    location_coords = torch.tensor(location_coords[0], dtype=torch.float, device=device).unsqueeze(0)

    # Run model
    rec_imgs = []
    mask_imgs = []
    for x in windows:
        rec_img, mask_img = run_model(model, x, temporal_coords, location_coords, mask_ratio, device)
        rec_imgs.append(rec_img)
        mask_imgs.append(mask_img)

    rec_imgs = torch.concat(rec_imgs, dim=0)
    mask_imgs = torch.concat(mask_imgs, dim=0)

    # Build images from patches
    rec_imgs = rearrange(
        rec_imgs,
        "(b h1 w1) c t h w -> b c t (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=len(bands),
        t=num_frames,
        h1=h1,
        w1=w1,
    )
    mask_imgs = rearrange(
        mask_imgs,
        "(b h1 w1) c t h w -> b c t (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=len(bands),
        t=num_frames,
        h1=h1,
        w1=w1,
    )

    # Cut padded images back to original size
    rec_imgs_full = rec_imgs[..., :original_h, :original_w]
    mask_imgs_full = mask_imgs[..., :original_h, :original_w]
    batch_full = batch[..., :original_h, :original_w]

    # Build output images
    if rgb_outputs:
        for d in meta_data:
            d.update(count=3, dtype="uint8", compress="lzw", nodata=0)

        save_rgb_imgs(
            batch_full[0, ...],
            rec_imgs_full[0, ...],
            mask_imgs_full[0, ...],
            channels,
            mean,
            std,
            output_dir,
            meta_data,
        )
    else:
        for d in meta_data:
            d.update(compress="lzw", nodata=0)

        save_imgs(
            rec_imgs_full[0, ...],
            mask_imgs_full[0, ...],
            mean,
            std,
            output_dir,
            meta_data,
        )

    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser("MAE run inference", add_help=False)

    parser.add_argument(
        "--data_files",
        type=str,
        nargs="+",
        default=["examples/Mexico_HLS.S30.T13REM.2018026T173609.v2.0_cropped.tif",
                 "examples/Mexico_HLS.S30.T13REM.2018106T172859.v2.0_cropped.tif",
                 "examples/Mexico_HLS.S30.T13REM.2018201T172901.v2.0_cropped.tif",
                 "examples/Mexico_HLS.S30.T13REM.2018266T173029.v2.0_cropped.tif",
                 ],
        help="Path to the data files. Assumes multi-band files.",
    )
    parser.add_argument(
        "--config_path",
        "-c",
        type=str,
        default="config.json",
        help="Path to a JSON file containing the model training parameters.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default="Prithvi_EO_V2_tiny.pt",
        help="Path to a checkpoint file to load from.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--mask_ratio",
        default=0.75,
        type=float,
        help="Masking ratio (percentage of removed patches). "
             "Defaults to 0.75, the value used for pretraining.",
    )
    parser.add_argument(
        "--input_indices",
        default=None,
        type=int,
        nargs="+",
        help="0-based indices of the channels to be selected from the input. By default takes all.",
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, output files will only contain RGB channels. "
             "Otherwise, all bands will be saved.",
    )
    args = parser.parse_args()

    main(**vars(args))
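A minimal sketch of running the script from Python rather than the CLI, using the defaults shown above. It assumes config.json and the checkpoint Prithvi_EO_V2_tiny.pt (the script's default checkpoint name, not part of this commit) are available next to inference.py:

from inference import main

# Reconstruct the four example time steps with the default 0.75 mask ratio and save RGB previews.
main(
    data_files=[
        "examples/Mexico_HLS.S30.T13REM.2018026T173609.v2.0_cropped.tif",
        "examples/Mexico_HLS.S30.T13REM.2018106T172859.v2.0_cropped.tif",
        "examples/Mexico_HLS.S30.T13REM.2018201T172901.v2.0_cropped.tif",
        "examples/Mexico_HLS.S30.T13REM.2018266T173029.v2.0_cropped.tif",
    ],
    config_path="config.json",
    checkpoint="Prithvi_EO_V2_tiny.pt",
    output_dir="output",
    rgb_outputs=True,
    mask_ratio=0.75,
)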
prithvi_mae.py ADDED
@@ -0,0 +1,766 @@
# Copyright (c) IBM Corp. 2024. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# References:
# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
# transformers: https://github.com/huggingface/transformers
# --------------------------------------------------------

import warnings
import logging

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from timm.layers import to_2tuple
from timm.models.vision_transformer import Block

logger = logging.getLogger(__name__)


def get_3d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False):
    """
    Create 3D sin/cos positional embeddings.

    Args:
        embed_dim (int):
            Embedding dimension.
        grid_size (tuple[int, int, int] | list[int]):
            The grid depth, height and width.
        add_cls_token (bool, *optional*, defaults to False):
            Whether or not to add a classification (CLS) token.

    Returns:
        `torch.FloatTensor` of shape (grid_size[0]*grid_size[1]*grid_size[2], embed_dim) or
        (1+grid_size[0]*grid_size[1]*grid_size[2], embed_dim): the position embeddings (with or without cls token)
    """

    assert embed_dim % 16 == 0

    t_size, h_size, w_size = grid_size

    w_embed_dim = embed_dim // 16 * 6
    h_embed_dim = embed_dim // 16 * 6
    t_embed_dim = embed_dim // 16 * 4

    w_pos_embed = get_1d_sincos_pos_embed_from_grid(w_embed_dim, np.arange(w_size))
    h_pos_embed = get_1d_sincos_pos_embed_from_grid(h_embed_dim, np.arange(h_size))
    t_pos_embed = get_1d_sincos_pos_embed_from_grid(t_embed_dim, np.arange(t_size))

    w_pos_embed = np.tile(w_pos_embed, (t_size * h_size, 1))
    h_pos_embed = np.tile(np.repeat(h_pos_embed, w_size, axis=0), (t_size, 1))
    t_pos_embed = np.repeat(t_pos_embed, h_size * w_size, axis=0)

    pos_embed = np.concatenate((w_pos_embed, h_pos_embed, t_pos_embed), axis=1)

    if add_cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded, size (M,)
    out: (M, D)
    """
    if embed_dim % 2 != 0:
        raise ValueError("embed_dim must be even")

    omega = np.arange(embed_dim // 2, dtype=float)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


def _get_1d_sincos_embed_from_grid_torch(embed_dim: int, pos: torch.Tensor):
    """Modified torch version of *get_1d_sincos_pos_embed_from_grid()*.

    embed_dim: output dimension for each position
    pos: a list of positions to be encoded, size (M,) - must be float dtype!
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    assert pos.dtype in [torch.float32, torch.float16, torch.bfloat16]

    omega = torch.arange(embed_dim // 2, dtype=pos.dtype).to(pos.device)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = torch.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = torch.sin(out)  # (M, D/2)
    emb_cos = torch.cos(out)  # (M, D/2)

    emb = torch.cat([emb_sin, emb_cos], dim=1)  # (M, D)

    return emb


def _init_weights(module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)


def _interpolate_pos_encoding(
    pos_embed: torch.Tensor,
    grid_size: tuple[int, int, int] | list[int],
    patch_size: tuple[int, int, int] | list[int],
    shape: tuple[int, int, int],
    embed_dim: int,
):
    """
    Adapted from:
    - transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding,
    - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194
    """
    t, h, w = shape
    t_patches = t // patch_size[0]
    h_patches = h // patch_size[1]
    w_patches = w // patch_size[2]

    if [t_patches, h_patches, w_patches] == grid_size:
        # No interpolation needed
        return pos_embed
    if t_patches != grid_size[0]:
        # Re-compute pos embedding to handle changed num_frames
        new_grid_size = (t_patches, *grid_size[1:])
        new_pos_embed = get_3d_sincos_pos_embed(pos_embed.shape[-1], new_grid_size, add_cls_token=True)
        new_pos_embed = torch.from_numpy(new_pos_embed).float().unsqueeze(0)
    else:
        new_grid_size = grid_size
        new_pos_embed = pos_embed

    class_pos_embed, patch_pos_embed = new_pos_embed[:, :1], new_pos_embed[:, 1:]

    patch_pos_embed = patch_pos_embed.reshape(*new_grid_size, embed_dim).permute(0, 3, 1, 2)

    patch_pos_embed = nn.functional.interpolate(
        patch_pos_embed,
        size=(h_patches, w_patches),
        mode='bicubic',
        align_corners=True,
    )
    patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)

    return torch.cat((class_pos_embed, patch_pos_embed), dim=1)


class PatchEmbed(nn.Module):
    """3D version of timm.models.vision_transformer.PatchEmbed"""
    def __init__(
        self,
        input_size: tuple[int, int, int] = (1, 224, 224),
        patch_size: tuple[int, int, int] = (1, 16, 16),
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: nn.Module | None = None,
        flatten: bool = True,
        bias: bool = True,
    ):
        super().__init__()
        self.input_size = input_size
        self.patch_size = patch_size
        self.grid_size = [s // p for s, p in zip(self.input_size, self.patch_size)]
        assert self.grid_size >= [1, 1, 1], "Patch size is bigger than input size."
        self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
        self.flatten = flatten

        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        B, C, T, H, W = x.shape

        if T / self.patch_size[0] % 1 or H / self.patch_size[1] % 1 or W / self.patch_size[2] % 1:
            warnings.warn(f"Input {x.shape[-3:]} is not divisible by patch size {self.patch_size}."
                          f" The border will be ignored, add backbone_padding for pixel-wise tasks.")

        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # B,C,T,H,W -> B,C,L -> B,L,C
        x = self.norm(x)
        return x


class TemporalEncoder(nn.Module):
    def __init__(self, embed_dim: int, trainable_scale: bool = False):
        super().__init__()
        self.embed_dim = embed_dim
        self.year_embed_dim = embed_dim // 2
        self.julian_day_embed_dim = embed_dim - self.year_embed_dim

        # If trainable, initialize scale with small number
        if trainable_scale:
            self.scale = nn.Parameter(torch.full((1,), 0.1))
        else:
            self.register_buffer('scale', torch.ones(1))

    def forward(self, temporal_coords: torch.Tensor, tokens_per_frame: int | None = None):
        """
        temporal_coords: year and day-of-year info with shape (B, T, 2).
        tokens_per_frame: number of tokens for each frame in the sample. If provided, embeddings will be
            repeated over the T dimension, and the final shape is (B, T*tokens_per_frame, embed_dim).
        """
        shape = temporal_coords.shape[:2] + (-1,)  # B, T, -1

        year = _get_1d_sincos_embed_from_grid_torch(
            self.year_embed_dim, temporal_coords[:, :, 0].flatten()).reshape(shape)
        julian_day = _get_1d_sincos_embed_from_grid_torch(
            self.julian_day_embed_dim, temporal_coords[:, :, 1].flatten()).reshape(shape)

        embedding = self.scale * torch.cat([year, julian_day], dim=-1)

        if tokens_per_frame is not None:
            embedding = torch.repeat_interleave(embedding, tokens_per_frame, dim=1)

        return embedding  # B, T*tokens_per_frame, embed_dim


class LocationEncoder(nn.Module):
    def __init__(self, embed_dim: int, trainable_scale: bool = False):
        super().__init__()
        self.embed_dim = embed_dim
        self.lat_embed_dim = embed_dim // 2
        self.lon_embed_dim = embed_dim - self.lat_embed_dim

        # If trainable, initialize scale with small number
        if trainable_scale:
            self.scale = nn.Parameter(torch.full((1,), 0.1))
        else:
            self.register_buffer('scale', torch.ones(1))

    def forward(self, location_coords: torch.Tensor):
        """
        location_coords: lat and lon info with shape (B, 2).
        """
        shape = location_coords.shape[:1] + (1, -1)  # B, 1, -1

        lat = _get_1d_sincos_embed_from_grid_torch(
            self.lat_embed_dim, location_coords[:, 0].flatten()).reshape(shape)
        lon = _get_1d_sincos_embed_from_grid_torch(
            self.lon_embed_dim, location_coords[:, 1].flatten()).reshape(shape)

        embedding = self.scale * torch.cat([lat, lon], dim=-1)

        return embedding  # B, 1, embed_dim


class PrithviViT(nn.Module):
    """Prithvi ViT Encoder"""
    def __init__(self,
                 img_size: int | tuple[int, int] = 224,
                 patch_size: int | tuple[int, int, int] = (1, 16, 16),
                 num_frames: int = 1,
                 in_chans: int = 3,
                 embed_dim: int = 1024,
                 depth: int = 24,
                 num_heads: int = 16,
                 mlp_ratio: float = 4.,
                 norm_layer: nn.Module = nn.LayerNorm,
                 coords_encoding: list[str] | None = None,
                 coords_scale_learn: bool = False,
                 drop_path: float = 0.,
                 **kwargs,
                 ):
        super().__init__()

        self.in_chans = in_chans
        self.num_frames = num_frames
        self.embed_dim = embed_dim
        self.img_size = to_2tuple(img_size)
        if isinstance(patch_size, int):
            patch_size = (1, patch_size, patch_size)

        # 3D patch embedding
        self.patch_embed = PatchEmbed(
            input_size=(num_frames,) + self.img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        self.out_channels = [embed_dim * self.patch_embed.grid_size[0]] * depth

        # Optional temporal and location embedding
        coords_encoding = coords_encoding or []
        self.temporal_encoding = 'time' in coords_encoding
        self.location_encoding = 'location' in coords_encoding
        if self.temporal_encoding:
            assert patch_size[0] == 1, f"With temporal encoding, patch_size[0] must be 1, received {patch_size[0]}"
            self.temporal_embed_enc = TemporalEncoder(embed_dim, coords_scale_learn)
        if self.location_encoding:
            self.location_embed_enc = LocationEncoder(embed_dim, coords_scale_learn)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.register_buffer("pos_embed", torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim))

        # Transformer layers
        self.blocks = []
        for i in range(depth):
            self.blocks.append(Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
                                     drop_path=drop_path,))
        self.blocks = nn.ModuleList(self.blocks)

        self.norm = norm_layer(embed_dim)

        self.initialize_weights()

    def initialize_weights(self):
        # initialize (and freeze) position embeddings by sin-cos embedding
        pos_embed = get_3d_sincos_pos_embed(
            self.pos_embed.shape[-1], self.patch_embed.grid_size, add_cls_token=True
        )
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        torch.nn.init.normal_(self.cls_token, std=0.02)
        self.apply(_init_weights)

    def random_masking(self, sequence, mask_ratio, noise=None):
        """
        Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
        noise.

        Args:
            sequence (`torch.FloatTensor` of shape `(batch_size, sequence_length, dim)`)
            mask_ratio (float): mask ratio to use.
            noise (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                mainly used for testing purposes to control randomness and maintain reproducibility
        """
        batch_size, seq_length, dim = sequence.shape
        len_keep = int(seq_length * (1 - mask_ratio))

        if noise is None:
            noise = torch.rand(batch_size, seq_length, device=sequence.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1).to(sequence.device)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1).to(sequence.device)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        sequence_unmasked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, dim))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([batch_size, seq_length], device=sequence.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return sequence_unmasked, mask, ids_restore

    def interpolate_pos_encoding(self, sample_shape: tuple[int, int, int]):

        pos_embed = _interpolate_pos_encoding(
            pos_embed=self.pos_embed,
            grid_size=self.patch_embed.grid_size,
            patch_size=self.patch_embed.patch_size,
            shape=sample_shape,
            embed_dim=self.embed_dim,
        )
        return pos_embed

    def forward(
        self, x: torch.Tensor,
        temporal_coords: None | torch.Tensor = None,
        location_coords: None | torch.Tensor = None,
        mask_ratio=0.75
    ):
        if len(x.shape) == 4 and self.patch_embed.input_size[0] == 1:
            # add time dim
            x = x.unsqueeze(2)
        sample_shape = x.shape[-3:]

        # embed patches
        x = self.patch_embed(x)

        pos_embed = self.interpolate_pos_encoding(sample_shape)
        # add pos embed w/o cls token
        x = x + pos_embed[:, 1:, :]

        if self.temporal_encoding and temporal_coords is not None:
            num_tokens_per_frame = x.shape[1] // self.num_frames
            temporal_encoding = self.temporal_embed_enc(temporal_coords, num_tokens_per_frame)
            x = x + temporal_encoding
        if self.location_encoding and location_coords is not None:
            location_encoding = self.location_embed_enc(location_coords)
            x = x + location_encoding

        # masking: length -> length * mask_ratio
        x, mask, ids_restore = self.random_masking(x, mask_ratio)

        # append cls token
        cls_token = self.cls_token + pos_embed[:, :1, :]
        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # apply Transformer blocks
        for block in self.blocks:
            x = block(x)
        x = self.norm(x)

        return x, mask, ids_restore

    def forward_features(
        self,
        x: torch.Tensor,
        temporal_coords: None | torch.Tensor = None,
        location_coords: None | torch.Tensor = None,
    ) -> list[torch.Tensor]:
        if len(x.shape) == 4 and self.patch_embed.input_size[0] == 1:
            # add time dim
            x = x.unsqueeze(2)
        sample_shape = x.shape[-3:]

        # embed patches
        x = self.patch_embed(x)

        pos_embed = self.interpolate_pos_encoding(sample_shape)
        # add pos embed w/o cls token
        x = x + pos_embed[:, 1:, :]

        if self.temporal_encoding and temporal_coords is not None:
            num_tokens_per_frame = x.shape[1] // self.num_frames
            temporal_encoding = self.temporal_embed_enc(temporal_coords, num_tokens_per_frame)
            x = x + temporal_encoding
        if self.location_encoding and location_coords is not None:
            location_encoding = self.location_embed_enc(location_coords)
            x = x + location_encoding

        # append cls token
        cls_token = self.cls_token + pos_embed[:, :1, :]
        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # apply Transformer blocks
        out = []
        for block in self.blocks:
            x = block(x)
            out.append(x.clone())

        x = self.norm(x)
        out[-1] = x
        return out

    def prepare_features_for_image_model(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
        out = []
        effective_time_dim = self.patch_embed.input_size[0] // self.patch_embed.patch_size[0]
        for x in features:
            x_no_token = x[:, 1:, :]
            number_of_tokens = x_no_token.shape[1]
            tokens_per_timestep = number_of_tokens // effective_time_dim
            h = int(np.sqrt(tokens_per_timestep))
            encoded = rearrange(
                x_no_token,
                "batch (t h w) e -> batch (t e) h w",
                e=self.embed_dim,
                t=effective_time_dim,
                h=h,
            )
            out.append(encoded)
        return out


class MAEDecoder(nn.Module):
    """Transformer Decoder used in the Prithvi MAE"""
    def __init__(self,
                 patch_size: int | tuple[int, int, int] = (1, 16, 16),
                 grid_size: list[int] | tuple[int, int, int] = (3, 14, 14),
                 in_chans: int = 3,
                 encoder_embed_dim: int = 1024,
                 decoder_embed_dim: int = 512,
                 depth: int = 8,
                 num_heads: int = 16,
                 mlp_ratio: float = 4.,
                 norm_layer: nn.Module = nn.LayerNorm,
                 coords_encoding: list[str] | None = None,
                 coords_scale_learn: bool = False,
                 ):
        super().__init__()

        self.decoder_embed = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=True)
        self.decoder_embed_dim = decoder_embed_dim
        self.grid_size = grid_size
        if isinstance(patch_size, int):
            patch_size = (1, patch_size, patch_size)
        self.patch_size = patch_size
        self.num_frames = self.grid_size[0] * patch_size[0]
        num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]

        # Optional temporal and location embedding
        coords_encoding = coords_encoding or []
        self.temporal_encoding = 'time' in coords_encoding
        self.location_encoding = 'location' in coords_encoding
        if self.temporal_encoding:
            self.temporal_embed_dec = TemporalEncoder(decoder_embed_dim, coords_scale_learn)
        if self.location_encoding:
            self.location_embed_dec = LocationEncoder(decoder_embed_dim, coords_scale_learn)

        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))

        self.register_buffer("decoder_pos_embed", torch.zeros(1, num_patches + 1, decoder_embed_dim))

        self.decoder_blocks = nn.ModuleList(
            [Block(decoder_embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer) for _ in range(depth)]
        )

        self.decoder_norm = norm_layer(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim,
                                      patch_size[0] * patch_size[1] * patch_size[2] * in_chans,
                                      bias=True)

        self.initialize_weights()

    def initialize_weights(self):
        # initialize (and freeze) position embeddings by sin-cos embedding
        decoder_pos_embed = get_3d_sincos_pos_embed(
            self.decoder_pos_embed.shape[-1], self.grid_size, add_cls_token=True
        )
        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        torch.nn.init.normal_(self.mask_token, std=0.02)
        self.apply(_init_weights)

    def interpolate_pos_encoding(self, sample_shape: tuple[int, int, int]):

        pos_embed = _interpolate_pos_encoding(
            pos_embed=self.decoder_pos_embed,
            grid_size=self.grid_size,
            patch_size=self.patch_size,
            shape=sample_shape,
            embed_dim=self.decoder_embed_dim,
        )

        return pos_embed

    def forward(
        self,
        hidden_states: torch.Tensor,
        ids_restore: torch.Tensor,
        temporal_coords: None | torch.Tensor = None,
        location_coords: None | torch.Tensor = None,
        input_size: list[int] = None,
    ):
        # embed tokens
        x = self.decoder_embed(hidden_states)
        cls_token = x[:, :1, :]

        # append mask tokens to sequence
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
        x = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        # unshuffle
        x = torch.gather(x, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]).to(x.device))

        # add pos embed
        decoder_pos_embed = self.interpolate_pos_encoding(input_size[-3:])
        cls_token = cls_token + decoder_pos_embed[:, :1, :]
        x = x + decoder_pos_embed[:, 1:, :]

        if self.temporal_encoding and temporal_coords is not None:
            num_tokens_per_frame = x.shape[1] // self.num_frames
            temporal_encoding = self.temporal_embed_dec(temporal_coords, num_tokens_per_frame)
            # Add temporal encoding w/o cls token
            x = x + temporal_encoding
        if self.location_encoding and location_coords is not None:
            location_encoding = self.location_embed_dec(location_coords)
            # Add location encoding w/o cls token
            x = x + location_encoding

        # append cls token
        x = torch.cat([cls_token, x], dim=1)

        # apply Transformer layers (blocks)
        for block in self.decoder_blocks:
            x = block(x)
        x = self.decoder_norm(x)

        # predictor projection
        pred = self.decoder_pred(x)

        # remove cls token
        pred = pred[:, 1:, :]

        return pred


class PrithviMAE(nn.Module):
    """Prithvi Masked Autoencoder"""

    def __init__(self,
                 img_size: int | tuple[int, int] = 224,
                 patch_size: int | tuple[int, int, int] = (1, 16, 16),
                 num_frames: int = 4,
                 in_chans: int = 6,
                 embed_dim: int = 768,
                 depth: int = 12,
                 num_heads: int = 12,
                 decoder_embed_dim: int = 512,
                 decoder_depth: int = 8,
                 decoder_num_heads: int = 16,
                 mlp_ratio: float = 4.,
                 norm_layer: nn.Module = nn.LayerNorm,
                 norm_pix_loss: bool = False,
                 coords_encoding: list[str] | None = None,
                 coords_scale_learn: bool = False,
                 drop_path: float = 0.,
                 mask_ratio: float = 0.75,
                 **kwargs,
                 ):
        super().__init__()

        self.encoder = PrithviViT(
            img_size=img_size,
            num_frames=num_frames,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            norm_layer=norm_layer,
            coords_encoding=coords_encoding,
            coords_scale_learn=coords_scale_learn,
            drop_path=drop_path,
        )

        self.decoder = MAEDecoder(
            patch_size=patch_size,
            grid_size=self.encoder.patch_embed.grid_size,
            in_chans=in_chans,
            encoder_embed_dim=embed_dim,
            decoder_embed_dim=decoder_embed_dim,
            depth=decoder_depth,
            num_heads=decoder_num_heads,
            mlp_ratio=mlp_ratio,
            norm_layer=norm_layer,
            coords_encoding=coords_encoding,
            coords_scale_learn=coords_scale_learn,
        )

        self.mask_ratio = mask_ratio
        self.norm_pix_loss = norm_pix_loss
        self.out_channels = self.encoder.out_channels

    def patchify(self, pixel_values):
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, time, height, width)`):
                Pixel values.

        Returns:
            `torch.FloatTensor` of shape
            `(batch_size, num_patches, patch_size[0]*patch_size[1]*patch_size[2] * num_channels)`:
                Patchified pixel values.
        """
        patch_size_t, patch_size_h, patch_size_w = self.encoder.patch_embed.patch_size
        num_channels = self.encoder.in_chans

        # patchify
        patchified_pixel_values = rearrange(pixel_values, 'b c (t s) (h p) (w q) -> b (t h w) (s p q c)',
                                            c=num_channels, s=patch_size_t, p=patch_size_h, q=patch_size_w)

        return patchified_pixel_values

    def unpatchify(self, patchified_pixel_values, image_size: tuple[int, int] | None = None):
        """
        Args:
            patchified_pixel_values (`torch.FloatTensor` of shape
                `(batch_size, num_patches, patch_size[0]*patch_size[1]*patch_size[2] * num_channels)`):
                Patchified pixel values.
            image_size (`tuple[int, int]`, *optional*):
                Original image size.

        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_channels, time, height, width)`:
                Pixel values.
        """
        patch_size_t, patch_size_h, patch_size_w = self.encoder.patch_embed.patch_size
        image_size = to_2tuple(image_size) if image_size is not None else self.encoder.img_size
        original_height, original_width = image_size
        num_patches_h = original_height // patch_size_h
        num_patches_w = original_width // patch_size_w
        num_channels = self.encoder.in_chans

        pixel_values = rearrange(patchified_pixel_values, 'b (t h w) (s p q c) -> b c (t s) (h p) (w q)',
                                 c=num_channels, h=num_patches_h, w=num_patches_w,
                                 s=patch_size_t, p=patch_size_h, q=patch_size_w)
        return pixel_values

    def forward_loss(self, pixel_values, pred, mask):
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, time, height, width)`):
                Pixel values.
            pred (`torch.FloatTensor` of shape
                `(batch_size, num_patches, patch_size[0]*patch_size[1]*patch_size[2] * num_channels)`):
                Predicted pixel values.
            mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Tensor indicating which patches are masked (1) and which are not (0).

        Returns:
            `torch.FloatTensor`: Pixel reconstruction loss.
        """
        target = self.patchify(pixel_values)
        if self.norm_pix_loss:
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.0e-6) ** 0.5

        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch
        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def forward(
        self,
        pixel_values: torch.Tensor,
        temporal_coords: None | torch.Tensor = None,
        location_coords: None | torch.Tensor = None,
        mask_ratio: float = None,
    ):
        if len(pixel_values.shape) == 4 and self.encoder.patch_embed.input_size[0] == 1:
            # add time dim
            pixel_values = pixel_values.unsqueeze(2)

        mask_ratio = mask_ratio or self.mask_ratio
        latent, mask, ids_restore = self.encoder(pixel_values, temporal_coords, location_coords, mask_ratio)
        pred = self.decoder(latent, ids_restore, temporal_coords, location_coords, input_size=pixel_values.shape)
        loss = self.forward_loss(pixel_values, pred, mask)
        return loss, pred, mask

    def forward_features(
        self,
        x: torch.Tensor,
        temporal_coords: None | torch.Tensor = None,
        location_coords: None | torch.Tensor = None,
    ) -> list[torch.Tensor]:
        return self.encoder.forward_features(x, temporal_coords, location_coords)
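A minimal smoke test for the model class, using the tiny hyperparameters from config.json and random input. The temporal coordinates reuse the year/day-of-year values of the four example files; the (lat, lon) pair is a placeholder, and the shapes follow the (B, C, T, H, W) convention used throughout this file:

import torch
from prithvi_mae import PrithviMAE

# Tiny variant from config.json: 6 bands, 4 frames, 224x224, embed_dim 192, depth 12, 3 heads.
model = PrithviMAE(img_size=224, patch_size=(1, 16, 16), num_frames=4, in_chans=6,
                   embed_dim=192, depth=12, num_heads=3,
                   decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
                   coords_encoding=["time", "location"], coords_scale_learn=True)

pixel_values = torch.randn(1, 6, 4, 224, 224)  # B, C, T, H, W
temporal_coords = torch.tensor([[[2018, 26], [2018, 106], [2018, 201], [2018, 266]]], dtype=torch.float)
location_coords = torch.tensor([[29.2, -104.5]], dtype=torch.float)  # placeholder (lat, lon)

loss, pred, mask = model(pixel_values, temporal_coords, location_coords, mask_ratio=0.75)
print(loss.item(), pred.shape, mask.shape)  # pred: (1, 784, 1536), mask: (1, 784)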
requirements.txt ADDED
@@ -0,0 +1,5 @@
torch
torchvision
timm
einops
rasterio