# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch
import torch.nn as nn


def get_3d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
    [extra_tokens+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, tuple):
        grid_size_h, grid_size_w = grid_size
    else:
        grid_size_h = grid_size_w = grid_size
    grid_h = np.arange(grid_size_h, dtype=np.float32)
    grid_w = np.arange(grid_size_w, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)  # (2, H, W)

    # get_3d_sincos_pos_embed_from_grid expects a torch grid of shape
    # (B, S, N, 3); lift the planar grid to 3-D with z fixed at 0 so the
    # two helpers are actually compatible.
    xy = grid.reshape(2, -1).T  # (H*W, 2)
    xyz = np.concatenate([xy, np.zeros_like(xy[:, :1])], axis=1)  # (H*W, 3)
    grid = torch.from_numpy(xyz)[None, None]  # (1, 1, H*W, 3)
    pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
    pos_embed = pos_embed.view(-1, embed_dim).numpy()  # (H*W, D)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate(
            [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
        )
    return pos_embed


def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
    # grid: torch tensor of shape (B, S, N, 3) holding (x, y, z) positions
    assert embed_dim % 3 == 0

    # use a third of the dimensions to encode each coordinate
    B, S, N, _ = grid.shape
    # reshape rather than view: the sliced coordinate tensors are
    # non-contiguous, so view() would raise at runtime
    gridx = grid[..., 0].reshape(B * S * N).detach().cpu().numpy()
    gridy = grid[..., 1].reshape(B * S * N).detach().cpu().numpy()
    gridz = grid[..., 2].reshape(B * S * N).detach().cpu().numpy()
    emb_x = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridx)  # (B*S*N, D/3)
    emb_y = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridy)  # (B*S*N, D/3)
    emb_z = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridz)  # (B*S*N, D/3)

    emb = np.concatenate([emb_x, emb_y, emb_z], axis=1)  # (B*S*N, D)
    emb = torch.from_numpy(emb).to(grid.device)
    return emb.view(B, S, N, embed_dim)


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
    [extra_tokens+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, tuple):
        grid_size_h, grid_size_w = grid_size
    else:
        grid_size_h = grid_size_w = grid_size
    grid_h = np.arange(grid_size_h, dtype=np.float32)
    grid_w = np.arange(grid_size_w, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate(
            [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
        )
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of the dimensions to encode each of grid_h and grid_w
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb
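
# Hedged usage sketch (added for illustration; not part of the original
# module). It exercises only the helpers defined above and runs when the
# file is executed directly, so importing the module has no side effects.
if __name__ == "__main__":
    # 2-D table for an 8x8 patch grid; embed_dim must be even.
    pe2d = get_2d_sincos_pos_embed(64, 8)
    assert pe2d.shape == (64, 64)  # (H*W, D) numpy array
    pe2d_cls = get_2d_sincos_pos_embed(64, 8, cls_token=True, extra_tokens=1)
    assert pe2d_cls.shape == (65, 64)  # row 0 is the all-zero extra token
    # 3-D embedding of arbitrary track coordinates; embed_dim must be
    # divisible by 3, and each third must itself be even.
    track_grid = torch.rand(1, 1, 16, 3)  # (B, S, N, 3)
    pe3d = get_3d_sincos_pos_embed_from_grid(66, track_grid)
    assert pe3d.shape == (1, 1, 16, 66)
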
def get_2d_embedding(xy, C, cat_coords=True):
    B, N, D = xy.shape
    assert D == 2

    x = xy[:, :, 0:1]
    y = xy[:, :, 1:2]
    div_term = (
        torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)
    ).reshape(1, 1, int(C / 2))

    pe_x = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)

    pe_x[:, :, 0::2] = torch.sin(x * div_term)
    pe_x[:, :, 1::2] = torch.cos(x * div_term)

    pe_y[:, :, 0::2] = torch.sin(y * div_term)
    pe_y[:, :, 1::2] = torch.cos(y * div_term)

    pe = torch.cat([pe_x, pe_y], dim=2)  # B, N, C*2
    if cat_coords:
        pe = torch.cat([xy, pe], dim=2)  # B, N, C*2+2
    return pe


def get_3d_embedding(xyz, C, cat_coords=True):
    B, N, D = xyz.shape
    assert D == 3

    x = xyz[:, :, 0:1]
    y = xyz[:, :, 1:2]
    z = xyz[:, :, 2:3]
    div_term = (
        torch.arange(0, C, 2, device=xyz.device, dtype=torch.float32) * (1000.0 / C)
    ).reshape(1, 1, int(C / 2))

    pe_x = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
    pe_z = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)

    pe_x[:, :, 0::2] = torch.sin(x * div_term)
    pe_x[:, :, 1::2] = torch.cos(x * div_term)

    pe_y[:, :, 0::2] = torch.sin(y * div_term)
    pe_y[:, :, 1::2] = torch.cos(y * div_term)

    pe_z[:, :, 0::2] = torch.sin(z * div_term)
    pe_z[:, :, 1::2] = torch.cos(z * div_term)

    pe = torch.cat([pe_x, pe_y, pe_z], dim=2)  # B, N, C*3
    if cat_coords:
        pe = torch.cat([pe, xyz], dim=2)  # B, N, C*3+3
    return pe


def get_4d_embedding(xyzw, C, cat_coords=True):
    B, N, D = xyzw.shape
    assert D == 4

    x = xyzw[:, :, 0:1]
    y = xyzw[:, :, 1:2]
    z = xyzw[:, :, 2:3]
    w = xyzw[:, :, 3:4]
    div_term = (
        torch.arange(0, C, 2, device=xyzw.device, dtype=torch.float32) * (1000.0 / C)
    ).reshape(1, 1, int(C / 2))

    pe_x = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
    pe_z = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
    pe_w = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)

    pe_x[:, :, 0::2] = torch.sin(x * div_term)
    pe_x[:, :, 1::2] = torch.cos(x * div_term)

    pe_y[:, :, 0::2] = torch.sin(y * div_term)
    pe_y[:, :, 1::2] = torch.cos(y * div_term)

    pe_z[:, :, 0::2] = torch.sin(z * div_term)
    pe_z[:, :, 1::2] = torch.cos(z * div_term)

    pe_w[:, :, 0::2] = torch.sin(w * div_term)
    pe_w[:, :, 1::2] = torch.cos(w * div_term)

    pe = torch.cat([pe_x, pe_y, pe_z, pe_w], dim=2)  # B, N, C*4
    if cat_coords:
        pe = torch.cat([pe, xyzw], dim=2)  # B, N, C*4+4
    return pe
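
# Hedged usage sketch (added for illustration; not part of the original
# module): the output widths follow the channel-count comments above, i.e.
# D coordinates with C channels each give C*D channels, plus D more when
# cat_coords=True. C must be even for the 0::2 / 1::2 interleaving.
if __name__ == "__main__":
    xy = torch.rand(2, 5, 2) * 100.0  # e.g. pixel coordinates
    assert get_2d_embedding(xy, 64).shape == (2, 5, 64 * 2 + 2)
    assert get_2d_embedding(xy, 64, cat_coords=False).shape == (2, 5, 64 * 2)
    xyz = torch.rand(2, 5, 3)
    assert get_3d_embedding(xyz, 64).shape == (2, 5, 64 * 3 + 3)
    xyzw = torch.rand(2, 5, 4)
    assert get_4d_embedding(xyzw, 64).shape == (2, 5, 64 * 4 + 4)
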
class Embedder_Fourier(nn.Module):
    def __init__(
        self,
        input_dim,
        max_freq_log2,
        N_freqs,
        log_sampling=True,
        include_input=True,
        periodic_fns=(torch.sin, torch.cos),
    ):
        """
        :param input_dim: dimension of input to be embedded
        :param max_freq_log2: log2 of max freq; min freq is 1 by default
        :param N_freqs: number of frequency bands
        :param log_sampling: if True, frequency bands are linearly sampled in log-space
        :param include_input: if True, raw input is included in the embedding
        :param periodic_fns: periodic functions used to embed input
        """
        super(Embedder_Fourier, self).__init__()
        self.input_dim = input_dim
        self.include_input = include_input
        self.periodic_fns = periodic_fns

        self.out_dim = 0
        if self.include_input:
            self.out_dim += self.input_dim
        self.out_dim += self.input_dim * N_freqs * len(self.periodic_fns)

        if log_sampling:
            self.freq_bands = 2.0 ** torch.linspace(0.0, max_freq_log2, N_freqs)
        else:
            self.freq_bands = torch.linspace(2.0**0.0, 2.0**max_freq_log2, N_freqs)
        self.freq_bands = self.freq_bands.numpy().tolist()

    def forward(self, input: torch.Tensor, rescale: float = 1.0):
        """
        :param input: tensor of shape [..., self.input_dim]
        :param rescale: divisor applied to the raw input before it is
            concatenated (the periodic terms are left unscaled)
        :return: tensor of shape [..., self.out_dim]
        """
        assert input.shape[-1] == self.input_dim

        out = []
        if self.include_input:
            out.append(input / rescale)
        for i in range(len(self.freq_bands)):
            freq = self.freq_bands[i]
            for p_fn in self.periodic_fns:
                out.append(p_fn(input * freq))
        out = torch.cat(out, dim=-1)

        assert out.shape[-1] == self.out_dim
        return out
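
# Hedged usage sketch (illustrative parameters, not from the original file):
# a Fourier embedder over 3-D points with four octave-spaced bands. With
# include_input=True, out_dim = input_dim * (1 + N_freqs * len(periodic_fns)).
if __name__ == "__main__":
    embedder = Embedder_Fourier(input_dim=3, max_freq_log2=3.0, N_freqs=4)
    pts = torch.rand(2, 5, 3)
    enc = embedder(pts)
    assert embedder.out_dim == 3 * (1 + 4 * 2)  # 27
    assert enc.shape == (2, 5, 27)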