Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,543 Bytes
c295391 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
from .rotation import quat_to_mat, mat_to_quat
def extri_intri_to_pose_encoding(
    extrinsics,
    intrinsics,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
):
    """Convert camera extrinsics and intrinsics to a compact pose encoding.

    The translation, the rotation (as a quaternion) and the vertical/horizontal
    field of view are concatenated along the last dimension into a single
    9-dimensional vector per camera.

    Args:
        extrinsics (torch.Tensor): Camera extrinsic parameters with shape BxSx3x4,
            where B is batch size and S is sequence length.
            In OpenCV coordinate system (x-right, y-down, z-forward), representing
            camera from world transformation. The format is [R|t] where R is a
            3x3 rotation matrix and t is a 3x1 translation vector.
            Only the last two dimensions are indexed explicitly, so other leading
            shapes are accepted here as long as `mat_to_quat` supports them
            (not verifiable from this file).
        intrinsics (torch.Tensor): Camera intrinsic parameters with shape BxSx3x3.
            Defined in pixels, with format:
                [[fx, 0, cx],
                 [0, fy, cy],
                 [0,  0,  1]]
            where fx, fy are focal lengths and (cx, cy) is the principal point.
            Only fx and fy are read.
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for computing field of view values. For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding to use. Currently only
            supports "absT_quaR_FoV" (absolute translation, quaternion rotation,
            field of view).

    Returns:
        torch.Tensor: Encoded camera pose parameters with shape BxSx9, cast to
        float32. For "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as quaternion quat (4D)
            - [7:] = field of view in radians, ordered (fov_h, fov_w) (2D)

    Raises:
        NotImplementedError: If `pose_encoding_type` is not "absT_quaR_FoV".
        TypeError: If `image_size_hw` is None.
    """
    if pose_encoding_type != "absT_quaR_FoV":
        raise NotImplementedError(
            f"Unsupported pose_encoding_type: {pose_encoding_type!r}"
        )
    # Fail early with a readable message instead of the opaque
    # "cannot unpack non-iterable NoneType" raised by the tuple unpacking below.
    if image_size_hw is None:
        raise TypeError(
            "image_size_hw=(height, width) is required to compute the field of view"
        )

    # Ellipsis indexing mirrors how `intrinsics` is indexed below and is
    # identical to [:, :, :3, :3] / [:, :, :3, 3] for BxSx3x4 input.
    R = extrinsics[..., :3, :3]  # BxSx3x3
    T = extrinsics[..., :3, 3]  # BxSx3
    quat = mat_to_quat(R)

    # Note the order of h and w here: height pairs with fy, width with fx.
    H, W = image_size_hw
    fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
    fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])

    return torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
def pose_encoding_to_extri_intri(
    pose_encoding,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
    build_intrinsics=True,
):
    """Convert a pose encoding back to camera extrinsics and intrinsics.

    This function performs the inverse operation of extri_intri_to_pose_encoding,
    reconstructing the full camera parameters from the compact encoding.

    Args:
        pose_encoding (torch.Tensor): Encoded camera pose parameters with shape BxSx9,
            where B is batch size and S is sequence length.
            For "absT_quaR_FoV" type, the 9 dimensions are:
                - [:3] = absolute translation vector T (3D)
                - [3:7] = rotation as quaternion quat (4D)
                - [7:] = field of view, ordered (fov_h, fov_w) (2D)
            Only the last dimension is indexed explicitly, so other leading
            shapes are accepted here as long as `quat_to_mat` supports them
            (not verifiable from this file).
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for reconstructing intrinsics from field of view values.
            For example: (256, 512). May be None when build_intrinsics is False.
        pose_encoding_type (str): Type of pose encoding used. Currently only
            supports "absT_quaR_FoV" (absolute translation, quaternion rotation,
            field of view).
        build_intrinsics (bool): Whether to reconstruct the intrinsics matrix.
            If False, only extrinsics are returned and intrinsics will be None.

    Returns:
        tuple: (extrinsics, intrinsics)
            - extrinsics (torch.Tensor): Camera extrinsic parameters with shape
              BxSx3x4. In OpenCV coordinate system (x-right, y-down, z-forward),
              representing camera from world transformation. The format is [R|t]
              where R is a 3x3 rotation matrix and t is a 3x1 translation vector.
            - intrinsics (torch.Tensor or None): Camera intrinsic parameters with
              shape BxSx3x3, or None if build_intrinsics is False. Defined in
              pixels, with format:
                  [[fx, 0, cx],
                   [0, fy, cy],
                   [0,  0,  1]]
              where fx, fy are focal lengths and (cx, cy) is the principal point,
              assumed to be at the center of the image (W/2, H/2).

    Raises:
        NotImplementedError: If `pose_encoding_type` is not "absT_quaR_FoV".
        TypeError: If `build_intrinsics` is true and `image_size_hw` is None.
    """
    if pose_encoding_type != "absT_quaR_FoV":
        raise NotImplementedError(
            f"Unsupported pose_encoding_type: {pose_encoding_type!r}"
        )
    # Fail early with a readable message instead of the opaque
    # "cannot unpack non-iterable NoneType" raised by the tuple unpacking below.
    if build_intrinsics and image_size_hw is None:
        raise TypeError(
            "image_size_hw=(height, width) is required when build_intrinsics=True"
        )

    T = pose_encoding[..., :3]
    quat = pose_encoding[..., 3:7]
    R = quat_to_mat(quat)
    # Append the translation as a fourth column to form [R|t].
    extrinsics = torch.cat([R, T[..., None]], dim=-1)

    if not build_intrinsics:
        return extrinsics, None

    H, W = image_size_hw
    fov_h = pose_encoding[..., 7]
    fov_w = pose_encoding[..., 8]
    # Invert fov = 2 * atan((size / 2) / f) to recover the focal lengths.
    fy = (H / 2.0) / torch.tan(fov_h / 2.0)
    fx = (W / 2.0) / torch.tan(fov_w / 2.0)

    # Use every leading dimension (shape[:-1]) rather than only the first two,
    # so the buffer always matches the shape of fx / fy. For BxSx9 input this
    # is the same BxSx3x3 allocation as before. The dtype is left at torch's
    # default, as in the original implementation.
    intrinsics = torch.zeros(
        pose_encoding.shape[:-1] + (3, 3), device=pose_encoding.device
    )
    intrinsics[..., 0, 0] = fx
    intrinsics[..., 1, 1] = fy
    intrinsics[..., 0, 2] = W / 2
    intrinsics[..., 1, 2] = H / 2
    intrinsics[..., 2, 2] = 1.0  # Set the homogeneous coordinate to 1

    return extrinsics, intrinsics
|