# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import random
import numpy as np
import glob
import os
import copy
import torch
import torch.nn.functional as F

# Configure CUDA settings
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

import argparse
from pathlib import Path
import trimesh
import pycolmap

from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images_square
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
from vggt.utils.helper import create_pixel_coordinate_grid, randomly_limit_trues
from vggt.dependency.track_predict import predict_tracks
from vggt.dependency.np_to_pycolmap import batch_np_matrix_to_pycolmap, batch_np_matrix_to_pycolmap_wo_track

# TODO: add support for masks
# TODO: add iterative BA
# TODO: add support for radial distortion, which needs extra_params
# TODO: test with more cases
# TODO: test different camera types


def parse_args():
    parser = argparse.ArgumentParser(description="VGGT Demo")
    parser.add_argument("--scene_dir", type=str, required=True, help="Directory containing the scene images")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument("--use_ba", action="store_true", default=False, help="Use BA for reconstruction")
    ######### BA parameters #########
    parser.add_argument(
        "--max_reproj_error", type=float, default=8.0, help="Maximum reprojection error for reconstruction"
    )
    parser.add_argument("--shared_camera", action="store_true", default=False, help="Use shared camera for all images")
    parser.add_argument("--camera_type", type=str, default="SIMPLE_PINHOLE", help="Camera type for reconstruction")
    parser.add_argument("--vis_thresh", type=float, default=0.2, help="Visibility threshold for tracks")
    parser.add_argument("--query_frame_num", type=int, default=8, help="Number of frames to query")
    parser.add_argument("--max_query_pts", type=int, default=4096, help="Maximum number of query points")
    parser.add_argument(
        "--fine_tracking", action="store_true", default=True, help="Use fine tracking (slower but more accurate)"
    )
    parser.add_argument(
        "--conf_thres_value", type=float, default=5.0, help="Confidence threshold value for depth filtering (without BA)"
    )
    return parser.parse_args()
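

# Example invocation (a sketch; the script file name and the scene path are
# assumptions, not taken from this file):
#   python demo_colmap.py --scene_dir /path/to/scene            # feedforward VGGT only
#   python demo_colmap.py --scene_dir /path/to/scene --use_ba   # VGGT + bundle adjustment
# The scene directory must contain an "images/" subfolder (see demo_fn below).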


def run_VGGT(model, images, dtype, resolution=518):
    """Run VGGT on a stack of frames and return camera/depth predictions as numpy arrays."""
    # images: (S, 3, H, W), one entry per frame
    assert len(images.shape) == 4
    assert images.shape[1] == 3

    # hard-coded to use 518 for VGGT
    images = F.interpolate(images, size=(resolution, resolution), mode="bilinear", align_corners=False)

    with torch.no_grad():
        with torch.cuda.amp.autocast(dtype=dtype):
            images = images[None]  # add batch dimension
            aggregated_tokens_list, ps_idx = model.aggregator(images)

        # Predict Cameras
        pose_enc = model.camera_head(aggregated_tokens_list)[-1]
        # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
        extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])
        # Predict Depth Maps
        depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    extrinsic = extrinsic.squeeze(0).cpu().numpy()
    intrinsic = intrinsic.squeeze(0).cpu().numpy()
    depth_map = depth_map.squeeze(0).cpu().numpy()
    depth_conf = depth_conf.squeeze(0).cpu().numpy()
    return extrinsic, intrinsic, depth_map, depth_conf


def demo_fn(args):
    # Print configuration
    print("Arguments:", vars(args))

    # Set seed for reproducibility
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)  # for multi-GPU
    print(f"Setting seed as: {args.seed}")

    # Set device and dtype (query the device capability only when CUDA is available)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if device == "cuda" and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
    print(f"Using device: {device}")
    print(f"Using dtype: {dtype}")

    # Load the pretrained VGGT model for camera and depth estimation
    model = VGGT()
    _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
    model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
    model.eval()
    model = model.to(device)
    print("Model loaded")

    # Get image paths and preprocess them
    image_dir = os.path.join(args.scene_dir, "images")
    image_path_list = glob.glob(os.path.join(image_dir, "*"))
    if len(image_path_list) == 0:
        raise ValueError(f"No images found in {image_dir}")
    base_image_path_list = [os.path.basename(path) for path in image_path_list]

    # Load images at 1024x1024, while running VGGT at 518x518
    vggt_fixed_resolution = 518
    img_load_resolution = 1024

    images, original_coords = load_and_preprocess_images_square(image_path_list, img_load_resolution)
    images = images.to(device)
    original_coords = original_coords.to(device)
    print(f"Loaded {len(images)} images from {image_dir}")

    # Run VGGT on the 518x518 images to estimate cameras and depth,
    # then unproject the depth maps into a per-pixel 3D point map
    extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images, dtype, vggt_fixed_resolution)
    points_3d = unproject_depth_map_to_point_map(depth_map, extrinsic, intrinsic)
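    # (the (S, H, W, 3) layout of points_3d is relied on by the shape
    # unpacking in the feedforward branch below)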

    if args.use_ba:
        image_size = np.array(images.shape[-2:])
        scale = img_load_resolution / vggt_fixed_resolution
        shared_camera = args.shared_camera

        with torch.cuda.amp.autocast(dtype=dtype):
            # Predicting Tracks
            # Using the VGGSfM tracker instead of the VGGT tracker for efficiency:
            # the VGGT tracker requires multiple backbone runs to query different frames
            # (a limitation caused by the training process; to be fixed in VGGT v2).
            # You can also replace pred_tracks with tracks from any other method,
            # e.g., from COLMAP, from CoTracker, or by chaining 2D matches from LightGlue/LoFTR.
            pred_tracks, pred_vis_scores, pred_confs, points_3d, points_rgb = predict_tracks(
                images,
                conf=depth_conf,
                points_3d=points_3d,
                masks=None,
                max_query_pts=args.max_query_pts,
                query_frame_num=args.query_frame_num,
                keypoint_extractor="aliked+sp",
                fine_tracking=args.fine_tracking,
            )
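
            # Sketch of the interface assumed when swapping in another tracker
            # (shapes are assumptions inferred from how the outputs are used below):
            #   pred_tracks:     (S, N, 2) 2D track locations per frame
            #   pred_vis_scores: (S, N)    per-frame visibility score of each track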

        torch.cuda.empty_cache()

        # rescale the intrinsic matrix from 518 to 1024
        # (the first two rows hold fx, cx and fy, cy, which scale linearly with image size)
        intrinsic[:, :2, :] *= scale
        track_mask = pred_vis_scores > args.vis_thresh

        # TODO: radial distortion, iterative BA, masks
        reconstruction, valid_track_mask = batch_np_matrix_to_pycolmap(
            points_3d,
            extrinsic,
            intrinsic,
            pred_tracks,
            image_size,
            masks=track_mask,
            max_reproj_error=args.max_reproj_error,
            shared_camera=shared_camera,
            camera_type=args.camera_type,
            points_rgb=points_rgb,
        )

        if reconstruction is None:
            raise ValueError("No reconstruction can be built with BA")

        # Bundle Adjustment
        ba_options = pycolmap.BundleAdjustmentOptions()
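        # ba_options is left at pycolmap defaults; individual options can be set on it
        # before running BA, e.g. (field name assumed from pycolmap's
        # BundleAdjustmentOptions; check your installed version):
        #   ba_options.refine_focal_length = False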
        pycolmap.bundle_adjustment(reconstruction, ba_options)

        reconstruction_resolution = img_load_resolution
    else:
        conf_thres_value = args.conf_thres_value
        max_points_for_colmap = 100000  # randomly sample 3D points
        shared_camera = False  # in the feedforward manner, we do not support shared camera
        camera_type = "PINHOLE"  # in the feedforward manner, we only support PINHOLE camera

        image_size = np.array([vggt_fixed_resolution, vggt_fixed_resolution])
        num_frames, height, width, _ = points_3d.shape

        points_rgb = F.interpolate(
            images, size=(vggt_fixed_resolution, vggt_fixed_resolution), mode="bilinear", align_corners=False
        )
        points_rgb = (points_rgb.cpu().numpy() * 255).astype(np.uint8)
        points_rgb = points_rgb.transpose(0, 2, 3, 1)

        # (S, H, W, 3), with x, y coordinates and frame indices
        points_xyf = create_pixel_coordinate_grid(num_frames, height, width)

        conf_mask = depth_conf >= conf_thres_value
        # at most writing 100000 3d points to colmap reconstruction object
        conf_mask = randomly_limit_trues(conf_mask, max_points_for_colmap)

        points_3d = points_3d[conf_mask]
        points_xyf = points_xyf[conf_mask]
        points_rgb = points_rgb[conf_mask]

        print("Converting to COLMAP format")
        reconstruction = batch_np_matrix_to_pycolmap_wo_track(
            points_3d,
            points_xyf,
            points_rgb,
            extrinsic,
            intrinsic,
            image_size,
            shared_camera=shared_camera,
            camera_type=camera_type,
        )

        reconstruction_resolution = vggt_fixed_resolution
    reconstruction = rename_colmap_recons_and_rescale_camera(
        reconstruction,
        base_image_path_list,
        original_coords.cpu().numpy(),
        img_size=reconstruction_resolution,
        shift_point2d_to_original_res=True,
        shared_camera=shared_camera,
    )

    print(f"Saving reconstruction to {args.scene_dir}/sparse")
    sparse_reconstruction_dir = os.path.join(args.scene_dir, "sparse")
    os.makedirs(sparse_reconstruction_dir, exist_ok=True)
    reconstruction.write(sparse_reconstruction_dir)

    # Save point cloud for fast visualization
    trimesh.PointCloud(points_3d, colors=points_rgb).export(os.path.join(args.scene_dir, "sparse/points.ply"))

    return True
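

# A quick way to inspect the saved reconstruction afterwards (a sketch, not part of
# the pipeline above; pycolmap can read the COLMAP binary files written by demo_fn):
#   rec = pycolmap.Reconstruction(os.path.join(scene_dir, "sparse"))
#   print(rec.summary())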


def rename_colmap_recons_and_rescale_camera(
    reconstruction, image_paths, original_coords, img_size, shift_point2d_to_original_res=False, shared_camera=False
):
    rescale_camera = True

    for pyimageid in reconstruction.images:
        # Map the padded & resized image back to the original size
        # and rename the image to its original name
        pyimage = reconstruction.images[pyimageid]
        pycamera = reconstruction.cameras[pyimage.camera_id]
        pyimage.name = image_paths[pyimageid - 1]

        if rescale_camera:
            # Rescale the camera parameters
            pred_params = copy.deepcopy(pycamera.params)
            real_image_size = original_coords[pyimageid - 1, -2:]
            resize_ratio = max(real_image_size) / img_size
            pred_params = pred_params * resize_ratio

            real_pp = real_image_size / 2
            pred_params[-2:] = real_pp  # center of the image
            pycamera.params = pred_params
            pycamera.width = real_image_size[0]
            pycamera.height = real_image_size[1]

        if shift_point2d_to_original_res:
            # Also shift the point2D to original resolution
            top_left = original_coords[pyimageid - 1, :2]
            for point2D in pyimage.points2D:
                point2D.xy = (point2D.xy - top_left) * resize_ratio

        if shared_camera:
            # If shared_camera, all images share the same camera;
            # no need to rescale it again
            rescale_camera = False

    return reconstruction
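

# Worked example of the rescaling above (hypothetical numbers): a 1920x1080 source
# image padded to a square and resized to img_size=1024 gives
# resize_ratio = max(1920, 1080) / 1024 = 1.875, so focal parameters are scaled by
# 1.875 and the principal point is reset to (960.0, 540.0), the original image center.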


if __name__ == "__main__":
    args = parse_args()
    with torch.no_grad():
        demo_fn(args)


# Work in Progress (WIP)

"""
VGGT Runner Script
==================

A script to run the VGGT model for 3D reconstruction from image sequences.

Directory Structure
-------------------
Input:
    input_folder/
    └── images/            # Source images for reconstruction

Output:
    output_folder/
    ├── images/
    ├── sparse/            # Reconstruction results
    │   ├── cameras.bin    # Camera parameters (COLMAP format)
    │   ├── images.bin     # Pose for each image (COLMAP format)
    │   ├── points3D.bin   # 3D points (COLMAP format)
    │   └── points.ply     # Point cloud visualization file
    └── visuals/           # Visualization outputs TODO

Key Features
------------
• Dual-mode Support: Run reconstructions using either VGGT or VGGT+BA
• Resolution Preservation: Maintains original image resolution in camera parameters and tracks
• COLMAP Compatibility: Exports results in standard COLMAP sparse reconstruction format
"""