import os
from dataclasses import dataclass

import huggingface_hub
import numpy as np
import torch
import torchvision
from torchvision.transforms import InterpolationMode

from network.models.facexformer import FaceXFormer

# import mediapipe as mp
# import cv2

# device = "cuda:0"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32
weights_path = "ckpts/model.pt"
# weights_path = "ckpts/pytorch_model.bin"
# face_model_path = "ckpts/blaze_face_short_range.tflite"

# Optional mediapipe face detector (left disabled; bounding boxes are supplied
# by the caller instead):
# BaseOptions = mp.tasks.BaseOptions
# FaceDetector = mp.tasks.vision.FaceDetector
# FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
# FaceDetectorResult = mp.tasks.vision.FaceDetectorResult
# VisionRunningMode = mp.tasks.vision.RunningMode
# options = FaceDetectorOptions(
#     base_options=BaseOptions(model_asset_path=face_model_path),
#     running_mode=VisionRunningMode.LIVE_STREAM,
# )
# face_detector = FaceDetector.create_from_options(options)

# Preprocessing: resize face crops to the model's 224x224 input and normalize
# with ImageNet statistics.
transforms_image = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(
            size=(224, 224), interpolation=InterpolationMode.BICUBIC
        ),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),
    ]
)
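
# For reference (a hedged sketch, not from the original code): transforms_image
# expects an HWC uint8 RGB array and returns a normalized CHW float tensor.
#   crop = np.zeros((256, 256, 3), dtype=np.uint8)  # hypothetical crop
#   x = transforms_image(crop)                      # torch.Size([3, 224, 224])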


def load_model(weights_path):
    """Download the FaceXFormer checkpoint if missing, then load it for inference."""
    model = FaceXFormer().to(device)
    if not os.path.exists(weights_path):
        huggingface_hub.hf_hub_download(
            "kartiknarayan/facexformer",
            "ckpts/model.pt",
            repo_type="model",
            local_dir=".",
        )
    checkpoint = torch.load(weights_path, map_location=device)
    # model.load_state_dict(checkpoint)
    model.load_state_dict(checkpoint["state_dict_backbone"])
    model = model.eval()
    model = model.to(dtype=dtype)
    # model = torch.compile(model, mode="reduce-overhead")
    return model


model = load_model(weights_path)


def adjust_bbox(
    x_min, y_min, x_max, y_max, image_width, image_height, margin_percentage=50
):
    """Grow a box by `margin_percentage` of its size, clamped to the image bounds."""
    width = x_max - x_min
    height = y_max - y_min
    increase_width = width * (margin_percentage / 100.0) / 2
    increase_height = height * (margin_percentage / 100.0) / 2
    x_min_adjusted = int(max(0, x_min - increase_width))
    y_min_adjusted = int(max(0, y_min - increase_height))
    x_max_adjusted = int(min(image_width, x_max + increase_width))
    y_max_adjusted = int(min(image_height, y_max + increase_height))
    return x_min_adjusted, y_min_adjusted, x_max_adjusted, y_max_adjusted


def denorm_points(points, h, w, align_corners=False):
    """Map landmarks from normalized [-1, 1] coordinates to pixel coordinates."""
    if align_corners:
        denormed = (
            (points + 1) / 2 * torch.tensor([w - 1, h - 1]).to(points).view(1, 1, 2)
        )
    else:
        denormed = (
            (points + 1) * torch.tensor([w, h]).to(points).view(1, 1, 2) - 1
        ) / 2
    return denormed
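
# The two branches mirror grid_sample's align_corners conventions: with
# align_corners=False, normalized -1/+1 map to pixel edges -0.5 and w - 0.5,
# while with align_corners=True they map to pixel centers 0 and w - 1. E.g.
# for x = 1, w = 224: ((1 + 1) * 224 - 1) / 2 = 223.5 vs (1 + 1) / 2 * 223 = 223.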


@dataclass
class BoundingBox:
    x_min: int
    y_min: int
    x_max: int
    y_max: int


@dataclass
class FaceImg:
    image: np.ndarray
    x_min: int  # top-left corner of the crop within the original image
    y_min: int


def get_faces_img(img: np.ndarray, boxes: list[BoundingBox]):
    """Crop each detected face out of `img`, padding the boxes first."""
    if boxes is None or len(boxes) == 0:
        return []
    results: list[FaceImg] = []
    for box in boxes:
        x_min, y_min, x_max, y_max = box.x_min, box.y_min, box.x_max, box.y_max
        # Pad the box so the crop keeps some context around the face.
        x_min, y_min, x_max, y_max = adjust_bbox(
            x_min, y_min, x_max, y_max, img.shape[1], img.shape[0]
        )
        image = img[y_min:y_max, x_min:x_max]
        results.append(FaceImg(image, int(x_min), int(y_min)))
    return results


@dataclass
class Face:
    image: torch.Tensor
    x_min: int
    y_min: int
    original_w: int
    original_h: int


def get_faces(img: np.ndarray, boxes: list[BoundingBox]):
    """Crop and preprocess faces, keeping each crop's offset and original size."""
    face_images = get_faces_img(img, boxes)
    return [
        Face(
            transforms_image(face_image.image),
            face_image.x_min,
            face_image.y_min,
            face_image.image.shape[1],
            face_image.image.shape[0],
        )
        for face_image in face_images
    ]


def get_landmarks(faces: list[Face]):
    """Run FaceXFormer on a batch of faces; return 68 (x, y) landmark tuples
    per face in original-image pixel coordinates."""
    if len(faces) == 0:
        return []
    images = torch.stack([face.image for face in faces]).to(device=device, dtype=dtype)
    # Task id 1 selects landmark estimation in FaceXFormer's task indexing.
    tasks = torch.tensor([1] * len(faces), device=device, dtype=dtype)
    with torch.inference_mode():
        # with torch.amp.autocast("cuda"):
        (
            batch_landmarks,
            headposes,
            attributes,
            visibilities,
            ages,
            genders,
            races,
            segs,
        ) = model.predict(images, None, tasks)
    # Landmarks come back normalized to [-1, 1] within each crop: denormalize
    # to crop pixels, then shift by the crop's offset in the original image.
    batch_denormed = [
        denorm_points(landmarks, face.original_h, face.original_w)[0]
        for landmarks, face in zip(batch_landmarks.view(-1, 68, 2), faces)
    ]
    results = []
    for landmarks, face in zip(batch_denormed, faces):
        results.append(
            [(int(x + face.x_min), int(y + face.y_min)) for x, y in landmarks]
        )
    return results
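

# A minimal usage sketch (an assumption-laden example, not part of the original
# pipeline): "face.jpg" is a hypothetical file, and the whole-frame box stands
# in for real detector output such as the commented-out mediapipe setup above.
if __name__ == "__main__":
    from PIL import Image

    img = np.array(Image.open("face.jpg").convert("RGB"))
    h, w = img.shape[:2]
    boxes = [BoundingBox(x_min=0, y_min=0, x_max=w, y_max=h)]
    faces = get_faces(img, boxes)
    for points in get_landmarks(faces):
        print(len(points), points[:3])  # 68 (x, y) tuples per face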