Delete src/generation.py
src/generation.py (+0 -128, DELETED)
@@ -1,128 +0,0 @@
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

import warnings
warnings.filterwarnings("ignore")

import torch
import torchvision
import torchvision.io as vision_io
from PIL import Image

from models.pipelines import TextToVideoSDPipelineSpatialAware
from diffusers.utils import export_to_video

OUTPUT_PATH = "/scr/demo"


def generate_video(pipe, overall_prompt, latents, get_latents=False, num_frames=24,
                   num_inference_steps=50, fg_masks=None, fg_masked_latents=None,
                   frozen_steps=0, frozen_prompt=None, custom_attention_mask=None,
                   fg_prompt=None):
    # Run the spatially aware pipeline; the masks mark the foreground boxes
    # that the pipeline keeps frozen for the first `frozen_steps` denoising steps.
    video_frames = pipe(overall_prompt, num_frames=num_frames, latents=latents,
                        num_inference_steps=num_inference_steps, frozen_mask=fg_masks,
                        frozen_steps=frozen_steps, latents_all_input=fg_masked_latents,
                        frozen_prompt=frozen_prompt, custom_attention_mask=custom_attention_mask,
                        fg_prompt=fg_prompt, make_attention_mask_2d=True,
                        attention_mask_block_diagonal=True, height=320, width=576).frames
    if get_latents:
        video_latents = pipe(overall_prompt, num_frames=num_frames, latents=latents,
                             num_inference_steps=num_inference_steps, output_type="latent").frames
        return video_frames, video_latents

    return video_frames


def save_frames(path):
    # read_video returns frames as a T x H x W x C uint8 tensor.
    video, _, _ = vision_io.read_video(f"{path}.mp4", pts_unit='sec')
    num_frames = video.size(0)

    # Save each frame as a PNG.
    os.makedirs(path, exist_ok=True)
    for i in range(num_frames):
        frame = video[i].numpy()  # already H x W x C, no permute needed
        img = Image.fromarray(frame.astype('uint8'))
        img.save(f"{path}/frame_{i:04d}.png")


if __name__ == "__main__":
    # Example usage
    num_frames = 24
    save_path = "video"
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    try:
        pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
            "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)
    except Exception:
        # Retry once; the first attempt can fail on a transient download error.
        pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
            "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)

    # Two per-frame bounding-box masks over the 40 x 72 latent grid
    # (one-eighth of the 320 x 576 output resolution).
    bbox_mask = torch.zeros([num_frames, 1, 40, 72], device=torch_device)
    bbox_mask_2 = torch.zeros([num_frames, 1, 40, 72], device=torch_device)

    # Note: x_* indexes the height axis of the latent grid and y_* the width axis.
    x_start = [10 + (i % 3) for i in range(num_frames)]  # slight per-frame drift
    x_end = [30 + (i % 3) for i in range(num_frames)]    # slight per-frame drift
    y_start = [10 for _ in range(num_frames)]            # static horizontal extent
    y_end = [25 for _ in range(num_frames)]

    # Set the masks to one inside each bounding box; the second box mirrors
    # the first across the vertical midline of the frame.
    for i in range(num_frames):
        bbox_mask[i, :, x_start[i]:x_end[i], y_start[i]:y_end[i]] = 1
        bbox_mask_2[i, :, x_start[i]:x_end[i], 72 - y_end[i]:72 - y_start[i]] = 1

    fg_masks = [bbox_mask, bbox_mask_2]

    frozen_prompt = None
    fg_masked_latents = None
    prompts = [
        (["cat", "goldfish bowl"], "A cat curiously staring at a goldfish bowl on a sunny windowsill."),
        (["Superman", "Batman"], "Superman and Batman standing side by side in a heroic pose against a city skyline."),
        (["rose", "daisy"], "A rose and a daisy in a small vase on a rustic wooden table."),
        (["Harry Potter", "Hermione Granger"], "Harry Potter and Hermione Granger studying a magical map."),
        (["butterfly", "dragonfly"], "A butterfly and a dragonfly resting on a leaf in a vibrant garden."),
        (["teddy bear", "toy train"], "A teddy bear and a toy train on a child's playmat in a brightly lit room."),
        (["frog", "turtle"], "A frog and a turtle sitting on a lily pad in a serene pond."),
        (["Mickey Mouse", "Donald Duck"], "Mickey Mouse and Donald Duck enjoying a day at the beach, building a sandcastle."),
        (["penguin", "seal"], "A penguin and a seal lounging on an iceberg in the Antarctic."),
        (["lion", "zebra"], "A lion and a zebra peacefully drinking water from the same pond in the savannah.")
    ]

    for fg_object, overall_prompt in prompts:
        # Save the mask frames for inspection alongside the video.
        os.makedirs(f"{OUTPUT_PATH}/{save_path}/{overall_prompt}-mask", exist_ok=True)
        try:
            for i in range(num_frames):
                torchvision.utils.save_image(fg_masks[0][i, 0], f"{OUTPUT_PATH}/{save_path}/{overall_prompt}-mask/frame_{i:04d}_0.png")
                torchvision.utils.save_image(fg_masks[1][i, 0], f"{OUTPUT_PATH}/{save_path}/{overall_prompt}-mask/frame_{i:04d}_1.png")
        except Exception:
            pass  # mask dumps are optional; don't abort the run if they fail
        print(fg_object, overall_prompt)

        seed = 2
        random_latents = torch.randn([1, 4, num_frames, 40, 72],
                                     generator=torch.Generator().manual_seed(seed)).to(torch_device)
        for num_inference_steps in range(40, 50, 10):  # i.e. a single setting of 40 steps
            for frozen_steps in [0, 1, 2]:
                video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False,
                                              num_frames=num_frames, num_inference_steps=num_inference_steps,
                                              fg_masks=fg_masks, fg_masked_latents=fg_masked_latents,
                                              frozen_steps=frozen_steps, frozen_prompt=frozen_prompt,
                                              fg_prompt=fg_object)
                # Export the video and dump its individual frames.
                os.makedirs(f"{OUTPUT_PATH}/{save_path}/{overall_prompt}", exist_ok=True)
                video_path = export_to_video(video_frames, f"{OUTPUT_PATH}/{save_path}/{overall_prompt}/{frozen_steps}_of_{num_inference_steps}_{seed}_masked.mp4")
                save_frames(f"{OUTPUT_PATH}/{save_path}/{overall_prompt}/{frozen_steps}_of_{num_inference_steps}_{seed}_masked")
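Since this commit removes the file, a small self-contained sketch may help readers who want to see how one of the bounding-box masks above tiles the 40 x 72 latent grid. The tensor layout and index arithmetic below mirror the deleted script; the coverage check at the end is an illustrative addition, not part of the original file.

import torch

num_frames = 24
# Same layout as the deleted script: [frames, channels, latent height, latent width].
mask = torch.zeros([num_frames, 1, 40, 72])
for i in range(num_frames):
    # As in the script, the x_* values index the height axis and y_* the width axis.
    h0, h1 = 10 + (i % 3), 30 + (i % 3)  # box drifts one latent row per frame, cycling over 3 positions
    w0, w1 = 10, 25                      # fixed horizontal extent
    mask[i, :, h0:h1, w0:w1] = 1

# Fraction of the latent grid covered by the box in each frame:
# (20 rows x 15 columns) / (40 x 72) = ~0.104, constant across frames.
coverage = mask.mean(dim=(1, 2, 3))
print(coverage)

Working at latent resolution keeps the masks cheap: the 320 x 576 output maps onto the 40 x 72 grid through the VAE's 8x spatial downsampling, so each mask cell corresponds to an 8 x 8 pixel block.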