Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -29,7 +29,7 @@ config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
 LTX_REPO = "Lightricks/LTX-Video"
-MAX_IMAGE_SIZE =
+MAX_IMAGE_SIZE = 1024
 MAX_NUM_FRAMES = 257
 FPS = 24.0
 
@@ -65,52 +65,42 @@ def video_to_url(video_path):
 def calculate_new_dimensions(orig_w, orig_h):
     """
     Calculates new dimensions for height and width based on original media dimensions.
+    Rules:
+    1. If any dimension > 1024, resize so max dimension is 1024, maintaining aspect ratio.
+    2. Then ensure both dimensions are multiples of 32 by rounding to the nearest multiple.
+    3. Ensure dimensions are within [256, 1024].
     """
     if orig_w == 0 or orig_h == 0:
         return TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
-    (15 removed lines not rendered in the diff view)
+
+    # Step 1: Handle dimensions > 1024
+    new_w, new_h = orig_w, orig_h
+    if max(orig_w, orig_h) > 1024:
+        max_dim = max(orig_w, orig_h)
+        scale = 1024 / max_dim
+        new_w = int(orig_w * scale)
+        new_h = int(orig_h * scale)
+
+    # Step 2: Round to nearest multiples of 32
+    def round_to_multiple(x, multiple=32):
+        return round(x / multiple) * multiple
+
+    new_w = round_to_multiple(new_w)
+    new_h = round_to_multiple(new_h)
+
+    # Step 3: Ensure within bounds
+    new_w = max(256, min(new_w, MAX_IMAGE_SIZE))
+    new_h = max(256, min(new_h, MAX_IMAGE_SIZE))
+
+    return new_h, new_w
+
+def resize_and_squash_image(image_path, target_width, target_height):
     """
-    Resize and
+    Resize and potentially squash/stretch an image to the exact target dimensions.
+    This ensures no cropping will be needed later.
     """
     img = Image.open(image_path)
-    orig_w, orig_h = img.size
-
-    # Calculate the aspect ratios
-    orig_ratio = orig_w / orig_h
-    target_ratio = target_width / target_height
-
-    # Determine the crop dimensions
-    if orig_ratio > target_ratio:
-        # Crop width to match target ratio
-        new_w = int(orig_h * target_ratio)
-        new_h = orig_h
-    else:
-        # Crop height to match target ratio
-        new_w = orig_w
-        new_h = int(orig_w / target_ratio)
-
-    # Calculate crop coordinates
-    left = (orig_w - new_w) / 2
-    top = (orig_h - new_h) / 2
-    right = (orig_w + new_w) / 2
-    bottom = (orig_h + new_h) / 2
-
-    # Crop and resize
-    img = img.crop((left, top, right, bottom))
+    # Resize to exact dimensions, possibly distorting aspect ratio
    img = img.resize((target_width, target_height), Image.LANCZOS)
 
     # Save to temporary file
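
A quick sanity check of the new sizing rules (a hypothetical snippet, assuming the functions above, MAX_IMAGE_SIZE = 1024, and illustrative sample dimensions):

    # 1920x1080 input: long side scaled toward 1024, then rounded to
    # multiples of 32 -> 1024x576, already inside [256, 1024].
    assert calculate_new_dimensions(1920, 1080) == (576, 1024)  # returns (h, w)

    # 100x100 input: no downscale needed; round(100 / 32) * 32 = 96,
    # which the bounds check then raises to the 256 floor.
    assert calculate_new_dimensions(100, 100) == (256, 256)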
@@ -129,7 +119,6 @@ def initialize_models():
         local_dir_use_symlinks=False
     )
     PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
-
     spatial_upscaler_actual_path = hf_hub_download(
         repo_id=LTX_REPO,
         filename=PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
@@ -137,7 +126,6 @@ def initialize_models():
         local_dir_use_symlinks=False
     )
     PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
-
     print("Creating LTX Video pipeline on CPU...")
     pipeline_instance = create_ltx_video_pipeline(
         ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
@@ -149,14 +137,12 @@ def initialize_models():
         prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
         prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
     )
-
     if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
         print("Creating latent upsampler on CPU...")
         latent_upsampler_instance = create_latent_upsampler(
             PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
             device="cpu"
         )
-
     target_inference_device = "cuda"
     print(f"Target inference device: {target_inference_device}")
     pipeline_instance.to(target_inference_device)
@@ -182,6 +168,7 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
 
     if randomize_seed:
         seed_ui = random.randint(0, 2**32 - 1)
+
     seed_everething(int(seed_ui))
 
     # Calculate target frames
@@ -219,8 +206,9 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
     else:
         actual_height, actual_width = TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
 
-    (2 removed lines not rendered in the diff view)
+    # Since we're handling all resizing ourselves, we don't need padding
+    height_padded = actual_height
+    width_padded = actual_width
     num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
     padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
 
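
With height_padded and width_padded now equal to the actual dimensions, calculate_padding should return all-zero padding, so the torch.nn.functional.pad calls further down become no-ops. Frame-count padding is unchanged and still rounds up to the form 8k + 1; a worked example with illustrative values:

    # actual_num_frames = 49 is already 8*6 + 1, so it is kept:
    #   ((49 - 2) // 8 + 1) * 8 + 1 = (5 + 1) * 8 + 1 = 49
    # actual_num_frames = 50 rounds up to the next 8k + 1:
    #   ((50 - 2) // 8 + 1) * 8 + 1 = (6 + 1) * 8 + 1 = 57

The final-frame conditioning added below then targets index num_frames_padded - 1, the last frame of the padded sequence.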
@@ -264,9 +252,19 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
     # Add initial frame conditioning if provided
     if input_image_filepath:
         try:
+            # First resize and squash the image to the exact dimensions we want
+            resized_image_path = resize_and_squash_image(input_image_filepath, actual_width, actual_height)
+
+            # Now load this pre-resized image with load_image_to_tensor_with_resize_and_crop
+            # Since it's already the correct size, the "crop" part will be a no-op
             media_tensor = load_image_to_tensor_with_resize_and_crop(
-                input_image_filepath, actual_height, actual_width
+                resized_image_path, actual_height, actual_width
             )
+
+            # Clean up temporary file
+            if os.path.exists(resized_image_path):
+                os.remove(resized_image_path)
+
             media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
             conditioning_items.append(ConditioningItem(media_tensor.to("cuda"), 0, 1.0))
         except Exception as e:
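
The pre-squash is what makes the loader's crop step a no-op: the temporary image already has exactly the requested dimensions, so there is nothing left to trim. A minimal check of that invariant (hypothetical; assumes resize_and_squash_image returns the path of the saved temporary file, as the code above suggests):

    from PIL import Image
    tmp_path = resize_and_squash_image("first_frame.png", 1024, 576)  # illustrative input
    assert Image.open(tmp_path).size == (1024, 576)  # exact target size, no crop needed

The trade-off is that off-ratio inputs are distorted (stretched or squashed) rather than center-cropped as before.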
@@ -276,19 +274,23 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
     # Add final frame conditioning if provided
     if final_image_filepath:
         try:
-            #
-            resized_final_path =
+            # First resize and squash the final image to match the initial image dimensions
+            resized_final_path = resize_and_squash_image(
                 final_image_filepath, actual_width, actual_height
             )
+
+            # Now load this pre-resized image with load_image_to_tensor_with_resize_and_crop
+            # Since it's already the correct size, the "crop" part will be a no-op
             final_media_tensor = load_image_to_tensor_with_resize_and_crop(
                 resized_final_path, actual_height, actual_width
             )
-            final_media_tensor = torch.nn.functional.pad(final_media_tensor, padding_values)
-            conditioning_items.append(ConditioningItem(final_media_tensor.to("cuda"), num_frames_padded - 1, 1.0))
 
             # Clean up temporary file
             if os.path.exists(resized_final_path):
                 os.remove(resized_final_path)
+
+            final_media_tensor = torch.nn.functional.pad(final_media_tensor, padding_values)
+            conditioning_items.append(ConditioningItem(final_media_tensor.to("cuda"), num_frames_padded - 1, 1.0))
         except Exception as e:
             print(f"Error loading final image: {e}")
             raise gr.Error(f"Could not load final image: {e}")
@@ -382,7 +384,6 @@ css = """
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# LTX Video Generator")
     gr.Markdown("Generate videos from images using AI. Provide at least one input image (first frame or last frame) and a prompt.")
-
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Input Options")
@@ -398,14 +399,12 @@ with gr.Blocks(css=css) as demo:
                 info="Target video duration (1s to 8s)"
             )
             generate_button = gr.Button("Generate Video", variant="primary")
-
         with gr.Column():
             gr.Markdown("### Output")
             video_output = gr.Textbox(label="Generated Video URL", interactive=False)
             video_preview = gr.Video(label="Video Preview", interactive=False, visible=False)
 
     gr.Markdown("**Note:** You must provide at least one input image (either first frame or last frame).")
-
     generate_button.click(
         fn=generate,
         inputs=[prompt_input, input_image_input, final_image_input, duration_input],