Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -29,7 +29,7 @@ config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
 LTX_REPO = "Lightricks/LTX-Video"
-MAX_IMAGE_SIZE =
+MAX_IMAGE_SIZE = 1024
 MAX_NUM_FRAMES = 257
 FPS = 24.0
 
@@ -65,52 +65,42 @@ def video_to_url(video_path):
 def calculate_new_dimensions(orig_w, orig_h):
     """
     Calculates new dimensions for height and width based on original media dimensions.
+    Rules:
+    1. If any dimension > 1024, resize so max dimension is 1024, maintaining aspect ratio.
+    2. Then ensure both dimensions are multiples of 32 by rounding to the nearest multiple.
+    3. Ensure dimensions are within [256, 1024].
     """
     if orig_w == 0 or orig_h == 0:
         return TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
-    (15 removed lines not rendered in the diff view)
+
+    # Step 1: Handle dimensions > 1024
+    new_w, new_h = orig_w, orig_h
+    if max(orig_w, orig_h) > 1024:
+        max_dim = max(orig_w, orig_h)
+        scale = 1024 / max_dim
+        new_w = int(orig_w * scale)
+        new_h = int(orig_h * scale)
+
+    # Step 2: Round to nearest multiples of 32
+    def round_to_multiple(x, multiple=32):
+        return round(x / multiple) * multiple
+
+    new_w = round_to_multiple(new_w)
+    new_h = round_to_multiple(new_h)
+
+    # Step 3: Ensure within bounds
+    new_w = max(256, min(new_w, MAX_IMAGE_SIZE))
+    new_h = max(256, min(new_h, MAX_IMAGE_SIZE))
+
+    return new_h, new_w
+
+def resize_and_squash_image(image_path, target_width, target_height):
     """
-    Resize and
+    Resize and potentially squash/stretch an image to the exact target dimensions.
+    This ensures no cropping will be needed later.
     """
     img = Image.open(image_path)
-    orig_w, orig_h = img.size
-
-    # Calculate the aspect ratios
-    orig_ratio = orig_w / orig_h
-    target_ratio = target_width / target_height
-
-    # Determine the crop dimensions
-    if orig_ratio > target_ratio:
-        # Crop width to match target ratio
-        new_w = int(orig_h * target_ratio)
-        new_h = orig_h
-    else:
-        # Crop height to match target ratio
-        new_w = orig_w
-        new_h = int(orig_w / target_ratio)
-
-    # Calculate crop coordinates
-    left = (orig_w - new_w) / 2
-    top = (orig_h - new_h) / 2
-    right = (orig_w + new_w) / 2
-    bottom = (orig_h + new_h) / 2
-
-    # Crop and resize
-    img = img.crop((left, top, right, bottom))
+    # Resize to exact dimensions, possibly distorting aspect ratio
    img = img.resize((target_width, target_height), Image.LANCZOS)
 
     # Save to temporary file
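
A quick sanity check of the new sizing rules (a hypothetical snippet, assuming the functions above, MAX_IMAGE_SIZE = 1024, and illustrative sample dimensions):

    # 1920x1080 input: long side scaled toward 1024, then rounded to
    # multiples of 32 -> 1024x576, already inside [256, 1024].
    assert calculate_new_dimensions(1920, 1080) == (576, 1024)  # returns (h, w)

    # 100x100 input: no downscale needed; round(100 / 32) * 32 = 96,
    # which the bounds check then raises to the 256 floor.
    assert calculate_new_dimensions(100, 100) == (256, 256)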
@@ -129,7 +119,6 @@ def initialize_models():
         local_dir_use_symlinks=False
     )
     PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
-
     spatial_upscaler_actual_path = hf_hub_download(
         repo_id=LTX_REPO,
         filename=PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
@@ -137,7 +126,6 @@ def initialize_models():
         local_dir_use_symlinks=False
     )
     PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
-
     print("Creating LTX Video pipeline on CPU...")
     pipeline_instance = create_ltx_video_pipeline(
         ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
@@ -149,14 +137,12 @@ def initialize_models():
         prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
         prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
     )
-
     if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
         print("Creating latent upsampler on CPU...")
         latent_upsampler_instance = create_latent_upsampler(
             PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
             device="cpu"
         )
-
     target_inference_device = "cuda"
     print(f"Target inference device: {target_inference_device}")
     pipeline_instance.to(target_inference_device)
@@ -182,6 +168,7 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
 
     if randomize_seed:
         seed_ui = random.randint(0, 2**32 - 1)
+
     seed_everething(int(seed_ui))
 
     # Calculate target frames
@@ -219,8 +206,9 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
     else:
         actual_height, actual_width = TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
 
-    (2 removed lines not rendered in the diff view)
+    # Since we're handling all resizing ourselves, we don't need padding
+    height_padded = actual_height
+    width_padded = actual_width
     num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
     padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
 
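
With height_padded and width_padded now equal to the actual dimensions, calculate_padding should return all-zero padding, so the torch.nn.functional.pad calls further down become no-ops. Frame-count padding is unchanged and still rounds up to the form 8k + 1; a worked example with illustrative values:

    # actual_num_frames = 49 is already 8*6 + 1, so it is kept:
    #   ((49 - 2) // 8 + 1) * 8 + 1 = (5 + 1) * 8 + 1 = 49
    # actual_num_frames = 50 rounds up to the next 8k + 1:
    #   ((50 - 2) // 8 + 1) * 8 + 1 = (6 + 1) * 8 + 1 = 57

The final-frame conditioning added below then targets index num_frames_padded - 1, the last frame of the padded sequence.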
@@ -264,9 +252,19 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
     # Add initial frame conditioning if provided
     if input_image_filepath:
         try:
+            # First resize and squash the image to the exact dimensions we want
+            resized_image_path = resize_and_squash_image(input_image_filepath, actual_width, actual_height)
+
+            # Now load this pre-resized image with load_image_to_tensor_with_resize_and_crop
+            # Since it's already the correct size, the "crop" part will be a no-op
             media_tensor = load_image_to_tensor_with_resize_and_crop(
-                input_image_filepath, actual_height, actual_width
+                resized_image_path, actual_height, actual_width
             )
+
+            # Clean up temporary file
+            if os.path.exists(resized_image_path):
+                os.remove(resized_image_path)
+
             media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
             conditioning_items.append(ConditioningItem(media_tensor.to("cuda"), 0, 1.0))
         except Exception as e:
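
The pre-squash is what makes the loader's crop step a no-op: the temporary image already has exactly the requested dimensions, so there is nothing left to trim. A minimal check of that invariant (hypothetical; assumes resize_and_squash_image returns the path of the saved temporary file, as the code above suggests):

    from PIL import Image
    tmp_path = resize_and_squash_image("first_frame.png", 1024, 576)  # illustrative input
    assert Image.open(tmp_path).size == (1024, 576)  # exact target size, no crop needed

The trade-off is that off-ratio inputs are distorted (stretched or squashed) rather than center-cropped as before.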
@@ -276,19 +274,23 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
     # Add final frame conditioning if provided
     if final_image_filepath:
         try:
-            #
-            resized_final_path =
+            # First resize and squash the final image to match the initial image dimensions
+            resized_final_path = resize_and_squash_image(
                 final_image_filepath, actual_width, actual_height
             )
+
+            # Now load this pre-resized image with load_image_to_tensor_with_resize_and_crop
+            # Since it's already the correct size, the "crop" part will be a no-op
             final_media_tensor = load_image_to_tensor_with_resize_and_crop(
                 resized_final_path, actual_height, actual_width
             )
-            final_media_tensor = torch.nn.functional.pad(final_media_tensor, padding_values)
-            conditioning_items.append(ConditioningItem(final_media_tensor.to("cuda"), num_frames_padded - 1, 1.0))
 
             # Clean up temporary file
             if os.path.exists(resized_final_path):
                 os.remove(resized_final_path)
+
+            final_media_tensor = torch.nn.functional.pad(final_media_tensor, padding_values)
+            conditioning_items.append(ConditioningItem(final_media_tensor.to("cuda"), num_frames_padded - 1, 1.0))
         except Exception as e:
             print(f"Error loading final image: {e}")
             raise gr.Error(f"Could not load final image: {e}")
@@ -382,7 +384,6 @@ css = """
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# LTX Video Generator")
     gr.Markdown("Generate videos from images using AI. Provide at least one input image (first frame or last frame) and a prompt.")
-
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Input Options")
@@ -398,14 +399,12 @@ with gr.Blocks(css=css) as demo:
                 info="Target video duration (1s to 8s)"
             )
             generate_button = gr.Button("Generate Video", variant="primary")
-
         with gr.Column():
             gr.Markdown("### Output")
             video_output = gr.Textbox(label="Generated Video URL", interactive=False)
             video_preview = gr.Video(label="Video Preview", interactive=False, visible=False)
 
     gr.Markdown("**Note:** You must provide at least one input image (either first frame or last frame).")
-
     generate_button.click(
         fn=generate,
         inputs=[prompt_input, input_image_input, final_image_input, duration_input],