KingNish committed on
Commit
41c24bd
·
verified ·
1 Parent(s): 2f5046d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -55
app.py CHANGED
@@ -29,7 +29,7 @@ config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
29
  with open(config_file_path, "r") as file:
30
  PIPELINE_CONFIG_YAML = yaml.safe_load(file)
31
  LTX_REPO = "Lightricks/LTX-Video"
32
- MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280)
33
  MAX_NUM_FRAMES = 257
34
  FPS = 24.0
35
 
@@ -65,52 +65,42 @@ def video_to_url(video_path):
65
  def calculate_new_dimensions(orig_w, orig_h):
66
  """
67
  Calculates new dimensions for height and width based on original media dimensions.
 
 
 
 
68
  """
69
  if orig_w == 0 or orig_h == 0:
70
  return TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
71
- if orig_w >= orig_h: # Landscape or square
72
- new_h = TARGET_FIXED_SIDE
73
- aspect_ratio = orig_w / orig_h
74
- new_w_ideal = new_h * aspect_ratio
75
- new_w = round(new_w_ideal / 32) * 32
76
- new_w = max(256, min(new_w, MAX_IMAGE_SIZE))
77
- else: # Portrait
78
- new_w = TARGET_FIXED_SIDE
79
- aspect_ratio = orig_h / orig_w
80
- new_h_ideal = new_w * aspect_ratio
81
- new_h = round(new_h_ideal / 32) * 32
82
- new_h = max(256, min(new_h, MAX_IMAGE_SIZE))
83
- return int(new_h), int(new_w)
84
-
85
- def resize_and_crop_image(image_path, target_width, target_height):
 
 
 
 
 
 
 
 
86
  """
87
- Resize and crop an image to match the target dimensions.
 
88
  """
89
  img = Image.open(image_path)
90
- orig_w, orig_h = img.size
91
-
92
- # Calculate the aspect ratios
93
- orig_ratio = orig_w / orig_h
94
- target_ratio = target_width / target_height
95
-
96
- # Determine the crop dimensions
97
- if orig_ratio > target_ratio:
98
- # Crop width to match target ratio
99
- new_w = int(orig_h * target_ratio)
100
- new_h = orig_h
101
- else:
102
- # Crop height to match target ratio
103
- new_w = orig_w
104
- new_h = int(orig_w / target_ratio)
105
-
106
- # Calculate crop coordinates
107
- left = (orig_w - new_w) / 2
108
- top = (orig_h - new_h) / 2
109
- right = (orig_w + new_w) / 2
110
- bottom = (orig_h + new_h) / 2
111
-
112
- # Crop and resize
113
- img = img.crop((left, top, right, bottom))
114
  img = img.resize((target_width, target_height), Image.LANCZOS)
115
 
116
  # Save to temporary file
@@ -129,7 +119,6 @@ def initialize_models():
129
  local_dir_use_symlinks=False
130
  )
131
  PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
132
-
133
  spatial_upscaler_actual_path = hf_hub_download(
134
  repo_id=LTX_REPO,
135
  filename=PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
@@ -137,7 +126,6 @@ def initialize_models():
137
  local_dir_use_symlinks=False
138
  )
139
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
140
-
141
  print("Creating LTX Video pipeline on CPU...")
142
  pipeline_instance = create_ltx_video_pipeline(
143
  ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
@@ -149,14 +137,12 @@ def initialize_models():
149
  prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
150
  prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
151
  )
152
-
153
  if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
154
  print("Creating latent upsampler on CPU...")
155
  latent_upsampler_instance = create_latent_upsampler(
156
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
157
  device="cpu"
158
  )
159
-
160
  target_inference_device = "cuda"
161
  print(f"Target inference device: {target_inference_device}")
162
  pipeline_instance.to(target_inference_device)
@@ -182,6 +168,7 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
182
 
183
  if randomize_seed:
184
  seed_ui = random.randint(0, 2**32 - 1)
 
185
  seed_everething(int(seed_ui))
186
 
187
  # Calculate target frames
@@ -219,8 +206,9 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
219
  else:
220
  actual_height, actual_width = TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
221
 
222
- height_padded = ((actual_height - 1) // 32 + 1) * 32
223
- width_padded = ((actual_width - 1) // 32 + 1) * 32
 
224
  num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
225
  padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
226
 
@@ -264,9 +252,19 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
264
  # Add initial frame conditioning if provided
265
  if input_image_filepath:
266
  try:
 
 
 
 
 
267
  media_tensor = load_image_to_tensor_with_resize_and_crop(
268
- input_image_filepath, actual_height, actual_width
269
  )
 
 
 
 
 
270
  media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
271
  conditioning_items.append(ConditioningItem(media_tensor.to("cuda"), 0, 1.0))
272
  except Exception as e:
@@ -276,19 +274,23 @@ def generate(prompt, input_image_url=None, final_image_url=None, duration_ui=2,
276
  # Add final frame conditioning if provided
277
  if final_image_filepath:
278
  try:
279
- # Resize and crop final image to match initial image dimensions
280
- resized_final_path = resize_and_crop_image(
281
  final_image_filepath, actual_width, actual_height
282
  )
 
 
 
283
  final_media_tensor = load_image_to_tensor_with_resize_and_crop(
284
  resized_final_path, actual_height, actual_width
285
  )
286
- final_media_tensor = torch.nn.functional.pad(final_media_tensor, padding_values)
287
- conditioning_items.append(ConditioningItem(final_media_tensor.to("cuda"), num_frames_padded - 1, 1.0))
288
 
289
  # Clean up temporary file
290
  if os.path.exists(resized_final_path):
291
  os.remove(resized_final_path)
 
 
 
292
  except Exception as e:
293
  print(f"Error loading final image: {e}")
294
  raise gr.Error(f"Could not load final image: {e}")
@@ -382,7 +384,6 @@ css = """
382
  with gr.Blocks(css=css) as demo:
383
  gr.Markdown("# LTX Video Generator")
384
  gr.Markdown("Generate videos from images using AI. Provide at least one input image (first frame or last frame) and a prompt.")
385
-
386
  with gr.Row():
387
  with gr.Column():
388
  gr.Markdown("### Input Options")
@@ -398,14 +399,12 @@ with gr.Blocks(css=css) as demo:
398
  info="Target video duration (1s to 8s)"
399
  )
400
  generate_button = gr.Button("Generate Video", variant="primary")
401
-
402
  with gr.Column():
403
  gr.Markdown("### Output")
404
  video_output = gr.Textbox(label="Generated Video URL", interactive=False)
405
  video_preview = gr.Video(label="Video Preview", interactive=False, visible=False)
406
 
407
  gr.Markdown("**Note:** You must provide at least one input image (either first frame or last frame).")
408
-
409
  generate_button.click(
410
  fn=generate,
411
  inputs=[prompt_input, input_image_input, final_image_input, duration_input],
 
29
  with open(config_file_path, "r") as file:
30
  PIPELINE_CONFIG_YAML = yaml.safe_load(file)
31
  LTX_REPO = "Lightricks/LTX-Video"
32
+ MAX_IMAGE_SIZE = 1024
33
  MAX_NUM_FRAMES = 257
34
  FPS = 24.0
35
 
 
65
  def calculate_new_dimensions(orig_w, orig_h):
66
  """
67
  Calculates new dimensions for height and width based on original media dimensions.
68
+ Rules:
69
+ 1. If any dimension > 1024, resize so max dimension is 1024, maintaining aspect ratio.
70
+ 2. Then ensure both dimensions are multiples of 32 by rounding to the nearest multiple.
71
+ 3. Ensure dimensions are within [256, 1024].
72
  """
73
  if orig_w == 0 or orig_h == 0:
74
  return TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
75
+
76
+ # Step 1: Handle dimensions > 1024
77
+ new_w, new_h = orig_w, orig_h
78
+ if max(orig_w, orig_h) > 1024:
79
+ max_dim = max(orig_w, orig_h)
80
+ scale = 1024 / max_dim
81
+ new_w = int(orig_w * scale)
82
+ new_h = int(orig_h * scale)
83
+
84
+ # Step 2: Round to nearest multiples of 32
85
+ def round_to_multiple(x, multiple=32):
86
+ return round(x / multiple) * multiple
87
+
88
+ new_w = round_to_multiple(new_w)
89
+ new_h = round_to_multiple(new_h)
90
+
91
+ # Step 3: Ensure within bounds
92
+ new_w = max(256, min(new_w, MAX_IMAGE_SIZE))
93
+ new_h = max(256, min(new_h, MAX_IMAGE_SIZE))
94
+
95
+ return new_h, new_w
96
+
97
+ def resize_and_squash_image(image_path, target_width, target_height):
98
  """
99
+ Resize and potentially squash/stretch an image to the exact target dimensions.
100
+ This ensures no cropping will be needed later.
101
  """
102
  img = Image.open(image_path)
103
+ # Resize to exact dimensions, possibly distorting aspect ratio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  img = img.resize((target_width, target_height), Image.LANCZOS)
105
 
106
  # Save to temporary file
 
119
  local_dir_use_symlinks=False
120
  )
121
  PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
 
122
  spatial_upscaler_actual_path = hf_hub_download(
123
  repo_id=LTX_REPO,
124
  filename=PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
 
126
  local_dir_use_symlinks=False
127
  )
128
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
 
129
  print("Creating LTX Video pipeline on CPU...")
130
  pipeline_instance = create_ltx_video_pipeline(
131
  ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
 
137
  prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
138
  prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
139
  )
 
140
  if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
141
  print("Creating latent upsampler on CPU...")
142
  latent_upsampler_instance = create_latent_upsampler(
143
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
144
  device="cpu"
145
  )
 
146
  target_inference_device = "cuda"
147
  print(f"Target inference device: {target_inference_device}")
148
  pipeline_instance.to(target_inference_device)
 
168
 
169
  if randomize_seed:
170
  seed_ui = random.randint(0, 2**32 - 1)
171
+
172
  seed_everething(int(seed_ui))
173
 
174
  # Calculate target frames
 
206
  else:
207
  actual_height, actual_width = TARGET_FIXED_SIDE, TARGET_FIXED_SIDE
208
 
209
+ # Since we're handling all resizing ourselves, we don't need padding
210
+ height_padded = actual_height
211
+ width_padded = actual_width
212
  num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
213
  padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
214
 
 
252
  # Add initial frame conditioning if provided
253
  if input_image_filepath:
254
  try:
255
+ # First resize and squash the image to the exact dimensions we want
256
+ resized_image_path = resize_and_squash_image(input_image_filepath, actual_width, actual_height)
257
+
258
+ # Now load this pre-resized image with load_image_to_tensor_with_resize_and_crop
259
+ # Since it's already the correct size, the "crop" part will be a no-op
260
  media_tensor = load_image_to_tensor_with_resize_and_crop(
261
+ resized_image_path, actual_height, actual_width
262
  )
263
+
264
+ # Clean up temporary file
265
+ if os.path.exists(resized_image_path):
266
+ os.remove(resized_image_path)
267
+
268
  media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
269
  conditioning_items.append(ConditioningItem(media_tensor.to("cuda"), 0, 1.0))
270
  except Exception as e:
 
274
  # Add final frame conditioning if provided
275
  if final_image_filepath:
276
  try:
277
+ # First resize and squash the final image to match the initial image dimensions
278
+ resized_final_path = resize_and_squash_image(
279
  final_image_filepath, actual_width, actual_height
280
  )
281
+
282
+ # Now load this pre-resized image with load_image_to_tensor_with_resize_and_crop
283
+ # Since it's already the correct size, the "crop" part will be a no-op
284
  final_media_tensor = load_image_to_tensor_with_resize_and_crop(
285
  resized_final_path, actual_height, actual_width
286
  )
 
 
287
 
288
  # Clean up temporary file
289
  if os.path.exists(resized_final_path):
290
  os.remove(resized_final_path)
291
+
292
+ final_media_tensor = torch.nn.functional.pad(final_media_tensor, padding_values)
293
+ conditioning_items.append(ConditioningItem(final_media_tensor.to("cuda"), num_frames_padded - 1, 1.0))
294
  except Exception as e:
295
  print(f"Error loading final image: {e}")
296
  raise gr.Error(f"Could not load final image: {e}")
 
384
  with gr.Blocks(css=css) as demo:
385
  gr.Markdown("# LTX Video Generator")
386
  gr.Markdown("Generate videos from images using AI. Provide at least one input image (first frame or last frame) and a prompt.")
 
387
  with gr.Row():
388
  with gr.Column():
389
  gr.Markdown("### Input Options")
 
399
  info="Target video duration (1s to 8s)"
400
  )
401
  generate_button = gr.Button("Generate Video", variant="primary")
 
402
  with gr.Column():
403
  gr.Markdown("### Output")
404
  video_output = gr.Textbox(label="Generated Video URL", interactive=False)
405
  video_preview = gr.Video(label="Video Preview", interactive=False, visible=False)
406
 
407
  gr.Markdown("**Note:** You must provide at least one input image (either first frame or last frame).")
 
408
  generate_button.click(
409
  fn=generate,
410
  inputs=[prompt_input, input_image_input, final_image_input, duration_input],