multimodalart (HF Staff) committed
Commit 98b03ff · verified · 1 parent: 351420d

Update app.py

Files changed (1): app.py (+20, -23)
app.py CHANGED
```diff
@@ -65,12 +65,6 @@ from diffusers.models import AutoencoderKLCogVideoX
 from transformers import SiglipImageProcessor, SiglipVisionModel
 from diffposetalk.diffposetalk import DiffPoseTalk
 
-def cleanup_resources():
-    """Clear CUDA cache and garbage collect"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        gc.collect()
-
 # Helper functions from the original script
 def parse_video(driving_frames, max_frame_num, fps=25):
     video_length = len(driving_frames)
```
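For reference, the removed helper was a guarded wrapper around two cleanup calls; a minimal self-contained sketch of what it did (note that the inline replacement in the last hunk below drops the `torch.cuda.is_available()` guard):

```python
import gc

import torch


def cleanup_resources():
    """Clear CUDA cache and garbage collect (the helper this commit removes)."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached allocator blocks to the driver
        gc.collect()              # free unreachable Python objects
```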
```diff
@@ -180,8 +174,8 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     final_output_path = temp_output_file.name
 
     # Set seed
-    seed = 43
-    generator = torch.Generator(device="cuda").manual_seed(seed)
+    # seed = 43
+    # generator = torch.Generator(device="cuda").manual_seed(seed)
 
     progress(0.2, desc="Processing image...")
     # Load and process image
```
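With the hard-coded seed and generator commented out, sampling falls back to torch's global RNG, so each run produces a different result. If reproducibility were wanted back, one option (hypothetical, not part of this commit) is an optional seed helper:

```python
import torch


def make_generator(seed=None, device="cuda"):
    """Hypothetical helper: return a seeded torch.Generator for reproducible
    runs, or None so the pipeline draws from torch's global RNG."""
    if seed is None:
        return None
    return torch.Generator(device=device).manual_seed(seed)
```

Passing `generator=make_generator(43)` to `pipe(...)` would restore the old fixed-seed behavior on demand.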
```diff
@@ -244,20 +238,20 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
 
     progress(0.6, desc="Generating animation (this may take a while)...")
     # Generate video
-    #with torch.no_grad():
-    sample = pipe(
-        image=image,
-        image_face=image_face,
-        control_video=input_video,
-        prompt="",
-        negative_prompt="",
-        height=480,
-        width=720,
-        num_frames=49,
-        generator=generator,
-        guidance_scale=guidance_scale,
-        num_inference_steps=steps,
-    )
+    with torch.no_grad():
+        sample = pipe(
+            image=image,
+            image_face=image_face,
+            control_video=input_video,
+            prompt="",
+            negative_prompt="",
+            height=480,
+            width=720,
+            num_frames=49,
+            # generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=steps,
+        )
     out_samples = sample.frames[0]
 
     out_samples = out_samples[2:] # Skip first two frames
```
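Wrapping the `pipe(...)` call in `torch.no_grad()` stops autograd from recording the forward pass, so intermediate activations are not kept around for a backward pass and peak memory during inference drops. A tiny illustration of the effect:

```python
import torch

model = torch.nn.Linear(8, 8)
x = torch.randn(1, 8)

with torch.no_grad():
    y = model(x)  # no autograd graph is recorded inside this block

print(y.requires_grad)  # False
```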
```diff
@@ -290,7 +284,10 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     comparison_with_audio = save_video_with_audio(comparison_path, audio_path, comparison_with_audio)
 
     progress(1.0, desc="Done!")
-    cleanup_resources()
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
     return result_path, comparison_with_audio
 
 # Create Gradio interface
```
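The cleanup now runs inline just before the return, which means it is skipped if generation raises partway through. A possible hardening (an assumption on my part, not what this commit does) would be to run it in a `finally` block:

```python
import gc

import torch


def run_with_cleanup(run_pipeline):
    """Hypothetical wrapper: clear the CUDA cache even when generation fails."""
    try:
        return run_pipeline()
    finally:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
```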
 