multimodalart (HF Staff) committed
Commit 98b03ff · verified · 1 parent: 351420d

Update app.py

Files changed (1): app.py (+20, -23)
app.py CHANGED
```diff
@@ -65,12 +65,6 @@ from diffusers.models import AutoencoderKLCogVideoX
 from transformers import SiglipImageProcessor, SiglipVisionModel
 from diffposetalk.diffposetalk import DiffPoseTalk
 
-def cleanup_resources():
-    """Clear CUDA cache and garbage collect"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        gc.collect()
-
 # Helper functions from the original script
 def parse_video(driving_frames, max_frame_num, fps=25):
     video_length = len(driving_frames)
```
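For reference, the removed helper was a guarded wrapper around two cleanup calls; a minimal self-contained sketch of what it did (note that the inline replacement in the last hunk below drops the `torch.cuda.is_available()` guard):

```python
import gc

import torch


def cleanup_resources():
    """Clear CUDA cache and garbage collect (the helper this commit removes)."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached allocator blocks to the driver
        gc.collect()              # free unreachable Python objects
```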
```diff
@@ -180,8 +174,8 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     final_output_path = temp_output_file.name
 
     # Set seed
-    seed = 43
-    generator = torch.Generator(device="cuda").manual_seed(seed)
+    # seed = 43
+    # generator = torch.Generator(device="cuda").manual_seed(seed)
 
     progress(0.2, desc="Processing image...")
     # Load and process image
```
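With the hard-coded seed and generator commented out, sampling falls back to torch's global RNG, so each run produces a different result. If reproducibility were wanted back, one option (hypothetical, not part of this commit) is an optional seed helper:

```python
import torch


def make_generator(seed=None, device="cuda"):
    """Hypothetical helper: return a seeded torch.Generator for reproducible
    runs, or None so the pipeline draws from torch's global RNG."""
    if seed is None:
        return None
    return torch.Generator(device=device).manual_seed(seed)
```

Passing `generator=make_generator(43)` to `pipe(...)` would restore the old fixed-seed behavior on demand.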
```diff
@@ -244,20 +238,20 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
 
     progress(0.6, desc="Generating animation (this may take a while)...")
     # Generate video
-    #with torch.no_grad():
-    sample = pipe(
-        image=image,
-        image_face=image_face,
-        control_video=input_video,
-        prompt="",
-        negative_prompt="",
-        height=480,
-        width=720,
-        num_frames=49,
-        generator=generator,
-        guidance_scale=guidance_scale,
-        num_inference_steps=steps,
-    )
+    with torch.no_grad():
+        sample = pipe(
+            image=image,
+            image_face=image_face,
+            control_video=input_video,
+            prompt="",
+            negative_prompt="",
+            height=480,
+            width=720,
+            num_frames=49,
+            # generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=steps,
+        )
     out_samples = sample.frames[0]
 
     out_samples = out_samples[2:] # Skip first two frames
```
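Wrapping the `pipe(...)` call in `torch.no_grad()` stops autograd from recording the forward pass, so intermediate activations are not kept around for a backward pass and peak memory during inference drops. A tiny illustration of the effect:

```python
import torch

model = torch.nn.Linear(8, 8)
x = torch.randn(1, 8)

with torch.no_grad():
    y = model(x)  # no autograd graph is recorded inside this block

print(y.requires_grad)  # False
```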
```diff
@@ -290,7 +284,10 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     comparison_with_audio = save_video_with_audio(comparison_path, audio_path, comparison_with_audio)
 
     progress(1.0, desc="Done!")
-    cleanup_resources()
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
     return result_path, comparison_with_audio
 
 # Create Gradio interface
```
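The cleanup now runs inline just before the return, which means it is skipped if generation raises partway through. A possible hardening (an assumption on my part, not what this commit does) would be to run it in a `finally` block:

```python
import gc

import torch


def run_with_cleanup(run_pipeline):
    """Hypothetical wrapper: clear the CUDA cache even when generation fails."""
    try:
        return run_pipeline()
    finally:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
```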
 