File size: 6,465 Bytes
9ac31b8
aa6b486
07c1f7a
9ac31b8
3b06696
 
 
 
 
9ac31b8
 
3b06696
 
a9235bb
25d3956
e3d310b
aa6b486
07c1f7a
d56d267
9ac31b8
c237193
d56d267
9ac31b8
41915d4
8010ebe
9ac31b8
 
d56d267
a1fdd0e
d56d267
0cd72ee
9ac31b8
25d3956
3b06696
492fffc
 
3b06696
9ac31b8
d56d267
 
ff23fe9
3b06696
e792963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ac31b8
 
 
25d3956
 
9ac31b8
d56d267
9ac31b8
 
 
0cd72ee
efa319b
9ac31b8
 
 
0cd72ee
3b06696
0cd72ee
e792963
 
 
 
 
 
 
 
 
 
d56d267
 
 
3b06696
d56d267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b06696
d56d267
 
 
ff46702
d56d267
d6e8a5f
 
3b06696
25d3956
 
0cd72ee
25d3956
 
c8d4706
 
3780d1e
492fffc
 
25d3956
d56d267
492fffc
9ac31b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3184c40
9ac31b8
 
d56d267
c3af865
e792963
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import gradio as gr
import spaces
#import gradio.helpers
import torch
import os
from glob import glob
from pathlib import Path
from typing import Optional

from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image

import uuid
import random
from huggingface_hub import hf_hub_download

#gradio.helpers.CACHED_FOLDER = '/data/cache'

# Build the Stable Video Diffusion pipeline once, at import time, using the
# half-precision ("fp16") weight variant, then move it to the CUDA device.
# `sample` below reuses this single module-level instance for every request.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "multimodalart/stable-video-diffusion",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")
# Optional torch.compile of the UNet / VAE, currently left disabled:
#pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
#pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

# Largest signed 64-bit integer. Used as the upper bound both for randomly
# drawn seeds and for the seed slider in the UI.
max_64_bit_int = (1 << 63) - 1

@spaces.GPU(duration=120)
def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
    progress=gr.Progress(track_tqdm=True)
):
    """
    Generate a video from a single image using Stable Video Diffusion.

    The module-level ``pipe`` is called with ``num_frames=25`` and
    ``noise_aug_strength=0.1`` (both fixed here), and the resulting frames are
    written to a uniquely named ``.mp4`` file inside ``output_folder``.

    Args:
        image: Input PIL Image to generate video from. RGBA images are
            converted to RGB before being passed to the pipeline.
        seed: Random seed for generation reproducibility. Ignored when
            ``randomize_seed`` is true; if ``None``, a random seed is drawn.
        randomize_seed: Whether to replace ``seed`` with a freshly drawn one
        motion_bucket_id: Controls the amount of motion in the generated video (1-255)
        fps_id: Frames per second for the output video
        version: Model version label. Accepted for compatibility; not used
            by this function.
        cond_aug: Conditioning augmentation strength. Accepted for
            compatibility; not used by this function (the pipeline is called
            with a fixed ``noise_aug_strength=0.1``).
        decoding_t: Number of frames decoded at a time (affects VRAM usage)
        device: Device name. Accepted for compatibility; not used by this
            function (the pipeline is placed on its device at import time).
        output_folder: Directory to save the output video (created if missing)
        progress: Gradio progress tracker

    Returns:
        tuple: (video_path, seed) - Path to the generated video and the seed used

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without this guard a missing upload surfaces as an opaque
        # AttributeError on `image.mode`; report it to the user instead.
        raise gr.Error("Please upload an image before generating a video.")

    if image.mode == "RGBA":
        image = image.convert("RGB")

    # `seed` is Optional: fall back to a random one rather than handing None
    # to torch.manual_seed, which would raise.
    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    # Use a random unique name. Numbering by "count of existing .mp4 files"
    # can hand the same path to two concurrent requests, or reuse an existing
    # name after files are deleted, silently overwriting another video.
    video_path = os.path.join(output_folder, f"{uuid.uuid4().hex}.mp4")

    frames = pipe(
        image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=25,
    ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)

    return video_path, seed

def resize_image(image, output_size=(1024, 576)):
    """
    Scale an image so it fully covers ``output_size``, then centre-crop it.

    The image is resized with its aspect ratio preserved so that one
    dimension matches the target exactly and the other is at least as large;
    the overflow on that other dimension is then trimmed equally from both
    sides.

    Args:
        image: PIL Image to resize
        output_size: Target size as (width, height) tuple

    Returns:
        PIL.Image: Resized and cropped image
    """
    want_w = output_size[0]
    want_h = output_size[1]
    want_ratio = want_w / want_h
    have_ratio = image.width / image.height

    if have_ratio > want_ratio:
        # Source is relatively wider than the target: match the height and
        # trim the surplus width evenly from the left and right edges.
        scaled_w = int(want_h * have_ratio)
        scaled_h = want_h
        crop_box = ((scaled_w - want_w) / 2, 0, (scaled_w + want_w) / 2, want_h)
    else:
        # Source is relatively taller (or the same shape): match the width
        # and trim the surplus height evenly from the top and bottom edges.
        scaled_w = want_w
        scaled_h = int(want_w / have_ratio)
        crop_box = (0, (scaled_h - want_h) / 2, want_w, (scaled_h + want_h) / 2)

    scaled = image.resize((scaled_w, scaled_h), Image.LANCZOS)
    return scaled.crop(crop_box)

# Declarative UI layout. Components are created in display order; the event
# wiring and the example gallery follow the layout.
with gr.Blocks() as demo:
    gr.Markdown('''# Community demo for Stable Video Diffusion - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [stability's ui waitlist](https://stability.ai/contact))
#### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate `4s` vid from a single image at (`25 frames` at `6 fps`). this demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
  ''')

    # Top row: input image with the trigger button, next to the output video.
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()

    # Collapsed panel holding the generation parameters forwarded to `sample`.
    with gr.Accordion("Advanced options", open=False):
        seed = gr.Slider(
            label="Seed",
            value=42,
            randomize=True,
            minimum=0,
            maximum=max_64_bit_int,
            step=1,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        motion_bucket_id = gr.Slider(
            label="Motion bucket id",
            info="Controls how much motion to add/remove from the image",
            value=127,
            minimum=1,
            maximum=255,
        )
        fps_id = gr.Slider(
            label="Frames per second",
            info="The length of your video in seconds will be 25/fps",
            value=6,
            minimum=5,
            maximum=30,
        )

    # An uploaded image is replaced in place by its resized/cropped version.
    image.upload(
        fn=resize_image,
        inputs=image,
        outputs=image,
        queue=False,
    )
    # Generation writes the video and echoes the seed actually used back into
    # the seed slider.
    generate_btn.click(
        fn=sample,
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
        outputs=[video, seed],
        api_name="video",
    )
    # Example images; only the image is supplied, so `sample` runs with its
    # default values for every other parameter.
    gr.Examples(
        examples=[
            "images/blink_meme.png",
            "images/confused2_meme.png",
            "images/disaster_meme.png",
            "images/distracted_meme.png",
            "images/hide_meme.png",
            "images/nazare_meme.png",
            "images/success_meme.png",
            "images/willy_meme.png",
            "images/wink_meme.png",
        ],
        inputs=image,
        outputs=[video, seed],
        fn=sample,
        cache_examples="lazy",
    )

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # Explicit queue configuration is currently disabled:
    #demo.queue(max_size=20, api_open=False)
    demo.launch(
        share=True,
        show_api=False,
        mcp_server=True,
    )