ginipick committed
Commit b66d096 · verified · 1 Parent(s): 5163a74

Update app.py

Files changed (1)
  1. app.py +389 -367
app.py CHANGED
@@ -16,18 +16,40 @@ import gradio as gr
 import tempfile
 from huggingface_hub import hf_hub_download
 
 from src.pipeline_wan_nag import NAGWanPipeline
 from src.transformer_wan_nag import NagWanTransformer3DModel
 
 # MMAudio imports
 try:
-    import mmaudio
 except ImportError:
-    os.system("pip install -e .")
-    import mmaudio
 
 from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate as mmaudio_generate,
-                                load_video, make_video, setup_eval_logging)
 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
@@ -72,421 +94,421 @@ setup_eval_logging()
 
 # Initialize NAG Video Model
 try:
-    vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
-    wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
-    transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
-    pipe = NAGWanPipeline.from_pretrained(
-        MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
-    )
-    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
-    pipe.to("cuda")
-
-    pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
-    pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
-    pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
-    print("NAG Video Model loaded successfully!")
 except Exception as e:
-    print(f"Error loading NAG Video Model: {e}")
-    pipe = None
 
 # Initialize MMAudio Model
 def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-    seq_cfg = audio_model_config.seq_cfg
-
-    net: MMAudio = get_my_mmaudio(audio_model_config.model_name).to(device, dtype).eval()
-    net.load_weights(torch.load(audio_model_config.model_path, map_location=device, weights_only=True))
-    log.info(f'Loaded MMAudio weights from {audio_model_config.model_path}')
-
-    feature_utils = FeaturesUtils(tod_vae_ckpt=audio_model_config.vae_path,
-                                  synchformer_ckpt=audio_model_config.synchformer_ckpt,
-                                  enable_conditions=True,
-                                  mode=audio_model_config.mode,
-                                  bigvgan_vocoder_ckpt=audio_model_config.bigvgan_16k_path,
-                                  need_vae_encoder=False)
-    feature_utils = feature_utils.to(device, dtype).eval()
-
-    return net, feature_utils, seq_cfg
 
 try:
-    audio_net, audio_feature_utils, audio_seq_cfg = get_mmaudio_model()
-    print("MMAudio Model loaded successfully!")
 except Exception as e:
-    print(f"Error loading MMAudio Model: {e}")
-    audio_net = None
 
 # Audio generation function
 @torch.inference_mode()
 def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
-    """Generate and add audio to video using MMAudio"""
-    if audio_net is None:
-        print("MMAudio model not loaded, returning video without audio")
-        return video_path
-
-    try:
-        rng = torch.Generator(device=device)
-        rng.seed()  # Random seed for audio
-        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
-
-        video_info = load_video(video_path, duration)
-        clip_frames = video_info.clip_frames
-        sync_frames = video_info.sync_frames
-        duration = video_info.duration_sec
-        clip_frames = clip_frames.unsqueeze(0)
-        sync_frames = sync_frames.unsqueeze(0)
-        audio_seq_cfg.duration = duration
-        audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
-
-        audios = mmaudio_generate(clip_frames,
-                                  sync_frames, [prompt],
-                                  negative_text=[audio_negative_prompt],
-                                  feature_utils=audio_feature_utils,
-                                  net=audio_net,
-                                  fm=fm,
-                                  rng=rng,
-                                  cfg_strength=audio_cfg_strength)
-        audio = audios.float().cpu()[0]
-
-        # Create video with audio
-        video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-        make_video(video_info, video_with_audio_path, audio, sampling_rate=audio_seq_cfg.sampling_rate)
-
-        return video_with_audio_path
-    except Exception as e:
-        print(f"Error in audio generation: {e}")
-        return video_path
 
 # Combined generation function
 def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
-                 steps, seed, randomize_seed, enable_audio, audio_negative_prompt,
-                 audio_steps, audio_cfg_strength):
-    # Calculate total duration including audio processing if enabled
-    video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
-    audio_duration = 30 if enable_audio else 0  # Additional time for audio processing
-    return video_duration + audio_duration
 
 @spaces.GPU(duration=get_duration)
 def generate_video_with_audio(
-    prompt,
-    nag_negative_prompt, nag_scale,
-    height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
-    steps=DEFAULT_STEPS,
-    seed=DEFAULT_SEED, randomize_seed=False,
-    enable_audio=True, audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
-    audio_steps=25, audio_cfg_strength=4.5,
 ):
-    if pipe is None:
-        return None, DEFAULT_SEED
-
-    try:
-        # Generate video first
-        target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
-        target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
-
-        num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
-
-        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-
-        print(f"Generating video with: prompt='{prompt}', resolution={target_w}x{target_h}, frames={num_frames}")
-
-        with torch.inference_mode():
-            nag_output_frames_list = pipe(
-                prompt=prompt,
-                nag_negative_prompt=nag_negative_prompt,
-                nag_scale=nag_scale,
-                nag_tau=3.5,
-                nag_alpha=0.5,
-                height=target_h, width=target_w, num_frames=num_frames,
-                guidance_scale=0.,
-                num_inference_steps=int(steps),
-                generator=torch.Generator(device="cuda").manual_seed(current_seed)
-            ).frames[0]
-
-        # Save initial video without audio
-        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
-            temp_video_path = tmpfile.name
-        export_to_video(nag_output_frames_list, temp_video_path, fps=FIXED_FPS)
-        print(f"Video saved to: {temp_video_path}")
-
-        # Add audio if enabled
-        if enable_audio:
-            try:
-                print("Adding audio to video...")
-                final_video_path = add_audio_to_video(
-                    temp_video_path,
-                    prompt,  # Use the same prompt for audio generation
-                    audio_negative_prompt,
-                    audio_steps,
-                    audio_cfg_strength,
-                    duration_seconds
-                )
-                # Clean up temp video
-                if os.path.exists(temp_video_path) and final_video_path != temp_video_path:
-                    os.remove(temp_video_path)
-                print(f"Final video with audio: {final_video_path}")
-            except Exception as e:
-                log.error(f"Audio generation failed: {e}")
-                final_video_path = temp_video_path
-        else:
-            final_video_path = temp_video_path
-
-        return final_video_path, current_seed
-    except Exception as e:
-        print(f"Error in video generation: {e}")
-        return None, current_seed
 
 # Example generation function - simplified
 def set_example(prompt, nag_negative_prompt, nag_scale):
-    """Set example values in the UI without triggering generation"""
-    return (
-        prompt,
-        nag_negative_prompt,
-        nag_scale,
-        DEFAULT_H_SLIDER_VALUE,
-        DEFAULT_W_SLIDER_VALUE,
-        DEFAULT_DURATION_SECONDS,
-        DEFAULT_STEPS,
-        DEFAULT_SEED,
-        True,  # randomize_seed
-        True,  # enable_audio
-        DEFAULT_AUDIO_NEGATIVE_PROMPT,
-        25,   # audio_steps
-        4.5   # audio_cfg_strength
-    )
 
 # Examples with audio descriptions
 examples = [
-    ["Midnight highway outside a neon-lit city. A black 1973 Porsche 911 Carrera RS speeds at 120 km/h. Inside, a stylish singer-guitarist sings while driving, vintage sunburst guitar on the passenger seat. Sodium streetlights streak over the hood; RGB panels shift magenta to blue on the driver. Camera: drone dive, Russian-arm low wheel shot, interior gimbal, FPV barrel roll, overhead spiral. Neo-noir palette, rain-slick asphalt reflections, roaring flat-six engine blended with live guitar.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
-    ["Arena rock concert packed with 20 000 fans. A flamboyant lead guitarist in leather jacket and mirrored aviators shreds a cherry-red Flying V on a thrust stage. Pyro flames shoot up on every downbeat, CO₂ jets burst behind. Moving-head spotlights swirl teal and amber, follow-spots rim-light the guitarist's hair. Steadicam 360-orbit, crane shot rising over crowd, ultra-slow-motion pick attack at 1 000 fps. Film-grain teal-orange grade, thunderous crowd roar mixes with screaming guitar solo.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
-    ["Golden-hour countryside road winding through rolling wheat fields. A man and woman ride a vintage café-racer motorcycle, hair and scarf fluttering in the warm breeze. Drone chase shot reveals endless patchwork farmland; low slider along rear wheel captures dust trail. Sun-flare back-lights the riders, lens blooms on highlights. Soft acoustic rock underscore; engine rumble mixed at –8 dB. Warm pastel color grade, gentle film-grain for nostalgic vibe.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
 ]
 
 # CSS styling - Fixed for better layout
 css = """
 /* Right column - video output */
 .video-output {
-    border-radius: 15px;
-    overflow: hidden;
-    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
-    width: 100% !important;
-    height: auto !important;
-    min-height: 400px;
 }
 
 /* Ensure video container is responsive */
 .video-output video {
-    width: 100% !important;
-    height: auto !important;
-    max-height: 600px;
-    object-fit: contain;
-    display: block;
 }
 
 /* Remove any overlay or background from video container */
 .video-output > div {
-    background: transparent !important;
-    padding: 0 !important;
 }
 
 /* Remove gradio's default video player overlay */
 .video-output .wrap {
-    background: transparent !important;
 }
 
 /* Ensure no gray overlay on video controls */
 .video-output video::-webkit-media-controls-enclosure {
-    background: transparent;
 }
 """
 
 # Gradio interface - Fixed structure
 with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
-    gr.HTML("""
-    <div class="container">
-        <h1 class="main-title">🎬 VEO3 Free</h1>
-        <p class="subtitle">Wan2.1-T2V-14B + Fast 4-step with NAG + Automatic Audio Generation</p>
-    </div>
-    """)
-
-    gr.HTML("""
-    <div class='container' style='display:flex; justify-content:center; gap:12px; margin-bottom: 20px;'>
-        <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
-            <img src="https://img.shields.io/static/v1?label=OpenFree&message=BEST%20AI%20Services&color=%230000ff&labelColor=%23000080&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="OpenFree badge">
-        </a>
 
-        <a href="https://discord.gg/openfreeai" target="_blank">
-            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="Discord badge">
-        </a>
-    </div>
-    """)
-
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=5):
-            with gr.Group(elem_classes="prompt-container"):
-                prompt = gr.Textbox(
-                    label="✨ Video Prompt (also used for audio generation)",
-                    placeholder="Describe your video scene in detail...",
-                    lines=3,
-                    elem_classes="prompt-input"
-                )
-
-            with gr.Accordion("🎨 Advanced Video Settings", open=False):
-                nag_negative_prompt = gr.Textbox(
-                    label="Video Negative Prompt",
-                    value=DEFAULT_NAG_NEGATIVE_PROMPT,
-                    lines=2,
-                )
-                nag_scale = gr.Slider(
-                    label="NAG Scale",
-                    minimum=1.0,
-                    maximum=20.0,
-                    step=0.25,
-                    value=11.0,
-                    info="Higher values = stronger guidance"
-                )
-
-            with gr.Group(elem_classes="settings-panel"):
-                gr.Markdown("### ⚙️ Video Settings")
-
-                with gr.Row():
-                    duration_seconds_input = gr.Slider(
-                        minimum=1,
-                        maximum=8,
-                        step=1,
-                        value=DEFAULT_DURATION_SECONDS,
-                        label="📱 Duration (seconds)",
-                        elem_classes="slider-container"
-                    )
-                    steps_slider = gr.Slider(
-                        minimum=1,
-                        maximum=8,
-                        step=1,
-                        value=DEFAULT_STEPS,
-                        label="🔄 Inference Steps",
-                        elem_classes="slider-container"
-                    )
-
-                with gr.Row():
-                    height_input = gr.Slider(
-                        minimum=SLIDER_MIN_H,
-                        maximum=SLIDER_MAX_H,
-                        step=MOD_VALUE,
-                        value=DEFAULT_H_SLIDER_VALUE,
-                        label=f"📏 Height (×{MOD_VALUE})",
-                        elem_classes="slider-container"
-                    )
-                    width_input = gr.Slider(
-                        minimum=SLIDER_MIN_W,
-                        maximum=SLIDER_MAX_W,
-                        step=MOD_VALUE,
-                        value=DEFAULT_W_SLIDER_VALUE,
-                        label=f"📏 Width (×{MOD_VALUE})",
-                        elem_classes="slider-container"
-                    )
-
-                with gr.Row():
-                    seed_input = gr.Slider(
-                        label="🌱 Seed",
-                        minimum=0,
-                        maximum=MAX_SEED,
-                        step=1,
-                        value=DEFAULT_SEED,
-                        interactive=True
-                    )
-                    randomize_seed_checkbox = gr.Checkbox(
-                        label="🎲 Random Seed",
-                        value=True,
-                        interactive=True
-                    )
-
-            with gr.Group(elem_classes="audio-settings"):
-                gr.Markdown("### 🎵 Audio Generation Settings")
-
-                enable_audio = gr.Checkbox(
-                    label="🔊 Enable Automatic Audio Generation",
-                    value=True,
-                    interactive=True
-                )
-
-                with gr.Column(visible=True) as audio_settings_group:
-                    audio_negative_prompt = gr.Textbox(
-                        label="Audio Negative Prompt",
-                        value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
-                        placeholder="Elements to avoid in audio (e.g., music, speech)",
-                    )
-
-                    with gr.Row():
-                        audio_steps = gr.Slider(
-                            minimum=10,
-                            maximum=50,
-                            step=5,
-                            value=25,
-                            label="🎚️ Audio Steps",
-                            info="More steps = better quality"
-                        )
-                        audio_cfg_strength = gr.Slider(
-                            minimum=1.0,
-                            maximum=10.0,
-                            step=0.5,
-                            value=4.5,
-                            label="🎛️ Audio Guidance",
-                            info="Strength of prompt guidance"
-                        )
-
-            # Toggle audio settings visibility
-            enable_audio.change(
-                fn=lambda x: gr.update(visible=x),
-                inputs=[enable_audio],
-                outputs=[audio_settings_group]
-            )
-
-            generate_button = gr.Button(
-                "🎬 Generate Video with Audio",
-                variant="primary",
-                elem_classes="generate-btn"
-            )
-
-        with gr.Column(scale=5):
-            video_output = gr.Video(
-                label="Generated Video with Audio",
-                autoplay=True,
-                interactive=False,
-                elem_classes="video-output",
-                height=600
-            )
-
-            gr.HTML("""
-            <div style="text-align: center; margin-top: 20px; color: #6b7280;">
-                <p>💡 Tip: The same prompt is used for both video and audio generation!</p>
-                <p>🎧 Audio is automatically matched to the visual content</p>
-            </div>
-            """)
-
-    # Examples section moved outside of columns
-    with gr.Row():
-        gr.Markdown("### 🎯 Example Prompts")
-
-    gr.Examples(
-        examples=examples,
-        inputs=[prompt, nag_negative_prompt, nag_scale],
-        outputs=None,  # Don't connect outputs to avoid index issues
-        cache_examples=False
-    )
-
-    # Connect UI elements
-    ui_inputs = [
-        prompt,
-        nag_negative_prompt, nag_scale,
-        height_input, width_input, duration_seconds_input,
-        steps_slider,
-        seed_input, randomize_seed_checkbox,
-        enable_audio, audio_negative_prompt, audio_steps, audio_cfg_strength,
-    ]
-
-    generate_button.click(
-        fn=generate_video_with_audio,
-        inputs=ui_inputs,
-        outputs=[video_output, seed_input],
-    )
 
 if __name__ == "__main__":
-    demo.queue().launch()
 
@@ -16,18 +16,40 @@ import gradio as gr
 import tempfile
 from huggingface_hub import hf_hub_download
 
+# Patch for scaled_dot_product_attention to fix enable_gqa issue
+import torch.nn.functional as F
+
+original_sdpa = F.scaled_dot_product_attention
+
+def patched_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None, enable_gqa=None):
+    # Ignore the enable_gqa parameter and pass through only the remaining parameters
+    kwargs = {}
+    if attn_mask is not None:
+        kwargs['attn_mask'] = attn_mask
+    if dropout_p != 0.0:
+        kwargs['dropout_p'] = dropout_p
+    if is_causal:
+        kwargs['is_causal'] = is_causal
+    if scale is not None:
+        kwargs['scale'] = scale
+
+    return original_sdpa(query, key, value, **kwargs)
+
+# Apply the patch
+F.scaled_dot_product_attention = patched_scaled_dot_product_attention
+
 from src.pipeline_wan_nag import NAGWanPipeline
 from src.transformer_wan_nag import NagWanTransformer3DModel
 
 # MMAudio imports
 try:
+    import mmaudio
 except ImportError:
+    os.system("pip install -e .")
+    import mmaudio
 
 from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate as mmaudio_generate,
+                                load_video, make_video, setup_eval_logging)
 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 
@@ -72,421 +94,421 @@ setup_eval_logging()
 
 # Initialize NAG Video Model
 try:
+    vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
+    wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
+    transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
+    pipe = NAGWanPipeline.from_pretrained(
+        MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
+    )
+    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
+    pipe.to("cuda")
+
+    pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
+    pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
+    pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
+    print("NAG Video Model loaded successfully!")
 except Exception as e:
+    print(f"Error loading NAG Video Model: {e}")
+    pipe = None
 
 # Initialize MMAudio Model
 def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
+    seq_cfg = audio_model_config.seq_cfg
+
+    net: MMAudio = get_my_mmaudio(audio_model_config.model_name).to(device, dtype).eval()
+    net.load_weights(torch.load(audio_model_config.model_path, map_location=device, weights_only=True))
+    log.info(f'Loaded MMAudio weights from {audio_model_config.model_path}')
+
+    feature_utils = FeaturesUtils(tod_vae_ckpt=audio_model_config.vae_path,
+                                  synchformer_ckpt=audio_model_config.synchformer_ckpt,
+                                  enable_conditions=True,
+                                  mode=audio_model_config.mode,
+                                  bigvgan_vocoder_ckpt=audio_model_config.bigvgan_16k_path,
+                                  need_vae_encoder=False)
+    feature_utils = feature_utils.to(device, dtype).eval()
+
+    return net, feature_utils, seq_cfg
 
 try:
+    audio_net, audio_feature_utils, audio_seq_cfg = get_mmaudio_model()
+    print("MMAudio Model loaded successfully!")
 except Exception as e:
+    print(f"Error loading MMAudio Model: {e}")
+    audio_net = None
 
 # Audio generation function
 @torch.inference_mode()
 def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
+    """Generate and add audio to video using MMAudio"""
+    if audio_net is None:
+        print("MMAudio model not loaded, returning video without audio")
+        return video_path
+
+    try:
+        rng = torch.Generator(device=device)
+        rng.seed()  # Random seed for audio
+        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
+
+        video_info = load_video(video_path, duration)
+        clip_frames = video_info.clip_frames
+        sync_frames = video_info.sync_frames
+        duration = video_info.duration_sec
+        clip_frames = clip_frames.unsqueeze(0)
+        sync_frames = sync_frames.unsqueeze(0)
+        audio_seq_cfg.duration = duration
+        audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
+
+        audios = mmaudio_generate(clip_frames,
+                                  sync_frames, [prompt],
+                                  negative_text=[audio_negative_prompt],
+                                  feature_utils=audio_feature_utils,
+                                  net=audio_net,
+                                  fm=fm,
+                                  rng=rng,
+                                  cfg_strength=audio_cfg_strength)
+        audio = audios.float().cpu()[0]
+
+        # Create video with audio
+        video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+        make_video(video_info, video_with_audio_path, audio, sampling_rate=audio_seq_cfg.sampling_rate)
+
+        return video_with_audio_path
+    except Exception as e:
+        print(f"Error in audio generation: {e}")
+        return video_path
 
 # Combined generation function
 def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
+                 steps, seed, randomize_seed, enable_audio, audio_negative_prompt,
+                 audio_steps, audio_cfg_strength):
+    # Calculate total duration including audio processing if enabled
+    video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
+    audio_duration = 30 if enable_audio else 0  # Additional time for audio processing
+    return video_duration + audio_duration
 
 @spaces.GPU(duration=get_duration)
 def generate_video_with_audio(
+    prompt,
+    nag_negative_prompt, nag_scale,
+    height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
+    steps=DEFAULT_STEPS,
+    seed=DEFAULT_SEED, randomize_seed=False,
+    enable_audio=True, audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
+    audio_steps=25, audio_cfg_strength=4.5,
 ):
+    if pipe is None:
+        return None, DEFAULT_SEED
+
+    try:
+        # Generate video first
+        target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
+        target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
+
+        num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
+
+        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+
+        print(f"Generating video with: prompt='{prompt}', resolution={target_w}x{target_h}, frames={num_frames}")
+
+        with torch.inference_mode():
+            nag_output_frames_list = pipe(
+                prompt=prompt,
+                nag_negative_prompt=nag_negative_prompt,
+                nag_scale=nag_scale,
+                nag_tau=3.5,
+                nag_alpha=0.5,
+                height=target_h, width=target_w, num_frames=num_frames,
+                guidance_scale=0.,
+                num_inference_steps=int(steps),
+                generator=torch.Generator(device="cuda").manual_seed(current_seed)
+            ).frames[0]
+
+        # Save initial video without audio
+        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+            temp_video_path = tmpfile.name
+        export_to_video(nag_output_frames_list, temp_video_path, fps=FIXED_FPS)
+        print(f"Video saved to: {temp_video_path}")
+
+        # Add audio if enabled
+        if enable_audio:
+            try:
+                print("Adding audio to video...")
+                final_video_path = add_audio_to_video(
+                    temp_video_path,
+                    prompt,  # Use the same prompt for audio generation
+                    audio_negative_prompt,
+                    audio_steps,
+                    audio_cfg_strength,
+                    duration_seconds
+                )
+                # Clean up temp video
+                if os.path.exists(temp_video_path) and final_video_path != temp_video_path:
+                    os.remove(temp_video_path)
+                print(f"Final video with audio: {final_video_path}")
+            except Exception as e:
+                log.error(f"Audio generation failed: {e}")
+                final_video_path = temp_video_path
+        else:
+            final_video_path = temp_video_path
+
+        return final_video_path, current_seed
+    except Exception as e:
+        print(f"Error in video generation: {e}")
+        return None, current_seed
 
 # Example generation function - simplified
 def set_example(prompt, nag_negative_prompt, nag_scale):
+    """Set example values in the UI without triggering generation"""
+    return (
+        prompt,
+        nag_negative_prompt,
+        nag_scale,
+        DEFAULT_H_SLIDER_VALUE,
+        DEFAULT_W_SLIDER_VALUE,
+        DEFAULT_DURATION_SECONDS,
+        DEFAULT_STEPS,
+        DEFAULT_SEED,
+        True,  # randomize_seed
+        True,  # enable_audio
+        DEFAULT_AUDIO_NEGATIVE_PROMPT,
+        25,   # audio_steps
+        4.5   # audio_cfg_strength
+    )
 
 # Examples with audio descriptions
 examples = [
+    ["Midnight highway outside a neon-lit city. A black 1973 Porsche 911 Carrera RS speeds at 120 km/h. Inside, a stylish singer-guitarist sings while driving, vintage sunburst guitar on the passenger seat. Sodium streetlights streak over the hood; RGB panels shift magenta to blue on the driver. Camera: drone dive, Russian-arm low wheel shot, interior gimbal, FPV barrel roll, overhead spiral. Neo-noir palette, rain-slick asphalt reflections, roaring flat-six engine blended with live guitar.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
+    ["Arena rock concert packed with 20 000 fans. A flamboyant lead guitarist in leather jacket and mirrored aviators shreds a cherry-red Flying V on a thrust stage. Pyro flames shoot up on every downbeat, CO₂ jets burst behind. Moving-head spotlights swirl teal and amber, follow-spots rim-light the guitarist's hair. Steadicam 360-orbit, crane shot rising over crowd, ultra-slow-motion pick attack at 1 000 fps. Film-grain teal-orange grade, thunderous crowd roar mixes with screaming guitar solo.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
+    ["Golden-hour countryside road winding through rolling wheat fields. A man and woman ride a vintage café-racer motorcycle, hair and scarf fluttering in the warm breeze. Drone chase shot reveals endless patchwork farmland; low slider along rear wheel captures dust trail. Sun-flare back-lights the riders, lens blooms on highlights. Soft acoustic rock underscore; engine rumble mixed at –8 dB. Warm pastel color grade, gentle film-grain for nostalgic vibe.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
 ]
 
 # CSS styling - Fixed for better layout
 css = """
 /* Right column - video output */
 .video-output {
+    border-radius: 15px;
+    overflow: hidden;
+    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
+    width: 100% !important;
+    height: auto !important;
+    min-height: 400px;
 }
 
 /* Ensure video container is responsive */
 .video-output video {
+    width: 100% !important;
+    height: auto !important;
+    max-height: 600px;
+    object-fit: contain;
+    display: block;
 }
 
 /* Remove any overlay or background from video container */
 .video-output > div {
+    background: transparent !important;
+    padding: 0 !important;
 }
 
 /* Remove gradio's default video player overlay */
 .video-output .wrap {
+    background: transparent !important;
 }
 
 /* Ensure no gray overlay on video controls */
 .video-output video::-webkit-media-controls-enclosure {
+    background: transparent;
 }
 """
 
 # Gradio interface - Fixed structure
 with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
+    gr.HTML("""
+    <div class="container">
+        <h1 class="main-title">🎬 VEO3 Free</h1>
+        <p class="subtitle">Wan2.1-T2V-14B + Fast 4-step with NAG + Automatic Audio Generation</p>
+    </div>
+    """)
+
+    gr.HTML("""
+    <div class='container' style='display:flex; justify-content:center; gap:12px; margin-bottom: 20px;'>
+        <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
+            <img src="https://img.shields.io/static/v1?label=OpenFree&message=BEST%20AI%20Services&color=%230000ff&labelColor=%23000080&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="OpenFree badge">
+        </a>
 
+        <a href="https://discord.gg/openfreeai" target="_blank">
+            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="Discord badge">
+        </a>
+    </div>
+    """)
+
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=5):
+            with gr.Group(elem_classes="prompt-container"):
+                prompt = gr.Textbox(
+                    label="✨ Video Prompt (also used for audio generation)",
+                    placeholder="Describe your video scene in detail...",
+                    lines=3,
+                    elem_classes="prompt-input"
+                )
+
+            with gr.Accordion("🎨 Advanced Video Settings", open=False):
+                nag_negative_prompt = gr.Textbox(
+                    label="Video Negative Prompt",
+                    value=DEFAULT_NAG_NEGATIVE_PROMPT,
+                    lines=2,
+                )
+                nag_scale = gr.Slider(
+                    label="NAG Scale",
+                    minimum=1.0,
+                    maximum=20.0,
+                    step=0.25,
+                    value=11.0,
+                    info="Higher values = stronger guidance"
+                )
+
+            with gr.Group(elem_classes="settings-panel"):
+                gr.Markdown("### ⚙️ Video Settings")
+
+                with gr.Row():
+                    duration_seconds_input = gr.Slider(
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=DEFAULT_DURATION_SECONDS,
+                        label="📱 Duration (seconds)",
+                        elem_classes="slider-container"
+                    )
+                    steps_slider = gr.Slider(
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=DEFAULT_STEPS,
+                        label="🔄 Inference Steps",
+                        elem_classes="slider-container"
+                    )
+
+                with gr.Row():
+                    height_input = gr.Slider(
+                        minimum=SLIDER_MIN_H,
+                        maximum=SLIDER_MAX_H,
+                        step=MOD_VALUE,
+                        value=DEFAULT_H_SLIDER_VALUE,
+                        label=f"📏 Height (×{MOD_VALUE})",
+                        elem_classes="slider-container"
+                    )
+                    width_input = gr.Slider(
+                        minimum=SLIDER_MIN_W,
+                        maximum=SLIDER_MAX_W,
+                        step=MOD_VALUE,
+                        value=DEFAULT_W_SLIDER_VALUE,
+                        label=f"📏 Width (×{MOD_VALUE})",
+                        elem_classes="slider-container"
+                    )
+
+                with gr.Row():
+                    seed_input = gr.Slider(
+                        label="🌱 Seed",
+                        minimum=0,
+                        maximum=MAX_SEED,
+                        step=1,
+                        value=DEFAULT_SEED,
+                        interactive=True
+                    )
+                    randomize_seed_checkbox = gr.Checkbox(
+                        label="🎲 Random Seed",
+                        value=True,
+                        interactive=True
+                    )
+
+            with gr.Group(elem_classes="audio-settings"):
+                gr.Markdown("### 🎵 Audio Generation Settings")
+
+                enable_audio = gr.Checkbox(
+                    label="🔊 Enable Automatic Audio Generation",
+                    value=True,
+                    interactive=True
+                )
+
+                with gr.Column(visible=True) as audio_settings_group:
+                    audio_negative_prompt = gr.Textbox(
+                        label="Audio Negative Prompt",
+                        value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
+                        placeholder="Elements to avoid in audio (e.g., music, speech)",
+                    )
+
+                    with gr.Row():
+                        audio_steps = gr.Slider(
+                            minimum=10,
+                            maximum=50,
+                            step=5,
+                            value=25,
+                            label="🎚️ Audio Steps",
+                            info="More steps = better quality"
+                        )
+                        audio_cfg_strength = gr.Slider(
+                            minimum=1.0,
+                            maximum=10.0,
+                            step=0.5,
+                            value=4.5,
+                            label="🎛️ Audio Guidance",
+                            info="Strength of prompt guidance"
+                        )
+
+            # Toggle audio settings visibility
+            enable_audio.change(
+                fn=lambda x: gr.update(visible=x),
+                inputs=[enable_audio],
+                outputs=[audio_settings_group]
+            )
+
+            generate_button = gr.Button(
+                "🎬 Generate Video with Audio",
+                variant="primary",
+                elem_classes="generate-btn"
+            )
+
+        with gr.Column(scale=5):
+            video_output = gr.Video(
+                label="Generated Video with Audio",
+                autoplay=True,
+                interactive=False,
+                elem_classes="video-output",
+                height=600
+            )
+
+            gr.HTML("""
+            <div style="text-align: center; margin-top: 20px; color: #6b7280;">
+                <p>💡 Tip: The same prompt is used for both video and audio generation!</p>
+                <p>🎧 Audio is automatically matched to the visual content</p>
+            </div>
+            """)
+
+    # Examples section moved outside of columns
+    with gr.Row():
+        gr.Markdown("### 🎯 Example Prompts")
+
+    gr.Examples(
+        examples=examples,
+        inputs=[prompt, nag_negative_prompt, nag_scale],
+        outputs=None,  # Don't connect outputs to avoid index issues
+        cache_examples=False
+    )
+
+    # Connect UI elements
+    ui_inputs = [
+        prompt,
+        nag_negative_prompt, nag_scale,
+        height_input, width_input, duration_seconds_input,
+        steps_slider,
+        seed_input, randomize_seed_checkbox,
+        enable_audio, audio_negative_prompt, audio_steps, audio_cfg_strength,
+    ]
+
+    generate_button.click(
+        fn=generate_video_with_audio,
+        inputs=ui_inputs,
+        outputs=[video_output, seed_input],
+    )
 
 if __name__ == "__main__":
+    demo.queue().launch()
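
The substantive change in this commit is the scaled_dot_product_attention shim added at the top of app.py: newer attention code paths can pass an enable_gqa keyword that older builds of torch.nn.functional.scaled_dot_product_attention do not accept, and the wrapper drops that keyword before delegating to the original function. A minimal sanity-check sketch of the same idea follows (the file name check_sdpa_patch.py, the patched_sdpa name, and the tensor shapes are illustrative assumptions, not part of the commit; assumes PyTorch >= 2.0):

# check_sdpa_patch.py -- illustrative sanity check, not part of the commit
import torch
import torch.nn.functional as F

original_sdpa = F.scaled_dot_product_attention

def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                 is_causal=False, scale=None, enable_gqa=None):
    # Same idea as the app.py shim: drop enable_gqa and forward only the
    # arguments that were actually supplied.
    kwargs = {}
    if attn_mask is not None:
        kwargs['attn_mask'] = attn_mask
    if dropout_p != 0.0:
        kwargs['dropout_p'] = dropout_p
    if is_causal:
        kwargs['is_causal'] = is_causal
    if scale is not None:
        kwargs['scale'] = scale
    return original_sdpa(query, key, value, **kwargs)

F.scaled_dot_product_attention = patched_sdpa

q = k = v = torch.randn(1, 2, 4, 8)  # (batch, heads, seq_len, head_dim)
out = F.scaled_dot_product_attention(q, k, v, enable_gqa=False)
print(out.shape)  # torch.Size([1, 2, 4, 8]); no TypeError even on builds whose SDPA lacks enable_gqa

Note the ordering in the diff: the shim is installed before the src.pipeline_wan_nag and src.transformer_wan_nag imports, which matters if those modules bind F.scaled_dot_product_attention at import time.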