ginipick committed on
Commit
340367d
·
verified ·
1 Parent(s): b66d096

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -16
app.py CHANGED
@@ -5,6 +5,7 @@ import logging
5
  import os
6
  from pathlib import Path
7
  from datetime import datetime
 
8
 
9
  import torch
10
  import numpy as np
@@ -15,6 +16,7 @@ from diffusers import AutoModel
15
  import gradio as gr
16
  import tempfile
17
  from huggingface_hub import hf_hub_download
 
18
 
19
  # Patch for scaled_dot_product_attention to fix enable_gqa issue
20
  import torch.nn.functional as F
@@ -73,7 +75,7 @@ MIN_FRAMES_MODEL = 8
73
  MAX_FRAMES_MODEL = 129
74
 
75
  DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
76
- DEFAULT_AUDIO_NEGATIVE_PROMPT = "music"
77
 
78
  # NAG Model Settings
79
  MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
@@ -136,17 +138,83 @@ except Exception as e:
136
  print(f"Error loading MMAudio Model: {e}")
137
  audio_net = None
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  # Audio generation function
140
  @torch.inference_mode()
141
- def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
142
  """Generate and add audio to video using MMAudio"""
143
  if audio_net is None:
144
  print("MMAudio model not loaded, returning video without audio")
145
  return video_path
146
 
147
  try:
 
 
 
 
 
 
 
 
 
148
  rng = torch.Generator(device=device)
149
- rng.seed() # Random seed for audio
150
  fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
151
 
152
  video_info = load_video(video_path, duration)
@@ -158,9 +226,12 @@ def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, a
158
  audio_seq_cfg.duration = duration
159
  audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
160
 
 
 
 
161
  audios = mmaudio_generate(clip_frames,
162
- sync_frames, [prompt],
163
- negative_text=[audio_negative_prompt],
164
  feature_utils=audio_feature_utils,
165
  net=audio_net,
166
  fm=fm,
@@ -175,12 +246,13 @@ def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, a
175
  return video_with_audio_path
176
  except Exception as e:
177
  print(f"Error in audio generation: {e}")
 
178
  return video_path
179
 
180
  # Combined generation function
181
  def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
182
- steps, seed, randomize_seed, enable_audio, audio_negative_prompt,
183
- audio_steps, audio_cfg_strength):
184
  # Calculate total duration including audio processing if enabled
185
  video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
186
  audio_duration = 30 if enable_audio else 0 # Additional time for audio processing
@@ -193,8 +265,9 @@ def generate_video_with_audio(
193
  height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
194
  steps=DEFAULT_STEPS,
195
  seed=DEFAULT_SEED, randomize_seed=False,
196
- enable_audio=True, audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
197
- audio_steps=25, audio_cfg_strength=4.5,
 
198
  ):
199
  if pipe is None:
200
  return None, DEFAULT_SEED
@@ -235,7 +308,8 @@ def generate_video_with_audio(
235
  print("Adding audio to video...")
236
  final_video_path = add_audio_to_video(
237
  temp_video_path,
238
- prompt, # Use the same prompt for audio generation
 
239
  audio_negative_prompt,
240
  audio_steps,
241
  audio_cfg_strength,
@@ -270,8 +344,9 @@ def set_example(prompt, nag_negative_prompt, nag_scale):
270
  DEFAULT_SEED,
271
  True, # randomize_seed
272
  True, # enable_audio
 
273
  DEFAULT_AUDIO_NEGATIVE_PROMPT,
274
- 25, # audio_steps
275
  4.5 # audio_cfg_strength
276
  )
277
 
@@ -430,10 +505,15 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
430
  )
431
 
432
  with gr.Column(visible=True) as audio_settings_group:
 
 
 
 
 
433
  audio_negative_prompt = gr.Textbox(
434
  label="Audio Negative Prompt",
435
  value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
436
- placeholder="Elements to avoid in audio (e.g., music, speech)",
437
  )
438
 
439
  with gr.Row():
@@ -441,7 +521,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
441
  minimum=10,
442
  maximum=50,
443
  step=5,
444
- value=25,
445
  label="๐ŸŽš๏ธ Audio Steps",
446
  info="More steps = better quality"
447
  )
@@ -478,8 +558,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
478
 
479
  gr.HTML("""
480
  <div style="text-align: center; margin-top: 20px; color: #6b7280;">
481
- <p>๐Ÿ’ก Tip: The same prompt is used for both video and audio generation!</p>
482
- <p>๐ŸŽง Audio is automatically matched to the visual content</p>
483
  </div>
484
  """)
485
 
@@ -501,7 +581,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
501
  height_input, width_input, duration_seconds_input,
502
  steps_slider,
503
  seed_input, randomize_seed_checkbox,
504
- enable_audio, audio_negative_prompt, audio_steps, audio_cfg_strength,
 
505
  ]
506
 
507
  generate_button.click(
 
5
  import os
6
  from pathlib import Path
7
  from datetime import datetime
8
+ import re
9
 
10
  import torch
11
  import numpy as np
 
16
  import gradio as gr
17
  import tempfile
18
  from huggingface_hub import hf_hub_download
19
+ import traceback
20
 
21
  # Patch for scaled_dot_product_attention to fix enable_gqa issue
22
  import torch.nn.functional as F
 
75
  MAX_FRAMES_MODEL = 129
76
 
77
  DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
78
+ DEFAULT_AUDIO_NEGATIVE_PROMPT = "music, speech, voice, singing, narration"
79
 
80
  # NAG Model Settings
81
  MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
 
138
  print(f"Error loading MMAudio Model: {e}")
139
  audio_net = None
140
 
141
+ # Function that converts a video prompt into an audio prompt
142
# Maps a keyword that may appear in a video prompt to the sound description
# requested from the audio model when that keyword is present. Insertion
# order is significant: it fixes the order of the joined result.
_AUDIO_KEYWORDS = {
    'car': 'car engine sound, vehicle noise',
    'porsche': 'sports car engine roar, exhaust sound',
    'guitar': 'electric guitar playing, guitar music',
    'concert': 'crowd cheering, live music, applause',
    'motorcycle': 'motorcycle engine sound, motor rumble',
    'highway': 'traffic noise, road ambience',
    'rain': 'rain sounds, water drops',
    'wind': 'wind blowing sound',
    'ocean': 'ocean waves, water sounds',
    'city': 'urban ambience, city traffic sounds',
    'singer': 'singing voice, vocals',
    'crowd': 'crowd noise, people talking',
    'flames': 'fire crackling sound',
    'pyro': 'fire whoosh, flame burst sound',
    'explosion': 'explosion sound, blast',
    'countryside': 'nature ambience, birds chirping',
    'wheat fields': 'wind through grass, rural ambience',
    'engine': 'motor sound, mechanical noise',
    'flat-six engine': 'sports car engine sound',
    'roaring': 'loud engine roar',
    'thunderous': 'loud booming sound',
    'child': 'children playing sounds',
    'running': 'footsteps sound',
    'woman': 'ambient sounds',
    'phone': 'subtle electronic ambience',
    'advertisement': 'modern ambient sounds',
}

# One pre-compiled pattern per keyword. The leading word boundary stops a
# keyword from firing in the middle of an unrelated word ("train" -> rain,
# "microphone" -> phone, "velocity" -> city). There is deliberately no
# trailing boundary, so plurals and derived forms ("cars", "children",
# "pyrotechnics") still match as they did with plain substring search.
_AUDIO_KEYWORD_PATTERNS = tuple(
    (re.compile(r'\b' + re.escape(keyword)), description)
    for keyword, description in _AUDIO_KEYWORDS.items()
)

# Captures each sentence (text between periods) that mentions a sound word.
_AUDIO_SENTENCE_PATTERN = re.compile(
    r'([^.]*(?:sound|audio|noise|music|voice|roar|rumble)[^.]*)'
)

# Returned when nothing audio-related can be derived from the prompt.
_DEFAULT_AUDIO_DESCRIPTION = "ambient environmental sounds matching the scene"


def extract_audio_description(video_prompt):
    """Derive an audio prompt from a video prompt.

    The prompt is lower-cased and scanned for known keywords; the sound
    descriptions of every keyword found are joined with ``', '`` in table
    order. If no keyword is found but the prompt explicitly mentions
    "sound", "audio" or "noise", the sentences containing a sound word are
    returned instead (lower-cased, whitespace-trimmed). Otherwise a generic
    ambient description is returned.

    Args:
        video_prompt: Text prompt used for video generation. ``None``,
            non-string values and blank strings are tolerated.

    Returns:
        A non-empty string describing the audio to generate.
    """
    # Guard against missing or blank prompts instead of failing on .lower().
    if not isinstance(video_prompt, str) or not video_prompt.strip():
        return _DEFAULT_AUDIO_DESCRIPTION

    lower_prompt = video_prompt.lower()

    # Simple keyword-based conversion.
    audio_descriptions = [
        description
        for pattern, description in _AUDIO_KEYWORD_PATTERNS
        if pattern.search(lower_prompt)
    ]
    if audio_descriptions:
        return ', '.join(audio_descriptions)

    # No keyword hit: check whether the prompt carries an explicit audio
    # description and, if so, keep only the audio-related sentences.
    if 'sound' in lower_prompt or 'audio' in lower_prompt or 'noise' in lower_prompt:
        sentences = [
            sentence.strip()
            for sentence in _AUDIO_SENTENCE_PATTERN.findall(lower_prompt)
        ]
        sentences = [sentence for sentence in sentences if sentence]
        if sentences:
            return ', '.join(sentences)

    # Default ambient sound.
    return _DEFAULT_AUDIO_DESCRIPTION
197
+
198
  # Audio generation function
199
  @torch.inference_mode()
200
+ def add_audio_to_video(video_path, prompt, audio_custom_prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
201
  """Generate and add audio to video using MMAudio"""
202
  if audio_net is None:
203
  print("MMAudio model not loaded, returning video without audio")
204
  return video_path
205
 
206
  try:
207
+ # Use the custom audio prompt if provided; otherwise extract one from the video prompt
208
+ if audio_custom_prompt and audio_custom_prompt.strip():
209
+ audio_prompt = audio_custom_prompt.strip()
210
+ else:
211
+ audio_prompt = extract_audio_description(prompt)
212
+
213
+ print(f"Original prompt: {prompt}")
214
+ print(f"Audio prompt: {audio_prompt}")
215
+
216
  rng = torch.Generator(device=device)
217
+ rng.manual_seed(random.randint(0, 2**32 - 1)) # ๋” ๋ช…ํ™•ํ•œ ๋žœ๋ค ์‹œ๋“œ
218
  fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
219
 
220
  video_info = load_video(video_path, duration)
 
226
  audio_seq_cfg.duration = duration
227
  audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
228
 
229
+ # Enhanced negative prompt
230
+ enhanced_negative = f"{audio_negative_prompt}, distortion, static noise, silence, random beeps"
231
+
232
  audios = mmaudio_generate(clip_frames,
233
+ sync_frames, [audio_prompt], # ๋ณ€ํ™˜๋œ ์˜ค๋””์˜ค ํ”„๋กฌํ”„ํŠธ ์‚ฌ์šฉ
234
+ negative_text=[enhanced_negative],
235
  feature_utils=audio_feature_utils,
236
  net=audio_net,
237
  fm=fm,
 
246
  return video_with_audio_path
247
  except Exception as e:
248
  print(f"Error in audio generation: {e}")
249
+ traceback.print_exc()
250
  return video_path
251
 
252
  # Combined generation function
253
  def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
254
+ steps, seed, randomize_seed, enable_audio, audio_custom_prompt,
255
+ audio_negative_prompt, audio_steps, audio_cfg_strength):
256
  # Calculate total duration including audio processing if enabled
257
  video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
258
  audio_duration = 30 if enable_audio else 0 # Additional time for audio processing
 
265
  height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
266
  steps=DEFAULT_STEPS,
267
  seed=DEFAULT_SEED, randomize_seed=False,
268
+ enable_audio=True, audio_custom_prompt="",
269
+ audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
270
+ audio_steps=30, audio_cfg_strength=4.5,
271
  ):
272
  if pipe is None:
273
  return None, DEFAULT_SEED
 
308
  print("Adding audio to video...")
309
  final_video_path = add_audio_to_video(
310
  temp_video_path,
311
+ prompt,
312
+ audio_custom_prompt,
313
  audio_negative_prompt,
314
  audio_steps,
315
  audio_cfg_strength,
 
344
  DEFAULT_SEED,
345
  True, # randomize_seed
346
  True, # enable_audio
347
+ "", # audio_custom_prompt
348
  DEFAULT_AUDIO_NEGATIVE_PROMPT,
349
+ 30, # audio_steps
350
  4.5 # audio_cfg_strength
351
  )
352
 
 
505
  )
506
 
507
  with gr.Column(visible=True) as audio_settings_group:
508
+ audio_custom_prompt = gr.Textbox(
509
+ label="Custom Audio Prompt (Optional)",
510
+ placeholder="Leave empty to auto-generate from video prompt, or specify custom audio description (e.g., 'car engine sound, traffic noise')",
511
+ value="",
512
+ )
513
  audio_negative_prompt = gr.Textbox(
514
  label="Audio Negative Prompt",
515
  value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
516
+ placeholder="Elements to avoid in audio",
517
  )
518
 
519
  with gr.Row():
 
521
  minimum=10,
522
  maximum=50,
523
  step=5,
524
+ value=30,
525
  label="๐ŸŽš๏ธ Audio Steps",
526
  info="More steps = better quality"
527
  )
 
558
 
559
  gr.HTML("""
560
  <div style="text-align: center; margin-top: 20px; color: #6b7280;">
561
+ <p>๐Ÿ’ก Tip: For better audio, use Custom Audio Prompt with sound descriptions!</p>
562
+ <p>๐ŸŽง Examples: "car engine sound", "crowd cheering", "nature ambience"</p>
563
  </div>
564
  """)
565
 
 
581
  height_input, width_input, duration_seconds_input,
582
  steps_slider,
583
  seed_input, randomize_seed_checkbox,
584
+ enable_audio, audio_custom_prompt, audio_negative_prompt,
585
+ audio_steps, audio_cfg_strength,
586
  ]
587
 
588
  generate_button.click(