James Zhou committed on
Commit
49611df
·
1 Parent(s): 3fff971

[update] neg prompt

Browse files
app.py CHANGED
@@ -120,6 +120,7 @@ def auto_load_models() -> str:
120
  def infer_single_video(
121
  video_file,
122
  text_prompt: str,
 
123
  guidance_scale: float = 4.5,
124
  num_inference_steps: int = 50,
125
  sample_nums: int = 1
@@ -147,7 +148,8 @@ def infer_single_video(
147
  video_file,
148
  text_prompt,
149
  model_dict,
150
- cfg
 
151
  )
152
 
153
  # Denoising process to generate multiple audio samples
@@ -566,6 +568,12 @@ def create_gradio_interface():
566
  placeholder="A person walks on frozen ice",
567
  lines=3,
568
  )
 
 
 
 
 
 
569
 
570
  with gr.Row():
571
  guidance_scale = gr.Slider(
@@ -748,10 +756,10 @@ def create_gradio_interface():
748
  example_buttons.append((example_btn, example))
749
 
750
  # Event handlers
751
- def process_inference(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
752
  # Generate videos
753
  video_list, status_msg = infer_single_video(
754
- video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
755
  )
756
  # Update outputs with proper visibility
757
  return update_video_outputs(video_list, status_msg)
@@ -777,7 +785,7 @@ def create_gradio_interface():
777
 
778
  generate_btn.click(
779
  fn=process_inference,
780
- inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
781
  outputs=[
782
  video_output_1, # Sample 1 value
783
  video_output_2, # Sample 2 value
@@ -810,12 +818,12 @@ def create_gradio_interface():
810
  if not result_video:
811
  status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
812
 
813
- return video_file, ex['caption'], result_video, status_msg
814
  return handler
815
 
816
  btn.click(
817
  fn=create_example_handler(example),
818
- outputs=[video_input, text_input, video_output_1, result_text]
819
  )
820
 
821
  # Footer
 
120
  def infer_single_video(
121
  video_file,
122
  text_prompt: str,
123
+ neg_prompt: str = None,
124
  guidance_scale: float = 4.5,
125
  num_inference_steps: int = 50,
126
  sample_nums: int = 1
 
148
  video_file,
149
  text_prompt,
150
  model_dict,
151
+ cfg,
152
+ neg_prompt=neg_prompt
153
  )
154
 
155
  # Denoising process to generate multiple audio samples
 
568
  placeholder="A person walks on frozen ice",
569
  lines=3,
570
  )
571
+
572
+ neg_prompt_input = gr.Textbox(
573
+ label="🚫 Negative Prompt",
574
+ placeholder="noisy, harsh",
575
+ lines=2,
576
+ )
577
 
578
  with gr.Row():
579
  guidance_scale = gr.Slider(
 
756
  example_buttons.append((example_btn, example))
757
 
758
  # Event handlers
759
+ def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
760
  # Generate videos
761
  video_list, status_msg = infer_single_video(
762
+ video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, int(sample_nums)
763
  )
764
  # Update outputs with proper visibility
765
  return update_video_outputs(video_list, status_msg)
 
785
 
786
  generate_btn.click(
787
  fn=process_inference,
788
+ inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
789
  outputs=[
790
  video_output_1, # Sample 1 value
791
  video_output_2, # Sample 2 value
 
818
  if not result_video:
819
  status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
820
 
821
+ return video_file, ex['caption'], "noisy, harsh", result_video, status_msg
822
  return handler
823
 
824
  btn.click(
825
  fn=create_example_handler(example),
826
+ outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
827
  )
828
 
829
  # Footer
hunyuanvideo_foley/utils/feature_utils.py CHANGED
@@ -10,7 +10,7 @@ from typing import Any, Dict, List, Union, Tuple
10
  from loguru import logger
11
 
12
  from .config_utils import AttributeDict
13
- from ..constants import FPS_VISUAL, MAX_VIDEO_DURATION_SECONDS
14
 
15
 
16
  class FeatureExtractionError(Exception):
@@ -134,9 +134,10 @@ def encode_text_feat(text: List[str], model_dict):
134
  return outputs.last_hidden_state, outputs.attentions
135
 
136
 
137
- def feature_process(video_path, prompt, model_dict, cfg):
138
  visual_feats, audio_len_in_s = encode_video_features(video_path, model_dict)
139
- neg_prompt = "noisy, harsh"
 
140
  prompts = [neg_prompt, prompt]
141
  text_feat_res, text_feat_mask = encode_text_feat(prompts, model_dict)
142
 
 
10
  from loguru import logger
11
 
12
  from .config_utils import AttributeDict
13
+ from ..constants import FPS_VISUAL, MAX_VIDEO_DURATION_SECONDS, DEFAULT_NEGATIVE_PROMPT
14
 
15
 
16
  class FeatureExtractionError(Exception):
 
134
  return outputs.last_hidden_state, outputs.attentions
135
 
136
 
137
+ def feature_process(video_path, prompt, model_dict, cfg, neg_prompt=None):
138
  visual_feats, audio_len_in_s = encode_video_features(video_path, model_dict)
139
+ if neg_prompt is None:
140
+ neg_prompt = DEFAULT_NEGATIVE_PROMPT # 使用常量中的默认值
141
  prompts = [neg_prompt, prompt]
142
  text_feat_res, text_feat_mask = encode_text_feat(prompts, model_dict)
143