Commit 49611df · Parent(s): 3fff971
James Zhou committed

[update] neg prompt
Files changed:
- app.py (+14 −6)
- hunyuanvideo_foley/utils/feature_utils.py (+4 −3)
app.py CHANGED

```diff
@@ -120,6 +120,7 @@ def auto_load_models() -> str:
 def infer_single_video(
     video_file,
     text_prompt: str,
+    neg_prompt: str = None,
     guidance_scale: float = 4.5,
     num_inference_steps: int = 50,
     sample_nums: int = 1
@@ -147,7 +148,8 @@ def infer_single_video(
         video_file,
         text_prompt,
         model_dict,
-        cfg
+        cfg,
+        neg_prompt=neg_prompt
     )
 
     # Denoising process to generate multiple audio samples
@@ -566,6 +568,12 @@ def create_gradio_interface():
         placeholder="A person walks on frozen ice",
         lines=3,
     )
+
+    neg_prompt_input = gr.Textbox(
+        label="🚫 Negative Prompt",
+        placeholder="noisy, harsh",
+        lines=2,
+    )
 
     with gr.Row():
         guidance_scale = gr.Slider(
@@ -748,10 +756,10 @@ def create_gradio_interface():
     example_buttons.append((example_btn, example))
 
     # Event handlers
-    def process_inference(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+    def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
         # Generate videos
         video_list, status_msg = infer_single_video(
-            video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
+            video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, int(sample_nums)
         )
         # Update outputs with proper visibility
         return update_video_outputs(video_list, status_msg)
@@ -777,7 +785,7 @@ def create_gradio_interface():
 
     generate_btn.click(
         fn=process_inference,
-        inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
+        inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
         outputs=[
             video_output_1,  # Sample 1 value
             video_output_2,  # Sample 2 value
@@ -810,12 +818,12 @@ def create_gradio_interface():
         if not result_video:
             status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
 
-        return video_file, ex['caption'], result_video, status_msg
+        return video_file, ex['caption'], "noisy, harsh", result_video, status_msg
     return handler
 
     btn.click(
         fn=create_example_handler(example),
-        outputs=[video_input, text_input, video_output_1, result_text]
+        outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
     )
 
     # Footer
```
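Taken together, the app.py changes thread the new argument end to end: the 🚫 Negative Prompt textbox feeds `process_inference`, which forwards it through `infer_single_video` to `feature_process`. Below is a minimal sketch of a call to the updated entry point; only the parameter names come from the diff, and the path and prompt values are illustrative.

```python
# Hypothetical call to the updated entry point. The signature matches the
# diff above; the concrete values are placeholders, not repo defaults.
video_list, status_msg = infer_single_video(
    video_file="examples/ice_walk.mp4",         # assumed example path
    text_prompt="A person walks on frozen ice",
    neg_prompt="noisy, harsh",                  # None falls back to DEFAULT_NEGATIVE_PROMPT
    guidance_scale=4.5,
    num_inference_steps=50,
    sample_nums=1,
)
```

Leaving `neg_prompt` unset keeps the previous behavior, since `feature_process` substitutes `DEFAULT_NEGATIVE_PROMPT`.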
hunyuanvideo_foley/utils/feature_utils.py CHANGED

```diff
@@ -10,7 +10,7 @@ from typing import Any, Dict, List, Union, Tuple
 from loguru import logger
 
 from .config_utils import AttributeDict
-from ..constants import FPS_VISUAL, MAX_VIDEO_DURATION_SECONDS
+from ..constants import FPS_VISUAL, MAX_VIDEO_DURATION_SECONDS, DEFAULT_NEGATIVE_PROMPT
 
 
 class FeatureExtractionError(Exception):
@@ -134,9 +134,10 @@ def encode_text_feat(text: List[str], model_dict):
     return outputs.last_hidden_state, outputs.attentions
 
 
-def feature_process(video_path, prompt, model_dict, cfg):
+def feature_process(video_path, prompt, model_dict, cfg, neg_prompt=None):
     visual_feats, audio_len_in_s = encode_video_features(video_path, model_dict)
-    neg_prompt
+    if neg_prompt is None:
+        neg_prompt = DEFAULT_NEGATIVE_PROMPT  # use the default value from constants
     prompts = [neg_prompt, prompt]
     text_feat_res, text_feat_mask = encode_text_feat(prompts, model_dict)
 
```
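The `prompts = [neg_prompt, prompt]` pairing is the standard classifier-free guidance setup: both texts are encoded in one batch so the sampler can steer each denoising step away from the negative conditioning and toward the positive one. The denoising loop itself is not shown in this diff; the sketch below illustrates the usual CFG combination under that assumption, with `model`, `latents`, and `cfg_step` as hypothetical names.

```python
import torch

def cfg_step(model, latents, t, text_feat, guidance_scale=4.5):
    """One guided denoising step (illustrative sketch, not the repo's sampler).

    text_feat holds the negative conditioning at index 0 and the positive
    at index 1, matching prompts = [neg_prompt, prompt] above.
    """
    # Evaluate the model on the same latents under both conditionings.
    pred = model(torch.cat([latents, latents]), t, text_feat)
    neg_pred, pos_pred = pred.chunk(2)
    # Move away from the negative prediction, toward the positive one.
    return neg_pred + guidance_scale * (pos_pred - neg_pred)
```

At `guidance_scale=1.0` this reduces to the positive-prompt prediction alone; the app's default of 4.5 amplifies the contrast between the two conditionings, which is how a negative prompt like "noisy, harsh" suppresses those qualities in the generated audio.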