# NOTE: the "Spaces: Sleeping" lines previously here were Hugging Face Spaces
# page chrome captured when this file was scraped — not part of the source.
import gradio as gr | |
from audio_transform_demo import AudioRedTeamOrchestrator | |
import os | |
from typing import Optional | |
# Define the available options for each hyperparameter

# Audio-capable chat models that can be targeted by the attack.
TARGET_MODELS = [
    "gpt-4o-audio-preview",
    "gemini-2.5-pro-preview-05-06",
    "gemini-2.0-flash",
    "gemini-2.5-flash-preview-04-17",
]
# Supported text-to-speech providers (passed to AudioRedTeamOrchestrator).
TTS_PROVIDERS = ["kokoro", "smallestai"]
# TTS model names; "Kokoro-82M" presumably pairs with the "kokoro" provider
# and the "lightning*" models with "smallestai" — verify against the
# orchestrator, as this UI does not enforce the pairing.
TTS_MODELS = [
    "Kokoro-82M",
    "lightning",
    "lightning-large",
]
# Voice choices. The parenthetical "(accent, gender, model)" suffix is
# display-only; process_attack strips everything from "(" onward before
# passing the bare voice id to the orchestrator.
VOICE_IDS = [
    "af_heart (American, F, Kokoro-82M)",
    "karen (British, F, lightning)",
    "rebecca (American, F, lightning)",
    "chetan (Indian, M, lightning)",
    "george (American, M, lightning)",
    "solomon (British, M, lightning-large)",
    "saina (Indian, F, lightning)",
    "angela (British, F, lightning-large)",
    "nyah (Australian, F, lightning-large)",
]
# Audio transformations selectable in the UI; "none" disables transforms.
TRANSFORM_TYPES = ["none", "speed", "pitch", "echo", "reverb", "noise"]
def create_transform_ui():
    """Create dynamic UI components for transform parameters.

    Builds one (initially hidden) group of parameter widgets per transform
    type and wires the transform-type dropdown so that only the widgets for
    the currently selected type are visible.

    Returns:
        tuple: ``(transform_block, param_map)`` where ``transform_block`` is
        the ``gr.Blocks`` container holding the widgets, and ``param_map``
        maps each transform type name to a dict of keyword-argument name ->
        Gradio component (mirroring the kwargs built in ``process_attack``).
    """
    with gr.Blocks() as transform_block:
        transform_type = gr.Dropdown(
            choices=TRANSFORM_TYPES,
            label="Transform Type",
            value="none",
            info="Select the type of audio transformation to apply",
        )
        # Speed transform parameters
        speed_rate = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Speed Rate",
            info="Rate of the speed effect (1.0 = original speed)",
            visible=False,
        )
        # Pitch transform parameters
        pitch_steps = gr.Slider(
            minimum=-12,
            maximum=12,
            value=4,
            step=1,
            label="Pitch Steps",
            info="Steps to shift the pitch (in semitones)",
            visible=False,
        )
        pitch_bins = gr.Slider(
            minimum=6,
            maximum=24,
            value=12,
            step=1,
            label="Bins per Octave",
            info="Steps per octave (12 = standard semitones)",
            visible=False,
        )
        # Echo transform parameters
        echo_delay = gr.Slider(
            minimum=50,
            maximum=1000,
            value=250,
            step=1,
            label="Echo Delay (ms)",
            info="Delay in milliseconds for the echo effect",
            visible=False,
        )
        echo_volume = gr.Slider(
            minimum=-20,
            maximum=0,
            value=-6,
            step=1,
            label="Echo Volume (dB)",
            info="Volume change in dB for the echo effect",
            visible=False,
        )
        # Reverb transform parameters
        reverb_rir = gr.File(
            label="Room Impulse Response File",
            file_types=[".wav"],
            visible=False,
        )
        reverb_method = gr.Radio(
            choices=["fft", "conv1d"],
            value="fft",
            label="Convolution Method",
            info="Method to use for applying reverb",
            visible=False,
        )
        # Noise transform parameters
        noise_file = gr.File(
            label="Noise File",
            file_types=[".wav"],
            visible=False,
        )
        noise_volume = gr.Slider(
            minimum=-20,
            maximum=40,
            value=-10,
            step=1,
            label="Noise Volume (dB)",
            info="Volume change in dB for the noise effect",
            visible=False,
        )

        # Show only the widgets belonging to the selected transform type.
        # BUGFIX: the previous code called the per-component method
        # (e.g. speed_rate.update(...)), which was deprecated in Gradio 3.x
        # and removed in 4.x. Use the module-level gr.update(...) instead,
        # matching how create_redteam_demo already does it in this file.
        def update_transform_ui(transform_type):
            return [
                gr.update(visible=transform_type == "speed"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "noise"),
                gr.update(visible=transform_type == "noise"),
            ]

        # Connect the transform type dropdown to update the UI; the output
        # order must match the list returned by update_transform_ui.
        transform_type.change(
            fn=update_transform_ui,
            inputs=[transform_type],
            outputs=[
                speed_rate,
                pitch_steps,
                pitch_bins,
                echo_delay,
                echo_volume,
                reverb_rir,
                reverb_method,
                noise_file,
                noise_volume,
            ],
        )
    return transform_block, {
        "speed": {"rate": speed_rate},
        "pitch": {"n_steps": pitch_steps, "bins_per_octave": pitch_bins},
        "echo": {"delay": echo_delay, "volume": echo_volume},
        "reverb": {"rir_path": reverb_rir, "conv_method": reverb_method},
        "noise": {"noise_path": noise_file, "volume": noise_volume},
    }
def create_redteam_demo():
    """Build the main Gradio app for running audio red-team attacks.

    Lays out the attack-prompt textbox, hyperparameter dropdowns, and one
    hidden parameter-widget group per transform type, then wires a button
    to ``process_attack``. Returns the assembled ``gr.Blocks`` demo.
    """

    def process_attack(
        prompt,
        target_model,
        tts_provider,
        tts_model,
        voice_id,
        transform_type,
        speed_rate: Optional[float] = None,
        pitch_steps: Optional[int] = None,
        pitch_bins: Optional[int] = None,
        echo_delay: Optional[int] = None,
        echo_volume: Optional[int] = None,
        reverb_rir: Optional[str] = None,
        reverb_method: Optional[str] = None,
        noise_file: Optional[str] = None,
        noise_volume: Optional[int] = None,
    ):
        """Run a single attack and return (result, audio_path_or_None).

        The first element feeds the JSON output component; the second feeds
        the Audio component (None when no audio file was produced).
        """
        # Collect only the kwargs relevant to the chosen transform type;
        # None means "apply no transform".
        transform_kwargs = None
        if transform_type != "none":
            if transform_type == "speed":
                transform_kwargs = {"rate": speed_rate}
            elif transform_type == "pitch":
                transform_kwargs = {
                    "n_steps": pitch_steps,
                    "bins_per_octave": pitch_bins,
                }
            elif transform_type == "echo":
                transform_kwargs = {"delay": echo_delay, "volume": echo_volume}
            elif transform_type == "reverb":
                # NOTE(review): .name assumes the File component delivers a
                # tempfile-like object (Gradio 3.x behavior); in Gradio 4.x
                # gr.File passes a plain filepath str — confirm the installed
                # version before relying on this.
                transform_kwargs = {
                    "rir_path": reverb_rir.name if reverb_rir else None,
                    "conv_method": reverb_method,
                }
            elif transform_type == "noise":
                transform_kwargs = {
                    "noise_path": noise_file.name if noise_file else None,
                    "volume": noise_volume,
                }
        # Dropdown entries look like "af_heart (American, F, Kokoro-82M)";
        # keep only the bare voice id before the "(".
        voice_id = voice_id.split("(")[0].strip()
        print("Voice ID: ", voice_id)
        # Initialize the orchestrator with selected parameters
        orchestrator = AudioRedTeamOrchestrator(
            tts_provider=tts_provider,
            model_name=target_model,
            voice_id=voice_id,
            tts_model=tts_model,
        )
        # Create a temporary directory for saving files
        save_dir = "temp_audio_files"
        os.makedirs(save_dir, exist_ok=True)
        # Fixed ID for this attack; the audio file below is derived from it.
        prompt_id = "attack_1"
        # Run the attack
        result = orchestrator.attack(
            prompt=prompt,
            prompt_id=prompt_id,
            save_dir=save_dir,
            model_name=target_model,
            generate_audio=False,
            transform_type=transform_type if transform_type != "none" else None,
            transform_kwargs=transform_kwargs,
        )
        print("Attack run successfully")
        if result:
            # The orchestrator is expected to have written the (possibly
            # transformed) audio here — TODO confirm naming convention.
            audio_path = f"{save_dir}/{prompt_id}.wav"
            if os.path.exists(audio_path):
                print("Audio file exists")
                return result, audio_path
            else:
                return result, None
        else:
            return {"Evaluation Result": "Attack failed or no response generated"}, None

    with gr.Blocks() as demo:
        # Core attack inputs.
        prompt = gr.Textbox(label="Attack Prompt", lines=3)
        target_model = gr.Dropdown(
            choices=TARGET_MODELS, label="Target Model", value=TARGET_MODELS[0]
        )
        tts_provider = gr.Dropdown(
            choices=TTS_PROVIDERS, label="TTS Provider", value=TTS_PROVIDERS[0]
        )
        tts_model = gr.Dropdown(
            choices=TTS_MODELS, label="TTS Model", value=TTS_MODELS[0]
        )
        voice_id = gr.Dropdown(choices=VOICE_IDS, label="Voice ID", value=VOICE_IDS[0])
        transform_type = gr.Dropdown(
            choices=TRANSFORM_TYPES,
            label="Transform Type",
            value="none",
            interactive=True,
            info="Select the type of audio transformation to apply",
        )
        # Per-transform parameter widgets, hidden until their type is chosen.
        speed_rate = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Speed Rate",
            interactive=True,
            info="Rate of the speed effect (1.0 = original speed)",
            visible=False,
        )
        pitch_steps = gr.Slider(
            minimum=-12,
            maximum=12,
            value=4,
            step=1,
            label="Pitch Steps",
            interactive=True,
            info="Steps to shift the pitch (in semitones)",
            visible=False,
        )
        pitch_bins = gr.Slider(
            minimum=6,
            maximum=24,
            value=12,
            step=1,
            label="Bins per Octave",
            interactive=True,
            info="Steps per octave (12 = standard semitones)",
            visible=False,
        )
        echo_delay = gr.Slider(
            minimum=50,
            maximum=1000,
            value=250,
            step=1,
            label="Echo Delay (ms)",
            interactive=True,
            info="Delay in milliseconds for the echo effect",
            visible=False,
        )
        echo_volume = gr.Slider(
            minimum=-20,
            maximum=0,
            value=-6,
            step=1,
            label="Echo Volume (dB)",
            interactive=True,
            info="Volume change in dB for the echo effect",
            visible=False,
        )
        reverb_rir = gr.File(
            label="Room Impulse Response File",
            file_types=[".wav"],
            interactive=True,
            visible=False,
        )
        reverb_method = gr.Radio(
            choices=["fft", "conv1d"],
            value="fft",
            label="Convolution Method",
            interactive=True,
            info="Method to use for applying reverb",
            visible=False,
        )
        noise_file = gr.File(
            label="Noise File",
            file_types=[".wav"],
            interactive=True,
            visible=False,
        )
        noise_volume = gr.Slider(
            minimum=-20,
            maximum=40,
            value=-10,
            step=1,
            label="Noise Volume (dB)",
            interactive=True,
            info="Volume change in dB for the noise effect",
            visible=False,
        )

        # Show only the widgets for the selected transform type; the list
        # order must match the `outputs` list of the .change() call below.
        def update_transform_ui(transform_type):
            return [
                gr.update(visible=transform_type == "speed"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "noise"),
                gr.update(visible=transform_type == "noise"),
            ]

        transform_type.change(
            fn=update_transform_ui,
            inputs=[transform_type],
            outputs=[
                speed_rate,
                pitch_steps,
                pitch_bins,
                echo_delay,
                echo_volume,
                reverb_rir,
                reverb_method,
                noise_file,
                noise_volume,
            ],
        )
        # Outputs: evaluation result as JSON plus the generated audio.
        btn = gr.Button("Run Attack")
        out_json = gr.JSON()
        out_audio = gr.Audio()
        # Input order here must match process_attack's parameter order.
        btn.click(
            fn=process_attack,
            inputs=[
                prompt,
                target_model,
                tts_provider,
                tts_model,
                voice_id,
                transform_type,
                speed_rate,
                pitch_steps,
                pitch_bins,
                echo_delay,
                echo_volume,
                reverb_rir,
                reverb_method,
                noise_file,
                noise_volume,
            ],
            outputs=[out_json, out_audio],
        )
    return demo
if __name__ == "__main__":
    # Build the demo and serve it with a public share link when run directly.
    create_redteam_demo().launch(share=True)