# Audio-RedTeaming-Demo — audio_redteam_demo.py
# Hugging Face Space file (originally uploaded via huggingface_hub by jena-shreyas, rev 232ff56).
import gradio as gr
from audio_transform_demo import AudioRedTeamOrchestrator
import os
from typing import Optional
# Define the available options for each hyperparameter
# Audio-capable chat models offered in the "Target Model" dropdown; passed
# through to AudioRedTeamOrchestrator as model_name.
TARGET_MODELS = [
    "gpt-4o-audio-preview",
    "gemini-2.5-pro-preview-05-06",
    "gemini-2.0-flash",
    "gemini-2.5-flash-preview-04-17",
]
# Text-to-speech backends accepted by the orchestrator's tts_provider arg.
TTS_PROVIDERS = ["kokoro", "smallestai"]
# TTS model identifiers. Presumably "Kokoro-82M" pairs with the "kokoro"
# provider and the "lightning" variants with "smallestai" — TODO confirm
# against audio_transform_demo; the UI does not enforce the pairing.
TTS_MODELS = [
    "Kokoro-82M",
    "lightning",
    "lightning-large",
]
# Voice choices rendered as "id (accent, gender, model)". process_attack
# keeps only the bare id before "(" when calling the orchestrator.
VOICE_IDS = [
    "af_heart (American, F, Kokoro-82M)",
    "karen (British, F, lightning)",
    "rebecca (American, F, lightning)",
    "chetan (Indian, M, lightning)",
    "george (American, M, lightning)",
    "solomon (British, M, lightning-large)",
    "saina (Indian, F, lightning)",
    "angela (British, F, lightning-large)",
    "nyah (Australian, F, lightning-large)",
]
# Audio transforms selectable in the UI; "none" disables transformation.
TRANSFORM_TYPES = ["none", "speed", "pitch", "echo", "reverb", "noise"]
def create_transform_ui():
    """Build a standalone Blocks UI for choosing and configuring a transform.

    Creates the transform-type dropdown plus one hidden parameter widget
    group per transform, and wires a change handler that shows only the
    widgets relevant to the current selection.

    Returns:
        tuple[gr.Blocks, dict]: the assembled Blocks container and a mapping
        from transform name to its parameter components, e.g.
        ``{"speed": {"rate": <Slider>}, ...}`` — keys match the kwarg names
        expected by the orchestrator's transform_kwargs.
    """
    with gr.Blocks() as transform_block:
        transform_type = gr.Dropdown(
            choices=TRANSFORM_TYPES,
            label="Transform Type",
            value="none",
            info="Select the type of audio transformation to apply",
        )
        # Speed transform parameters
        speed_rate = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Speed Rate",
            info="Rate of the speed effect (1.0 = original speed)",
            visible=False,
        )
        # Pitch transform parameters
        pitch_steps = gr.Slider(
            minimum=-12,
            maximum=12,
            value=4,
            step=1,
            label="Pitch Steps",
            info="Steps to shift the pitch (in semitones)",
            visible=False,
        )
        pitch_bins = gr.Slider(
            minimum=6,
            maximum=24,
            value=12,
            step=1,
            label="Bins per Octave",
            info="Steps per octave (12 = standard semitones)",
            visible=False,
        )
        # Echo transform parameters
        echo_delay = gr.Slider(
            minimum=50,
            maximum=1000,
            value=250,
            step=1,
            label="Echo Delay (ms)",
            info="Delay in milliseconds for the echo effect",
            visible=False,
        )
        echo_volume = gr.Slider(
            minimum=-20,
            maximum=0,
            value=-6,
            step=1,
            label="Echo Volume (dB)",
            info="Volume change in dB for the echo effect",
            visible=False,
        )
        # Reverb transform parameters
        reverb_rir = gr.File(
            label="Room Impulse Response File",
            file_types=[".wav"],
            visible=False,
        )
        reverb_method = gr.Radio(
            choices=["fft", "conv1d"],
            value="fft",
            label="Convolution Method",
            info="Method to use for applying reverb",
            visible=False,
        )
        # Noise transform parameters
        noise_file = gr.File(
            label="Noise File",
            file_types=[".wav"],
            visible=False,
        )
        noise_volume = gr.Slider(
            minimum=-20,
            maximum=40,
            value=-10,
            step=1,
            label="Noise Volume (dB)",
            info="Volume change in dB for the noise effect",
            visible=False,
        )

        # Show only the parameter widgets for the selected transform.
        # Uses gr.update(...) rather than the removed instance-level
        # Component.update(...) (deprecated in Gradio 3.x, gone in 4.x);
        # this also matches the handler style used in create_redteam_demo.
        def update_transform_ui(transform_type):
            return [
                gr.update(visible=transform_type == "speed"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "noise"),
                gr.update(visible=transform_type == "noise"),
            ]

        # Connect the transform type dropdown to update the UI
        transform_type.change(
            fn=update_transform_ui,
            inputs=[transform_type],
            outputs=[
                speed_rate,
                pitch_steps,
                pitch_bins,
                echo_delay,
                echo_volume,
                reverb_rir,
                reverb_method,
                noise_file,
                noise_volume,
            ],
        )
    return transform_block, {
        "speed": {"rate": speed_rate},
        "pitch": {"n_steps": pitch_steps, "bins_per_octave": pitch_bins},
        "echo": {"delay": echo_delay, "volume": echo_volume},
        "reverb": {"rir_path": reverb_rir, "conv_method": reverb_method},
        "noise": {"noise_path": noise_file, "volume": noise_volume},
    }
def create_redteam_demo():
    """Build the main red-teaming Gradio app.

    Layout: an attack-prompt textbox; dropdowns for target model, TTS
    provider, TTS model and voice; a transform selector whose per-transform
    parameter widgets are shown/hidden dynamically; and a "Run Attack"
    button that renders the evaluation result (JSON) and generated audio.

    Returns:
        gr.Blocks: the assembled (unlaunched) demo.
    """

    def _uploaded_path(file_value):
        """Resolve a gr.File value to a filesystem path, or None if unset.

        Depending on the Gradio version, a File component passes either a
        plain str path (4.x default) or a tempfile-like object exposing
        ``.name`` (3.x); accept both so the handler never raises
        AttributeError on a valid upload.
        """
        if file_value is None:
            return None
        return file_value if isinstance(file_value, str) else file_value.name

    def process_attack(
        prompt,
        target_model,
        tts_provider,
        tts_model,
        voice_id,
        transform_type,
        speed_rate: Optional[float] = None,
        pitch_steps: Optional[int] = None,
        pitch_bins: Optional[int] = None,
        echo_delay: Optional[int] = None,
        echo_volume: Optional[int] = None,
        reverb_rir: Optional[str] = None,
        reverb_method: Optional[str] = None,
        noise_file: Optional[str] = None,
        noise_volume: Optional[int] = None,
    ):
        """Run one attack; return (evaluation result, audio path or None)."""
        # Collect only the kwargs relevant to the selected transform;
        # stays None when transform_type is "none".
        transform_kwargs = None
        if transform_type == "speed":
            transform_kwargs = {"rate": speed_rate}
        elif transform_type == "pitch":
            transform_kwargs = {
                "n_steps": pitch_steps,
                "bins_per_octave": pitch_bins,
            }
        elif transform_type == "echo":
            transform_kwargs = {"delay": echo_delay, "volume": echo_volume}
        elif transform_type == "reverb":
            transform_kwargs = {
                "rir_path": _uploaded_path(reverb_rir),
                "conv_method": reverb_method,
            }
        elif transform_type == "noise":
            transform_kwargs = {
                "noise_path": _uploaded_path(noise_file),
                "volume": noise_volume,
            }

        # Dropdown entries look like "af_heart (American, F, Kokoro-82M)";
        # the orchestrator wants only the bare id before "(".
        voice_id = voice_id.split("(")[0].strip()
        print("Voice ID: ", voice_id)

        # Initialize the orchestrator with the selected hyperparameters.
        orchestrator = AudioRedTeamOrchestrator(
            tts_provider=tts_provider,
            model_name=target_model,
            voice_id=voice_id,
            tts_model=tts_model,
        )

        # Working directory for generated audio artifacts.
        save_dir = "temp_audio_files"
        os.makedirs(save_dir, exist_ok=True)
        # NOTE(review): a fixed id means concurrent runs overwrite each
        # other's output file — consider a uuid per request if that matters.
        prompt_id = "attack_1"

        # Run the attack; "none" is translated to no transform at all.
        result = orchestrator.attack(
            prompt=prompt,
            prompt_id=prompt_id,
            save_dir=save_dir,
            model_name=target_model,
            generate_audio=False,
            transform_type=transform_type if transform_type != "none" else None,
            transform_kwargs=transform_kwargs,
        )
        print("Attack run successfully")

        if not result:
            return {"Evaluation Result": "Attack failed or no response generated"}, None
        # Attach the generated audio only if the orchestrator wrote it.
        audio_path = f"{save_dir}/{prompt_id}.wav"
        return result, audio_path if os.path.exists(audio_path) else None

    with gr.Blocks() as demo:
        prompt = gr.Textbox(label="Attack Prompt", lines=3)
        target_model = gr.Dropdown(
            choices=TARGET_MODELS, label="Target Model", value=TARGET_MODELS[0]
        )
        tts_provider = gr.Dropdown(
            choices=TTS_PROVIDERS, label="TTS Provider", value=TTS_PROVIDERS[0]
        )
        tts_model = gr.Dropdown(
            choices=TTS_MODELS, label="TTS Model", value=TTS_MODELS[0]
        )
        voice_id = gr.Dropdown(choices=VOICE_IDS, label="Voice ID", value=VOICE_IDS[0])
        transform_type = gr.Dropdown(
            choices=TRANSFORM_TYPES,
            label="Transform Type",
            value="none",
            interactive=True,
            info="Select the type of audio transformation to apply",
        )
        # Per-transform parameter widgets; all hidden until their transform
        # is selected (see update_transform_ui below).
        speed_rate = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Speed Rate",
            interactive=True,
            info="Rate of the speed effect (1.0 = original speed)",
            visible=False,
        )
        pitch_steps = gr.Slider(
            minimum=-12,
            maximum=12,
            value=4,
            step=1,
            label="Pitch Steps",
            interactive=True,
            info="Steps to shift the pitch (in semitones)",
            visible=False,
        )
        pitch_bins = gr.Slider(
            minimum=6,
            maximum=24,
            value=12,
            step=1,
            label="Bins per Octave",
            interactive=True,
            info="Steps per octave (12 = standard semitones)",
            visible=False,
        )
        echo_delay = gr.Slider(
            minimum=50,
            maximum=1000,
            value=250,
            step=1,
            label="Echo Delay (ms)",
            interactive=True,
            info="Delay in milliseconds for the echo effect",
            visible=False,
        )
        echo_volume = gr.Slider(
            minimum=-20,
            maximum=0,
            value=-6,
            step=1,
            label="Echo Volume (dB)",
            interactive=True,
            info="Volume change in dB for the echo effect",
            visible=False,
        )
        reverb_rir = gr.File(
            label="Room Impulse Response File",
            file_types=[".wav"],
            interactive=True,
            visible=False,
        )
        reverb_method = gr.Radio(
            choices=["fft", "conv1d"],
            value="fft",
            label="Convolution Method",
            interactive=True,
            info="Method to use for applying reverb",
            visible=False,
        )
        noise_file = gr.File(
            label="Noise File",
            file_types=[".wav"],
            interactive=True,
            visible=False,
        )
        noise_volume = gr.Slider(
            minimum=-20,
            maximum=40,
            value=-10,
            step=1,
            label="Noise Volume (dB)",
            interactive=True,
            info="Volume change in dB for the noise effect",
            visible=False,
        )

        # Show only the parameter widgets for the selected transform; the
        # update order must match the outputs list of transform_type.change.
        def update_transform_ui(transform_type):
            return [
                gr.update(visible=transform_type == "speed"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "pitch"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "echo"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "reverb"),
                gr.update(visible=transform_type == "noise"),
                gr.update(visible=transform_type == "noise"),
            ]

        transform_type.change(
            fn=update_transform_ui,
            inputs=[transform_type],
            outputs=[
                speed_rate,
                pitch_steps,
                pitch_bins,
                echo_delay,
                echo_volume,
                reverb_rir,
                reverb_method,
                noise_file,
                noise_volume,
            ],
        )

        btn = gr.Button("Run Attack")
        out_json = gr.JSON()
        out_audio = gr.Audio()
        # Input order must match process_attack's parameter order.
        btn.click(
            fn=process_attack,
            inputs=[
                prompt,
                target_model,
                tts_provider,
                tts_model,
                voice_id,
                transform_type,
                speed_rate,
                pitch_steps,
                pitch_bins,
                echo_delay,
                echo_volume,
                reverb_rir,
                reverb_method,
                noise_file,
                noise_volume,
            ],
            outputs=[out_json, out_audio],
        )
    return demo
if __name__ == "__main__":
    # Entry point: build the UI and serve it with a public share link.
    create_redteam_demo().launch(share=True)