import gradio as gr from audio_transform_demo import AudioRedTeamOrchestrator import os from typing import Optional # Define the available options for each hyperparameter TARGET_MODELS = [ "gpt-4o-audio-preview", "gemini-2.5-pro-preview-05-06", "gemini-2.0-flash", "gemini-2.5-flash-preview-04-17", ] TTS_PROVIDERS = ["kokoro", "smallestai"] TTS_MODELS = [ "Kokoro-82M", "lightning", "lightning-large", ] VOICE_IDS = [ "af_heart (American, F, Kokoro-82M)", "karen (British, F, lightning)", "rebecca (American, F, lightning)", "chetan (Indian, M, lightning)", "george (American, M, lightning)", "solomon (British, M, lightning-large)", "saina (Indian, F, lightning)", "angela (British, F, lightning-large)", "nyah (Australian, F, lightning-large)", ] TRANSFORM_TYPES = ["none", "speed", "pitch", "echo", "reverb", "noise"] def create_transform_ui(): """Create dynamic UI components for transform parameters""" with gr.Blocks() as transform_block: transform_type = gr.Dropdown( choices=TRANSFORM_TYPES, label="Transform Type", value="none", info="Select the type of audio transformation to apply", ) # Speed transform parameters speed_rate = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed Rate", info="Rate of the speed effect (1.0 = original speed)", visible=False, ) # Pitch transform parameters pitch_steps = gr.Slider( minimum=-12, maximum=12, value=4, step=1, label="Pitch Steps", info="Steps to shift the pitch (in semitones)", visible=False, ) pitch_bins = gr.Slider( minimum=6, maximum=24, value=12, step=1, label="Bins per Octave", info="Steps per octave (12 = standard semitones)", visible=False, ) # Echo transform parameters echo_delay = gr.Slider( minimum=50, maximum=1000, value=250, step=1, label="Echo Delay (ms)", info="Delay in milliseconds for the echo effect", visible=False, ) echo_volume = gr.Slider( minimum=-20, maximum=0, value=-6, step=1, label="Echo Volume (dB)", info="Volume change in dB for the echo effect", visible=False, ) # Reverb transform parameters reverb_rir = gr.File( label="Room Impulse Response File", file_types=[".wav"], visible=False, ) reverb_method = gr.Radio( choices=["fft", "conv1d"], value="fft", label="Convolution Method", info="Method to use for applying reverb", visible=False, ) # Noise transform parameters noise_file = gr.File( label="Noise File", file_types=[".wav"], visible=False, ) noise_volume = gr.Slider( minimum=-20, maximum=40, value=-10, step=1, label="Noise Volume (dB)", info="Volume change in dB for the noise effect", visible=False, ) # Function to update visible components based on transform type def update_transform_ui(transform_type): return [ speed_rate.update(visible=transform_type == "speed"), pitch_steps.update(visible=transform_type == "pitch"), pitch_bins.update(visible=transform_type == "pitch"), echo_delay.update(visible=transform_type == "echo"), echo_volume.update(visible=transform_type == "echo"), reverb_rir.update(visible=transform_type == "reverb"), reverb_method.update(visible=transform_type == "reverb"), noise_file.update(visible=transform_type == "noise"), noise_volume.update(visible=transform_type == "noise"), ] # Connect the transform type dropdown to update the UI transform_type.change( fn=update_transform_ui, inputs=[transform_type], outputs=[ speed_rate, pitch_steps, pitch_bins, echo_delay, echo_volume, reverb_rir, reverb_method, noise_file, noise_volume, ], ) return transform_block, { "speed": {"rate": speed_rate}, "pitch": {"n_steps": pitch_steps, "bins_per_octave": pitch_bins}, "echo": {"delay": echo_delay, "volume": echo_volume}, "reverb": {"rir_path": reverb_rir, "conv_method": reverb_method}, "noise": {"noise_path": noise_file, "volume": noise_volume}, } def create_redteam_demo(): def process_attack( prompt, target_model, tts_provider, tts_model, voice_id, transform_type, speed_rate: Optional[float] = None, pitch_steps: Optional[int] = None, pitch_bins: Optional[int] = None, echo_delay: Optional[int] = None, echo_volume: Optional[int] = None, reverb_rir: Optional[str] = None, reverb_method: Optional[str] = None, noise_file: Optional[str] = None, noise_volume: Optional[int] = None, ): # Prepare transform parameters # print("AAAA") transform_kwargs = None if transform_type != "none": if transform_type == "speed": transform_kwargs = {"rate": speed_rate} elif transform_type == "pitch": transform_kwargs = { "n_steps": pitch_steps, "bins_per_octave": pitch_bins, } elif transform_type == "echo": transform_kwargs = {"delay": echo_delay, "volume": echo_volume} elif transform_type == "reverb": transform_kwargs = { "rir_path": reverb_rir.name if reverb_rir else None, "conv_method": reverb_method, } elif transform_type == "noise": transform_kwargs = { "noise_path": noise_file.name if noise_file else None, "volume": noise_volume, } # print("BBBB") voice_id = voice_id.split("(")[0].strip() print("Voice ID: ", voice_id) # Initialize the orchestrator with selected parameters orchestrator = AudioRedTeamOrchestrator( tts_provider=tts_provider, model_name=target_model, voice_id=voice_id, tts_model=tts_model, ) # print("CCCC") # Create a temporary directory for saving files save_dir = "temp_audio_files" os.makedirs(save_dir, exist_ok=True) # Generate a unique ID for this attack prompt_id = "attack_1" # print("DDDD") # Run the attack result = orchestrator.attack( prompt=prompt, prompt_id=prompt_id, save_dir=save_dir, model_name=target_model, generate_audio=False, transform_type=transform_type if transform_type != "none" else None, transform_kwargs=transform_kwargs, ) # print("EEEE") print("Attack run successfully") if result: audio_path = f"{save_dir}/{prompt_id}.wav" if os.path.exists(audio_path): print("Audio file exists") return result, audio_path else: return result, None else: return {"Evaluation Result": "Attack failed or no response generated"}, None with gr.Blocks() as demo: prompt = gr.Textbox(label="Attack Prompt", lines=3) target_model = gr.Dropdown( choices=TARGET_MODELS, label="Target Model", value=TARGET_MODELS[0] ) tts_provider = gr.Dropdown( choices=TTS_PROVIDERS, label="TTS Provider", value=TTS_PROVIDERS[0] ) tts_model = gr.Dropdown( choices=TTS_MODELS, label="TTS Model", value=TTS_MODELS[0] ) voice_id = gr.Dropdown(choices=VOICE_IDS, label="Voice ID", value=VOICE_IDS[0]) transform_type = gr.Dropdown( choices=TRANSFORM_TYPES, label="Transform Type", value="none", interactive=True, info="Select the type of audio transformation to apply", ) speed_rate = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed Rate", interactive=True, info="Rate of the speed effect (1.0 = original speed)", visible=False, ) pitch_steps = gr.Slider( minimum=-12, maximum=12, value=4, step=1, label="Pitch Steps", interactive=True, info="Steps to shift the pitch (in semitones)", visible=False, ) pitch_bins = gr.Slider( minimum=6, maximum=24, value=12, step=1, label="Bins per Octave", interactive=True, info="Steps per octave (12 = standard semitones)", visible=False, ) echo_delay = gr.Slider( minimum=50, maximum=1000, value=250, step=1, label="Echo Delay (ms)", interactive=True, info="Delay in milliseconds for the echo effect", visible=False, ) echo_volume = gr.Slider( minimum=-20, maximum=0, value=-6, step=1, label="Echo Volume (dB)", interactive=True, info="Volume change in dB for the echo effect", visible=False, ) reverb_rir = gr.File( label="Room Impulse Response File", file_types=[".wav"], interactive=True, visible=False, ) reverb_method = gr.Radio( choices=["fft", "conv1d"], value="fft", label="Convolution Method", interactive=True, info="Method to use for applying reverb", visible=False, ) noise_file = gr.File( label="Noise File", file_types=[".wav"], interactive=True, visible=False, ) noise_volume = gr.Slider( minimum=-20, maximum=40, value=-10, step=1, label="Noise Volume (dB)", interactive=True, info="Volume change in dB for the noise effect", visible=False, ) # Function to update visible components based on transform type def update_transform_ui(transform_type): return [ gr.update(visible=transform_type == "speed"), gr.update(visible=transform_type == "pitch"), gr.update(visible=transform_type == "pitch"), gr.update(visible=transform_type == "echo"), gr.update(visible=transform_type == "echo"), gr.update(visible=transform_type == "reverb"), gr.update(visible=transform_type == "reverb"), gr.update(visible=transform_type == "noise"), gr.update(visible=transform_type == "noise"), ] # print("FFFF") transform_type.change( fn=update_transform_ui, inputs=[transform_type], outputs=[ speed_rate, pitch_steps, pitch_bins, echo_delay, echo_volume, reverb_rir, reverb_method, noise_file, noise_volume, ], ) # print("GGGG") btn = gr.Button("Run Attack") out_json = gr.JSON() out_audio = gr.Audio() # print("HHHH") btn.click( fn=process_attack, inputs=[ prompt, target_model, tts_provider, tts_model, voice_id, transform_type, speed_rate, pitch_steps, pitch_bins, echo_delay, echo_volume, reverb_rir, reverb_method, noise_file, noise_volume, ], outputs=[out_json, out_audio], ) return demo if __name__ == "__main__": demo = create_redteam_demo() demo.launch(share=True)