Commit b36d167
Parent(s): 7e40d92

added code

Files changed:
- .gitattributes +1 -0
- .gitignore +1 -0
- __pycache__/helper.cpython-312.pyc +0 -0
- anushka.wav +3 -0
- app.py +96 -0
- helper.py +106 -0
- traffic.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+/.venv
__pycache__/helper.cpython-312.pyc ADDED
Binary file (3.25 kB)
anushka.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79d0cfa2385223555e11093ade0c9dcabe1932171318afcd70479e00176026cb
+size 192780
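Both .wav files in this commit are stored as Git LFS pointers rather than raw audio; that is what the new *.wav rule in .gitattributes accomplishes. As a minimal sketch of the pointer format shown above, a hypothetical parse_lfs_pointer helper (not part of this commit) could read the key/value lines like so:

# Hypothetical helper, not part of this commit: parses the key/value lines
# of a Git LFS pointer file (version, oid, size) as shown above.
def parse_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# parse_lfs_pointer("anushka.wav")
# -> {'version': 'https://git-lfs.github.com/spec/v1',
#     'oid': 'sha256:79d0cfa2...', 'size': '192780'}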
app.py ADDED
@@ -0,0 +1,96 @@
+import gradio as gr
+from helper import process_audio
+import os
+import numpy as np
+
+# Sample audio file paths
+SAMPLE_SPEECH = "anushka.wav"
+SAMPLE_NOISE = "traffic.wav"
+
+def process_audio_files(speech_file, noise_file, alpha, beta):
+    """
+    Process the audio files and return the mixed output
+
+    Args:
+        speech_file (tuple): Speech audio (sample_rate, data)
+        noise_file (tuple): Noise audio (sample_rate, data)
+        alpha (float): First slider value (-30 to +30)
+        beta (float): Second slider value (-30 to +30)
+
+    Returns:
+        tuple: (sample_rate, processed_audio_data)
+    """
+    speech_sr, speech_data = speech_file
+    noise_sr, noise_data = noise_file
+
+    # Process the audio using the helper function
+    output_audio = process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta)
+
+    # Convert AudioSegment to numpy array
+    samples = np.array(output_audio.get_array_of_samples())
+
+    return (output_audio.frame_rate, samples)
+
+
+# Create the Gradio interface
+
+
+with gr.Blocks() as app:
+    gr.Markdown("# Audio Mixing Application")
+
+    with gr.Row():
+        with gr.Column():
+            # Input components
+            speech_input = gr.Audio(
+                label="Speech Audio",
+                type="numpy"
+            )
+            noise_input = gr.Audio(
+                label="Noise Audio",
+                type="numpy"
+            )
+
+            # Sample audio examples
+            gr.Examples(
+                examples=[[SAMPLE_SPEECH, SAMPLE_NOISE]],
+                inputs=[speech_input, noise_input],
+                label="Sample Audio Files"
+            )
+
+            # Slider controls
+            alpha_slider = gr.Slider(
+                minimum=-30,
+                maximum=30,
+                value=0,
+                step=1,
+                label="Alpha (dB)",
+                info="Adjust alpha from -30 to +30 dB"
+            )
+            beta_slider = gr.Slider(
+                minimum=-30,
+                maximum=30,
+                value=0,
+                step=1,
+                label="Beta (dB)",
+                info="Adjust beta from -30 to +30 dB"
+            )
+
+            # Submit button
+            submit_btn = gr.Button("Process Audio")
+
+        with gr.Column():
+            # Output audio player
+            output_audio = gr.Audio(
+                label="Mixed Audio",
+                type="numpy"
+            )
+
+    # Connect the components
+    submit_btn.click(
+        fn=process_audio_files,
+        inputs=[speech_input, noise_input, alpha_slider, beta_slider],
+        outputs=output_audio
+    )
+
+if __name__ == "__main__":
+    app.launch()
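For reference, process_audio_files can be exercised without launching the UI by passing in the same (sample_rate, data) tuples that Gradio's numpy-mode audio components produce. A minimal sketch, assuming scipy is installed, the sample files have been pulled from LFS, and the inputs are mono 16-bit (which the channels=1 conversion in helper.py expects); this test script is not part of the commit:

# Hypothetical smoke test, not part of this commit.
from scipy.io import wavfile
from app import process_audio_files, SAMPLE_SPEECH, SAMPLE_NOISE

speech = wavfile.read(SAMPLE_SPEECH)   # -> (sample_rate, int16 ndarray)
noise = wavfile.read(SAMPLE_NOISE)

sr, mixed = process_audio_files(speech, noise, alpha=0, beta=-5)
wavfile.write("mixed.wav", sr, mixed)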
helper.py ADDED
@@ -0,0 +1,106 @@
+import random
+from pydub import AudioSegment
+# from pydub.effects import normalize
+import numpy as np
+
+
+def get_audio_volume_db(audio):
+    """Estimate the volume in dBFS (decibels relative to full scale) using PyDub."""
+    return audio.dBFS if audio.dBFS != float('-inf') else -50.0  # Default to -50 dB for silence
+
+
+def adjust_volume(audio, volume_change_db):
+    """Adjusts the volume of an AudioSegment."""
+    return audio + volume_change_db
+
+
+# def compress_audio(audio):
+#     """Apply compression to normalize speech volume."""
+#     return normalize(audio)
+
+
+def place_in_stereo(audio, pan_value):
+    """Places audio in stereo field (-1.0 = full left, 1.0 = full right)."""
+    return audio.pan(pan_value)
+
+
+def overlay_audio(speech_audio, noise_audio):
+    """Overlays speech and noise using PyDub."""
+    return speech_audio.overlay(noise_audio)
+
+
+def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
+    """
+    Process speech and noise audio data to create a mixed audio output.
+
+    Args:
+        speech_data (numpy.ndarray): Speech audio data
+        noise_data (numpy.ndarray): Noise audio data
+        speech_sr (int): Speech sample rate
+        noise_sr (int): Noise sample rate
+        alpha (float): Speech volume adjustment
+        beta (float): Noise volume adjustment
+
+    Returns:
+        AudioSegment: Processed audio
+    """
+    # Convert numpy arrays to AudioSegment
+    speech_audio = AudioSegment(
+        speech_data.tobytes(),
+        frame_rate=speech_sr,
+        sample_width=speech_data.dtype.itemsize,
+        channels=1
+    )
+
+    noise_audio = AudioSegment(
+        noise_data.tobytes(),
+        frame_rate=noise_sr,
+        sample_width=noise_data.dtype.itemsize,
+        channels=1
+    )
+
+    # Get speech duration
+    speech_duration = len(speech_audio) / 1000.0  # Convert ms to sec
+
+    # Cut noise segment
+    if len(noise_audio) / 1000.0 <= speech_duration:
+        trimmed_noise = noise_audio
+    else:
+        start_time = random.uniform(0, len(noise_audio) / 1000.0 - speech_duration) * 1000
+        trimmed_noise = noise_audio[start_time:start_time + (speech_duration * 1000)]
+
+    trimmed_noise = trimmed_noise.set_frame_rate(8000)
+
+    # Calculate volumes and adjustments
+    speech_vol = get_audio_volume_db(speech_audio)
+    noise_vol = get_audio_volume_db(trimmed_noise)
+
+    current_snr = speech_vol - noise_vol
+    adjustment_needed = 10 - current_snr  # target_snr hardcoded to 10
+
+    if adjustment_needed > 0:  # Speech too quiet
+        speech_adjust = min(adjustment_needed, 2)
+        noise_adjust = -min(adjustment_needed / 2, 5)
+    else:  # Speech too loud
+        speech_adjust = max(adjustment_needed, -5)
+        noise_adjust = -5 / 2
+
+    # Apply adjustments
+    adjusted_speech = adjust_volume(speech_audio, speech_adjust + alpha)
+    adjusted_noise = adjust_volume(trimmed_noise, noise_adjust + beta)
+
+    final_audio = overlay_audio(adjusted_speech, adjusted_noise)
+
+    return final_audio
+
+
+# final_audio = process_audio("anushka.wav", "traffic.wav")
+# # Single write operation at the end
+# final_audio.export("output-traffic.wav", format="wav")
+
+# print("Processing complete. Check output.wav!")
+
+
+# -18, -20 for office
+# -13 , -20 for market
+# -18, -20 for traffic
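The mixing logic in process_audio aims for a fixed 10 dB speech-to-noise gap but caps the corrections (speech boosted by at most 2 dB, noise cut by at most 5 dB), so the target is not always reached; alpha and beta are then applied on top as manual offsets. A worked example of just that arithmetic, with illustrative dBFS values:

# Illustrative values only: speech at -20 dBFS, noise at -15 dBFS.
speech_vol, noise_vol = -20.0, -15.0
current_snr = speech_vol - noise_vol        # -5 dB
adjustment_needed = 10 - current_snr        # 15 dB below the 10 dB target

if adjustment_needed > 0:                   # speech too quiet
    speech_adjust = min(adjustment_needed, 2)      # +2 dB cap on speech boost
    noise_adjust = -min(adjustment_needed / 2, 5)  # -5 dB cap on noise cut
else:                                       # speech too loud
    speech_adjust = max(adjustment_needed, -5)
    noise_adjust = -5 / 2

print(speech_adjust, noise_adjust)  # prints: 2 -5, so the resulting SNR is -5 + 2 + 5 = 2 dB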
traffic.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fcab96844aaeebbdbd40b0d39df8689edefda2bd05ab6a44b74e5d96e7b852d
+size 24178236