import io
import os
import tempfile
import time

import gradio as gr
import librosa
import numpy as np
import requests
import soundfile as sf
import spaces
import torch
from huggingface_hub import login
from transformers import AutoModel
# Get the Hugging Face token from the environment and log in; fail fast if it is missing.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("Hugging Face token not found in environment variables.")
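# On a Hugging Face Space, HF_TOKEN is typically provided as a repository secret
# (Settings -> Variables and secrets); when running locally, export it in your shell first.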
# Download a reference audio file from a URL and decode it with soundfile.
def load_audio_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
        return sample_rate, audio_data
    return None, None
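# Note: sf.read decodes to a float64 NumPy array by default, and a failed download
# yields (None, None), which the preloading loop further down checks for.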
# Pick the GPU if one is visible, otherwise fall back to the CPU.
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device
# Resample audio to the target rate; return it unchanged if the rates already match.
def resample_audio(audio, orig_sr, target_sr):
    if orig_sr != target_sr:
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    return audio
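# Recent librosa releases (0.10+) require orig_sr/target_sr to be passed as
# keyword arguments, as done above.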
@spaces.GPU  # Request a GPU for this call when the app runs on a ZeroGPU Space.
def synthesize_speech(text, ref_audio, ref_text):
    if ref_audio is None or ref_text.strip() == "":
        raise gr.Error("Please provide a reference audio and its corresponding text.")

    # Gradio's numpy-typed Audio component yields a (sample_rate, data) tuple.
    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # Write the reference audio to a temporary WAV file for the model.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format="WAV")
        temp_audio.flush()

        # Run inference and profile how long it takes.
        start_time = time.time()
        audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
        end_time = time.time()
        print(f"Inference time: {end_time - start_time:.2f} seconds")

    # Clean up the temporary reference file.
    os.remove(temp_audio.name)

    # Normalize int16 output to float32 in [-1, 1].
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # Resample the generated audio to match the reference audio's sample rate.
    audio = resample_audio(audio, orig_sr=24000, target_sr=sample_rate)
    return sample_rate, audio
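# A minimal standalone usage sketch (outside Gradio), assuming a local reference
# recording `ref.wav` and its transcript; the file names here are only illustrative:
#
#     sr, wav = sf.read("ref.wav")
#     out_sr, out = synthesize_speech("ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു.", (sr, wav), "<reference transcript>")
#     sf.write("generated.wav", out, out_sr)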
# Load the IndicF5 TTS model and move it to the appropriate device (GPU/CPU).
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
device = get_device()
model = model.to(device)
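# Note: trust_remote_code=True executes model code shipped in the ai4bharat/IndicF5
# repository; pin from_pretrained to a specific revision if reproducibility matters.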
# Example Malayalam reference data. English glosses of the prompts are given in comments.
EXAMPLES = [
    {
        "audio_name": "Aparna Voice",
        "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/Aparna%20Voice.wav",
        # "I am looking at a phone cover. I need a cover for a smartphone."
        "ref_text": " ഞാൻ ഒരു ഫോണിന്റെ കവർ നോക്കുകയാണ്. എനിക്ക് സ്മാർട്ട് ഫോണിന് കവർ വേണം",
        # "I can speak Malayalam."
        "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
    },
    {
        "audio_name": "KC Voice",
        "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
        # Roughly: "Hello, it's Jagadeep calling. Are you free to talk now?"
        "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
        "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
    },
    # Add more examples here if needed.
]
# Preload all example audios at startup; fail loudly if a download breaks.
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if audio_data is None:
        raise RuntimeError(f"Failed to load reference audio from {example['audio_url']}")
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
# Define the Gradio interface.
with gr.Blocks() as iface:
    gr.Markdown(
        """
# **Text-to-Speech for Malayalam**

Use [**IndicF5**](https://huggingface.co/ai4bharat/IndicF5), a **Text-to-Speech (TTS)** model, to generate Malayalam speech.
        """
    )
    with gr.Row():
        with gr.Column():
            # Text to synthesize.
            text_input = gr.Textbox(label="Text to Synthesize (Malayalam)", placeholder="Enter Malayalam text...", lines=3)
            # Reference audio input.
            ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
            # Reference text input.
            ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio (Malayalam)", placeholder="Enter the transcript in Malayalam...", lines=2)
            # Submit button.
            submit_btn = gr.Button("🎤 Generate Malayalam Speech", variant="primary")
        with gr.Column():
            # Output audio of the generated speech.
            output_audio = gr.Audio(label="Generated Speech (Malayalam)", type="numpy")
            # Dropdown to select a preloaded reference voice.
            audio_name_input = gr.Dropdown(
                label="Select Reference Audio",
                choices=[ex["audio_name"] for ex in EXAMPLES],
                type="value",  # The component's value is the audio name itself.
            )
    # Fill the reference audio and transcript fields from the selected example.
    def update_reference_audio(selected_audio_name):
        selected_example = next(ex for ex in EXAMPLES if ex["audio_name"] == selected_audio_name)
        ref_audio = (selected_example["sample_rate"], selected_example["audio_data"])
        ref_text = selected_example["ref_text"]
        return ref_audio, ref_text
    # When a reference voice is picked, update `ref_audio_input` and `ref_text_input`.
    audio_name_input.change(
        update_reference_audio,
        inputs=[audio_name_input],
        outputs=[ref_audio_input, ref_text_input],
    )

    # Wire the submit button to the synthesis function.
    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio],
    )
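# When running locally, iface.launch(share=True) creates a temporary public link;
# on a Hugging Face Space the plain launch() below is all that is needed.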
iface.launch()