import io
import os
import tempfile
import time

import gradio as gr
import librosa
import numpy as np
import requests
import soundfile as sf
import spaces
import torch
from huggingface_hub import login
from transformers import AutoModel
# Get the Hugging Face token from the environment and log in; fail fast if it is missing.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("Hugging Face token not found in environment variables.")
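# On a Hugging Face Space, HF_TOKEN is typically provided as a repository secret
# (Settings -> Variables and secrets); when running locally, export it in your shell first.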
# Download a reference audio file from a URL and decode it with soundfile.
def load_audio_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
        return sample_rate, audio_data
    return None, None
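# Note: sf.read decodes to a float64 NumPy array by default, and a failed download
# yields (None, None), which the preloading loop further down checks for.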
# Pick the GPU if one is visible, otherwise fall back to the CPU.
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device
# Resample audio to the target rate; return it unchanged if the rates already match.
def resample_audio(audio, orig_sr, target_sr):
    if orig_sr != target_sr:
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    return audio
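# Recent librosa releases (0.10+) require orig_sr/target_sr to be passed as
# keyword arguments, as done above.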
@spaces.GPU  # Request a GPU for this call when the app runs on a ZeroGPU Space.
def synthesize_speech(text, ref_audio, ref_text):
    if ref_audio is None or ref_text.strip() == "":
        raise gr.Error("Please provide a reference audio and its corresponding text.")

    # Gradio's numpy-typed Audio component yields a (sample_rate, data) tuple.
    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # Write the reference audio to a temporary WAV file for the model.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format="WAV")
        temp_audio.flush()

        # Run inference and profile how long it takes.
        start_time = time.time()
        audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
        end_time = time.time()
        print(f"Inference time: {end_time - start_time:.2f} seconds")

    # Clean up the temporary reference file.
    os.remove(temp_audio.name)

    # Normalize int16 output to float32 in [-1, 1].
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # Resample the generated audio to match the reference audio's sample rate.
    audio = resample_audio(audio, orig_sr=24000, target_sr=sample_rate)
    return sample_rate, audio
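# A minimal standalone usage sketch (outside Gradio), assuming a local reference
# recording `ref.wav` and its transcript; the file names here are only illustrative:
#
#     sr, wav = sf.read("ref.wav")
#     out_sr, out = synthesize_speech("ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു.", (sr, wav), "<reference transcript>")
#     sf.write("generated.wav", out, out_sr)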
# Load the IndicF5 TTS model and move it to the appropriate device (GPU/CPU).
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
device = get_device()
model = model.to(device)
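# Note: trust_remote_code=True executes model code shipped in the ai4bharat/IndicF5
# repository; pin from_pretrained to a specific revision if reproducibility matters.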
# Example Malayalam reference data. English glosses of the prompts are given in comments.
EXAMPLES = [
    {
        "audio_name": "Aparna Voice",
        "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/Aparna%20Voice.wav",
        # "I am looking at a phone cover. I need a cover for a smartphone."
        "ref_text": " ഞാൻ ഒരു ഫോണിന്റെ കവർ നോക്കുകയാണ്. എനിക്ക് സ്മാർട്ട് ഫോണിന് കവർ വേണം",
        # "I can speak Malayalam."
        "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
    },
    {
        "audio_name": "KC Voice",
        "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
        # Roughly: "Hello, it's Jagadeep calling. Are you free to talk now?"
        "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
        "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
    },
    # Add more examples here if needed.
]
# Preload all example audios at startup; fail loudly if a download breaks.
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if audio_data is None:
        raise RuntimeError(f"Failed to load reference audio from {example['audio_url']}")
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
# Define the Gradio interface.
with gr.Blocks() as iface:
    gr.Markdown(
        """
# **Text-to-Speech for Malayalam**

Use [**IndicF5**](https://huggingface.co/ai4bharat/IndicF5), a **Text-to-Speech (TTS)** model, to generate Malayalam speech.
        """
    )
    with gr.Row():
        with gr.Column():
            # Text to synthesize.
            text_input = gr.Textbox(label="Text to Synthesize (Malayalam)", placeholder="Enter Malayalam text...", lines=3)
            # Reference audio input.
            ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
            # Reference text input.
            ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio (Malayalam)", placeholder="Enter the transcript in Malayalam...", lines=2)
            # Submit button.
            submit_btn = gr.Button("🎤 Generate Malayalam Speech", variant="primary")
        with gr.Column():
            # Output audio of the generated speech.
            output_audio = gr.Audio(label="Generated Speech (Malayalam)", type="numpy")
            # Dropdown to select a preloaded reference voice.
            audio_name_input = gr.Dropdown(
                label="Select Reference Audio",
                choices=[ex["audio_name"] for ex in EXAMPLES],
                type="value",  # The component's value is the audio name itself.
            )
    # Fill the reference audio and transcript fields from the selected example.
    def update_reference_audio(selected_audio_name):
        selected_example = next(ex for ex in EXAMPLES if ex["audio_name"] == selected_audio_name)
        ref_audio = (selected_example["sample_rate"], selected_example["audio_data"])
        ref_text = selected_example["ref_text"]
        return ref_audio, ref_text
    # When a reference voice is picked, update `ref_audio_input` and `ref_text_input`.
    audio_name_input.change(
        update_reference_audio,
        inputs=[audio_name_input],
        outputs=[ref_audio_input, ref_text_input],
    )

    # Wire the submit button to the synthesis function.
    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio],
    )
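# When running locally, iface.launch(share=True) creates a temporary public link;
# on a Hugging Face Space the plain launch() below is all that is needed.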
iface.launch()