import os
import io
import torch
import librosa
import requests
import tempfile
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import AutoModel
import spaces
import time
from huggingface_hub import login

# Authenticate with the Hugging Face Hub; the token is read from the HF_TOKEN
# environment variable and is required to download the model weights
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("Hugging Face token not found in the HF_TOKEN environment variable.")
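
# HF_TOKEN is typically configured as a repository secret on Hugging Face
# Spaces; for a local run it can be exported before launching the app, e.g.:
#   export HF_TOKEN=<your-token>   # placeholder value, not a real token
#   python app.py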

# Load reference audio from a URL; returns (sample_rate, audio_data),
# or (None, None) if the download fails
def load_audio_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
        return sample_rate, audio_data
    return None, None
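
# Usage sketch (the URL below is illustrative only):
#   sr, data = load_audio_from_url("https://example.com/sample.wav")
#   if sr is None:
#       ...  # handle the failed download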

# Function to check and use GPU
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

# Resampling function to match sample rates
def resample_audio(audio, orig_sr, target_sr):
    if orig_sr != target_sr:
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    return audio
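
# For example, resample_audio(audio, orig_sr=24000, target_sr=44100) upsamples
# 24 kHz model output to 44.1 kHz; when the rates already match, the input is
# returned unchanged.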

@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    if ref_audio is None or ref_text.strip() == "":
        raise gr.Error("Please provide a reference audio and its corresponding text.")

    # Gradio's numpy-type Audio component passes a (sample_rate, numpy_array) tuple
    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # Save the reference audio to a temporary WAV file, since the model expects a file path
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
        temp_audio.flush()

    # Profile the model inference time
    start_time = time.time()
    audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
    end_time = time.time()
    print(f"Inference time: {end_time - start_time:.2f} seconds")

    # Clean up the temporary reference file
    os.remove(temp_audio.name)

    # Normalize int16 output to float32 in [-1.0, 1.0)
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # The model generates audio at 24 kHz; resample it to match the reference audio's sample rate
    audio = resample_audio(audio, orig_sr=24000, target_sr=sample_rate)

    return sample_rate, audio

# Load the IndicF5 TTS model and move it to the appropriate device (GPU/CPU)
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
device = get_device()
model = model.to(device)
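
# Note: on ZeroGPU Spaces the GPU is only attached inside functions decorated
# with @spaces.GPU, so get_device() may report CPU at startup even though
# inference itself runs on a GPU.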

# Example Malayalam reference data (English glosses in the comments are
# approximate translations)
EXAMPLES = [
    {
        "audio_name": "Aparna Voice",
        "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/Aparna%20Voice.wav",
        # ref_text: "I am looking at a phone cover. I need a cover for a smartphone."
        "ref_text": " ഞാൻ ഒരു ഫോണിന്‍റെ കവർ നോക്കുകയാണ്. എനിക്ക് സ്മാർട്ട് ഫോണിന് കവർ വേണം",
        # synth_text: "I can speak Malayalam."
        "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
    },
    {
        "audio_name": "KC Voice",
        "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
        # ref_text: "Hello, isn't this Aparna? This is Jagadeep calling. Are you free to talk now?"
        "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
        # synth_text: "I can speak Malayalam."
        "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
    },
    # Add more examples here if needed
]

# Preload all example audios (a failed download leaves sample_rate and
# audio_data as None)
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data

# Define Gradio interface
with gr.Blocks() as iface:
    gr.Markdown(
        """
        # **Text-to-Speech for Malayalam**
        [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
        Use **IndicF5**, a **Text-to-Speech (TTS)** model, to generate Malayalam speech.
        """
    )

    with gr.Row():
        with gr.Column():
            # Text to synthesize
            text_input = gr.Textbox(label="Text to Synthesize (Malayalam)", placeholder="Enter Malayalam text...", lines=3)
            # Reference audio input
            ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
            # Reference text input
            ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio (Malayalam)", placeholder="Enter the transcript in Malayalam...", lines=2)
            # Submit button
            submit_btn = gr.Button("🎤 Generate Malayalam Speech", variant="primary")

        with gr.Column():
            # Output audio of generated speech
            output_audio = gr.Audio(label="Generated Speech (Malayalam)", type="numpy")

    # Dropdown to select audio name
    audio_name_input = gr.Dropdown(
        label="Select Reference Audio",
        choices=[ex["audio_name"] for ex in EXAMPLES],
        type="value"  # The value will be the audio name
    )

    # Update the reference audio and text fields based on the selected audio name
    def update_reference_audio(selected_audio_name):
        # Look up the selected example by name; fall back gracefully if none matches
        selected_example = next((ex for ex in EXAMPLES if ex["audio_name"] == selected_audio_name), None)
        if selected_example is None:
            return None, ""
        ref_audio = (selected_example["sample_rate"], selected_example["audio_data"])
        ref_text = selected_example["ref_text"]
        return ref_audio, ref_text

    # Use `audio_name_input` to update `ref_audio_input` and `ref_text_input`
    audio_name_input.change(
        update_reference_audio,
        inputs=[audio_name_input],
        outputs=[ref_audio_input, ref_text_input]
    )

    # Set the click event for the button
    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio]
    )

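# iface.launch() serves the app locally; passing share=True would additionally
# create a temporary public link when running outside Spaces.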
iface.launch()