Spaces:
Sleeping
Sleeping
File size: 14,080 Bytes
b2c4ab7 c1fa46d b2c4ab7 c4f9f48 c1fa46d a7ffad5 b2c4ab7 6d45ca0 da460a1 c1fa46d da460a1 6d45ca0 da460a1 e392e54 b2c4ab7 c1fa46d 6d45ca0 c1fa46d 6d45ca0 c1fa46d e392e54 b2c4ab7 6d45ca0 b2c4ab7 a7ffad5 e392e54 b2c4ab7 120ea4f 6d45ca0 c1fa46d b2c4ab7 c1fa46d b2c4ab7 a7ffad5 e392e54 c1fa46d 6d45ca0 b2c4ab7 6d45ca0 a7ffad5 6d45ca0 a7ffad5 b2c4ab7 e392e54 c1fa46d 6d45ca0 b2c4ab7 c1fa46d b2c4ab7 c1fa46d a7ffad5 e392e54 c1fa46d 6d45ca0 a7ffad5 c1fa46d a7ffad5 c1fa46d b2c4ab7 c1fa46d b2c4ab7 a7ffad5 c1fa46d c4f9f48 b2c4ab7 c4f9f48 b2c4ab7 a7ffad5 c1fa46d b2c4ab7 e392e54 a7ffad5 c4f9f48 a7ffad5 c4f9f48 a7ffad5 c4f9f48 b2c4ab7 e392e54 c1fa46d c4f9f48 a7ffad5 e392e54 6ac2cb8 e392e54 b2c4ab7 e392e54 b2c4ab7 a7ffad5 6ac2cb8 c1fa46d 6ac2cb8 a7ffad5 c1fa46d b2c4ab7 a7ffad5 c4f9f48 a7ffad5 b2c4ab7 c1fa46d b2c4ab7 e392e54 b2c4ab7 c1fa46d b2c4ab7 a7ffad5 b2c4ab7 c1fa46d b2c4ab7 c1fa46d a7ffad5 6ac2cb8 a7ffad5 c1fa46d c4f9f48 a7ffad5 c4f9f48 a7ffad5 6ac2cb8 a7ffad5 c4f9f48 a7ffad5 b2c4ab7 a7ffad5 e392e54 a7ffad5 c1fa46d b2c4ab7 e392e54 a895c00 c1fa46d 6d45ca0 a895c00 6ac2cb8 b2c4ab7 6ac2cb8 a7ffad5 6ac2cb8 a7ffad5 6ac2cb8 c1fa46d b2c4ab7 6ac2cb8 c4f9f48 a7ffad5 e392e54 6d45ca0 6ac2cb8 b2c4ab7 6ac2cb8 a7ffad5 b2c4ab7 6ac2cb8 b2c4ab7 c1fa46d b2c4ab7 6ac2cb8 a7ffad5 b2c4ab7 6ac2cb8 b2c4ab7 e392e54 a7ffad5 b2c4ab7 6ac2cb8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 |
import gradio as gr
import torch
import numpy as np
import re
import soundfile as sf
import tempfile
import os
import nltk
from nltk.tokenize import sent_tokenize
import warnings
import time
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
warnings.filterwarnings("ignore")
# Sentence splitting relies on NLTK tokenizer data ('punkt' and 'punkt_tab').
# Probe for each resource; if any lookup fails, fetch both quietly.
try:
    for _resource in ('tokenizers/punkt', 'tokenizers/punkt_tab'):
        nltk.data.find(_resource)
except LookupError:
    nltk.download(['punkt', 'punkt_tab'], quiet=True)
class LongFormTTS:
    """Long-form text-to-speech pipeline built on Microsoft SpeechT5.

    ``generate_long_speech`` is the entry point: it normalizes the text,
    splits it into chunks of bounded length, synthesizes every chunk and
    joins the audio with short pauses.

    Attributes set by ``__init__``:
        processor, model, vocoder: objects loaded with ``from_pretrained``.
        speakers: display name -> x-vector speaker embedding.
        speaker_ids: the display names, in insertion order.
        device: ``"cuda"`` when available, otherwise ``"cpu"``.
    """

    # Rows of the x-vector dataset that are offered as voices.
    _SPEAKER_ROWS = (7306, 7339, 7341, 7345, 7367, 7422)

    # Abbreviation -> spoken form, applied in this order.
    _ABBREVIATIONS = {
        'Dr.': 'Doctor',
        'Mr.': 'Mister',
        'Mrs.': 'Missus',
        'Ms.': 'Miss',
        'Prof.': 'Professor',
        'etc.': 'etcetera',
        'vs.': 'versus',
        'e.g.': 'for example',
        'i.e.': 'that is',
        'St.': 'Street',
        'Ave.': 'Avenue',
        'Blvd.': 'Boulevard',
        'Inc.': 'Incorporated',
        'Corp.': 'Corporation',
        'Ltd.': 'Limited',
        'U.S.': 'United States',
        'U.K.': 'United Kingdom',
        'Ph.D.': 'PhD',
        'M.D.': 'MD',
    }

    _ONES = ("", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
    _TEENS = ("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
              "sixteen", "seventeen", "eighteen", "nineteen")
    _TENS = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")

    def __init__(self):
        """Load the models and speaker embeddings and move the models to the device.

        Raises:
            Exception: if anything fails while loading; the original error is
                chained as ``__cause__``.
        """
        print("π Loading TTS models...")
        try:
            # Load SpeechT5 - most reliable for HF Spaces
            print("Loading SpeechT5 TTS...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

            # Load speaker embeddings dataset
            print("Loading speaker embeddings...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

            # Store multiple speakers, keyed by the label shown in the UI.
            self.speakers = {
                f"Speaker {position} ({row})": embeddings_dataset[row]["xvector"]
                for position, row in enumerate(self._SPEAKER_ROWS, start=1)
            }
            self.speaker_ids = list(self.speakers.keys())

            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(self.device)
            self.vocoder = self.vocoder.to(self.device)
            print("✅ SpeechT5 loaded successfully!")
        except Exception as e:
            print(f"β Failed to load SpeechT5: {e}")
            raise Exception(f"TTS model loading failed: {e}") from e

    def preprocess_text(self, text):
        """Clean and prepare text for TTS.

        Steps, in order: collapse whitespace, expand known abbreviations,
        spell out standalone numbers of one to four digits, replace every
        character outside letters/digits/whitespace/basic punctuation with a
        space, and collapse whitespace again.

        Args:
            text: raw input string.

        Returns:
            The normalized string, stripped of leading/trailing whitespace.
        """
        text = re.sub(r'\s+', ' ', text.strip())

        for abbr, full in self._ABBREVIATIONS.items():
            # The lookbehind keeps an abbreviation from matching in the middle
            # of a longer token (e.g. the "Ms." at the end of "LLMs.").
            text = re.sub(r'(?<!\w)' + re.escape(abbr), full, text)

        text = re.sub(r'\b(\d{1,4})\b', lambda m: self.number_to_words(int(m.group())), text)
        text = re.sub(r'[^\w\s\.,!?;:\-\(\)\'"]', ' ', text)
        # Removing symbols can leave runs of spaces behind; collapse them.
        return re.sub(r'\s+', ' ', text).strip()

    def number_to_words(self, num):
        """Spell out an integer from 0 to 9999 in English words.

        Args:
            num: the integer to convert.

        Returns:
            The words separated by single spaces (e.g. 123 ->
            "one hundred twenty three"). Values outside 0..9999 are returned
            as their decimal string.
        """
        if num == 0:
            return "zero"
        if num < 0 or num > 9999:
            return str(num)
        if num < 10:
            return self._ONES[num]
        if num < 20:
            return self._TEENS[num - 10]
        if num < 100:
            tens_digit, unit = divmod(num, 10)
            words = self._TENS[tens_digit]
            return words if unit == 0 else f"{words} {self._ONES[unit]}"
        if num < 1000:
            hundreds, rest = divmod(num, 100)
            words = f"{self._ONES[hundreds]} hundred"
            # Only append the remainder when there is one, so 100 does not
            # become "one hundred zero".
            return words if rest == 0 else f"{words} {self.number_to_words(rest)}"
        thousands, rest = divmod(num, 1000)
        words = f"{self.number_to_words(thousands)} thousand"
        return words if rest == 0 else f"{words} {self.number_to_words(rest)}"

    @staticmethod
    def _append_piece(chunks, pending, piece, max_length):
        """Join ``piece`` onto ``pending``, flushing ``pending`` into ``chunks`` first
        when the joined text would exceed ``max_length``; return the new pending text."""
        if not pending:
            return piece
        candidate = f"{pending} {piece}"
        if len(candidate) > max_length:
            chunks.append(pending)
            return piece
        return candidate

    def chunk_text(self, text, max_length=400):
        """Split text into manageable chunks.

        Sentences (from ``sent_tokenize``) are packed greedily into chunks of
        at most ``max_length`` characters. A sentence longer than
        ``max_length`` starts a fresh chunk and is packed word by word; a
        single word longer than ``max_length`` is kept whole as its own chunk.

        Args:
            text: the text to split.
            max_length: maximum characters per chunk.

        Returns:
            A list of non-empty chunk strings.
        """
        chunks = []
        current_chunk = ""
        for sentence in sent_tokenize(text):
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(sentence) > max_length:
                # An oversize sentence never shares a chunk with what precedes it.
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = ""
                for word in sentence.split():
                    current_chunk = self._append_piece(chunks, current_chunk, word, max_length)
            else:
                current_chunk = self._append_piece(chunks, current_chunk, sentence, max_length)
        if current_chunk:
            chunks.append(current_chunk.strip())
        return [chunk for chunk in chunks if chunk.strip()]

    def generate_speech_chunk(self, text_chunk, speaker_embedding):
        """Generate speech for a single chunk.

        Args:
            text_chunk: the text to synthesize.
            speaker_embedding: speaker embedding values; converted to a tensor
                and given a leading dimension before being passed to the model.

        Returns:
            The result of ``generate_speech`` (moved to CPU and converted to a
            NumPy array when it is a tensor), or ``None`` when synthesis
            raised; the error and the chunk text are printed.
        """
        try:
            inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
            embedding = torch.tensor(speaker_embedding).unsqueeze(0).to(self.device)
            with torch.no_grad():
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    embedding,
                    vocoder=self.vocoder,
                )
            if isinstance(speech, torch.Tensor):
                speech = speech.cpu().numpy()
            return speech
        except Exception as e:
            # Best effort: one failing chunk must not abort the whole text.
            print(f"Error generating speech for chunk: {e}")
            print(f"Chunk text: {text_chunk}")
            return None

    def generate_long_speech(self, text, speaker_id=None, progress_callback=None):
        """Generate speech for long text.

        Args:
            text: raw input text.
            speaker_id: key into ``self.speakers``; a falsy value selects the
                first speaker.
            progress_callback: optional callable receiving one status string
                per chunk.

        Returns:
            ``(audio, sample_rate)`` where ``audio`` is a 1-D array
            peak-normalized to 0.95, or ``(None, None)`` when there was
            nothing to synthesize or every chunk failed.

        Raises:
            KeyError: if ``speaker_id`` is not a known speaker.
        """
        processed_text = self.preprocess_text(text)
        print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
        chunks = self.chunk_text(processed_text)
        print(f"Split into {len(chunks)} chunks")
        if not chunks:
            return None, None

        # Hard-coded output rate; presumably matches the vocoder - confirm.
        sample_rate = 16000
        speaker_embedding = self.speakers[speaker_id or self.speaker_ids[0]]
        # 0.4 s of silence inserted after every synthesized chunk.
        silence = np.zeros(int(0.4 * sample_rate))

        # Generate speech for each chunk
        audio_segments = []
        total = len(chunks)
        for i, chunk in enumerate(chunks):
            if progress_callback:
                progress_callback(f"Processing chunk {i+1}/{total}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
            print(f"Processing chunk {i+1}: {chunk}")
            audio_chunk = self.generate_speech_chunk(chunk, speaker_embedding)
            if audio_chunk is None or len(audio_chunk) == 0:
                continue
            if len(audio_chunk.shape) > 1:
                # Collapse a multi-dimensional result to 1-D by averaging axis 0.
                audio_chunk = np.mean(audio_chunk, axis=0)
            audio_segments.append(audio_chunk)
            audio_segments.append(silence)

        if not audio_segments:
            return None, None

        final_audio = np.concatenate(audio_segments)
        max_val = np.max(np.abs(final_audio))
        if max_val > 0:
            final_audio = final_audio / max_val * 0.95
        return final_audio, sample_rate
# Global TTS system
# Built once at import time. On failure the module still loads and
# `tts_system` is None, which the interface code checks for.
print("π Initializing TTS system...")
try:
    tts_system = LongFormTTS()
except Exception as e:
    print(f"β TTS initialization failed: {e}")
    tts_system = None
else:
    print("✅ TTS system ready!")
def text_to_speech_interface(text, speaker="Speaker 1 (7306)", progress=gr.Progress()):
    """Main Gradio interface function.

    Validates the input, runs the global ``tts_system`` on it and writes the
    resulting audio to a temporary file.

    Args:
        text: text to synthesize; must be non-blank and at most 50,000
            characters.
        speaker: speaker label passed through to ``generate_long_speech``.
        progress: Gradio progress tracker, called with a fraction and ``desc``.

    Returns:
        ``(audio_path, status_message)``. ``audio_path`` is ``None`` whenever
        no audio was produced; errors are reported in the message rather than
        raised.
    """
    if tts_system is None:
        return None, "β TTS system is not available. Please check the logs."
    if not text or not text.strip():
        return None, "β οΈ Please enter some text to convert to speech."
    if len(text) > 50000:
        return None, "β οΈ Text is too long. Please keep it under 50,000 characters."

    def progress_callback(message):
        # Per-chunk updates only change the description; the bar stays at 50%.
        progress(0.5, desc=message)

    try:
        progress(0.1, desc="π Starting text-to-speech conversion...")
        audio, sample_rate = tts_system.generate_long_speech(text, speaker, progress_callback)
        if audio is None or len(audio) == 0:
            return None, "β Failed to generate audio."

        progress(0.9, desc="πΎ Saving audio file...")
        # Reserve a unique path, then close the handle before writing:
        # reopening a still-open NamedTemporaryFile by name is not portable.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            audio_path = tmp_file.name
        try:
            # NOTE(review): the output format follows the ".mp3" suffix;
            # confirm the installed soundfile/libsndfile can encode MP3.
            sf.write(audio_path, audio, sample_rate)
        except Exception:
            # delete=False means nobody else cleans this up on failure.
            if os.path.exists(audio_path):
                os.remove(audio_path)
            raise

        progress(1.0, desc="✅ Complete!")
        duration = len(audio) / sample_rate
        return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        error_msg = f"β Error: {str(e)}"
        print(f"TTS Error: {e}")
        return None, error_msg
# Create Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI and return it (not launched).

    The page shows a header, a status banner that depends on whether the
    global ``tts_system`` loaded, the text/voice inputs with a live character
    counter, a feature list, the status/audio outputs and a few examples.
    """
    # Static markup is kept in locals so the layout below stays readable.
    page_css = """
.main-header {
text-align: center;
margin-bottom: 2rem;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
"""
    header_html = """
<div class="main-header">
<h1>π€ Long-Form Text-to-Speech Generator</h1>
<p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
</div>
"""
    ready_html = """
<div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
<h4>π’ System Ready</h4>
<p>Using <strong>Microsoft SpeechT5</strong> - High quality neural text-to-speech</p>
</div>
"""
    error_html = """
<div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #dc3545; background: #f8d7da;">
<h4>π΄ System Error</h4>
<p>TTS system failed to initialize. Please refresh the page.</p>
</div>
"""
    features_html = """
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1.5rem; border-radius: 15px; margin: 1rem 0; box-shadow: 0 4px 15px rgba(0,0,0,0.1);">
<h3>β¨ Key Features</h3>
<ul style="margin: 0; padding-left: 1.2em;">
<li>π Handles long texts</li>
<li>π Multiple human voices</li>
<li>β‘ Smart text processing</li>
<li>π§ Auto chunking</li>
<li>π΅ Natural-sounding speech</li>
<li>π MP3 audio output</li>
</ul>
</div>
"""
    sample_rows = [
        ["Hello! Welcome to our advanced text-to-speech system.", "Speaker 1 (7306)"],
        ["The quick brown fox jumps over the lazy dog.", "Speaker 2 (7339)"],
        ["Artificial intelligence has revolutionized many aspects of our lives.", "Speaker 3 (7341)"],
    ]

    # Voices are only known when the TTS system loaded.
    voices = tts_system.speaker_ids if tts_system else []
    default_voice = voices[0] if voices else None

    def update_char_count(text):
        # Green while within the 50,000-character limit, red beyond it.
        n_chars = len(text or "")
        if n_chars <= 50000:
            color = "#28a745"
        else:
            color = "#dc3545"
        return f'<span style="color: {color};">Character count: {n_chars:,} / 50,000</span>'

    # NOTE(review): container nesting was reconstructed from a flattened
    # source - confirm the status/audio outputs belong below the row.
    with gr.Blocks(
        title="π€ Long-Form Text-to-Speech",
        theme=gr.themes.Soft(),
        css=page_css,
    ) as demo:
        gr.HTML(header_html)

        # System status
        gr.HTML(ready_html if tts_system else error_html)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="π Enter Your Text",
                    placeholder="Type or paste your text here... (Max 50,000 characters)",
                    lines=10,
                    max_lines=20,
                    info="Supports any length text with automatic chunking for optimal quality",
                )
                char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                speaker_dropdown = gr.Dropdown(
                    choices=voices,
                    value=default_voice,
                    label="π£οΈ Choose Voice",
                )
                generate_btn = gr.Button("π― Generate Speech", variant="primary", size="lg", scale=1)
            with gr.Column(scale=1):
                gr.HTML(features_html)

        status_output = gr.Textbox(
            label="π Status",
            interactive=False,
            value="Ready to generate speech! Enter some text above.",
        )
        audio_output = gr.Audio(label="π Generated Speech", type="filepath", show_download_button=True)

        # Event wiring: live counter on edit, synthesis on click.
        text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
        generate_btn.click(
            fn=text_to_speech_interface,
            inputs=[text_input, speaker_dropdown],
            outputs=[audio_output, status_output],
            show_progress=True,
        )

        gr.Examples(
            examples=sample_rows,
            inputs=[text_input, speaker_dropdown],
            label="π Try These Examples",
        )
    return demo
# Launch the application
# Only when run as a script: build the UI and serve it on all interfaces,
# port 7860, with a public share link requested.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)