import gradio as gr
import torch
import numpy as np
import re
from transformers import pipeline
import soundfile as sf
import io
import tempfile
import os
from pydub import AudioSegment
import nltk
from nltk.tokenize import sent_tokenize
import warnings
import time

warnings.filterwarnings("ignore")
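# The imports above assume a requirements.txt providing at least gradio, torch,
# transformers, numpy, soundfile, pydub, and nltk; the optional TTS package is only
# needed for the FastSpeech2 fallback used further down.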
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
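# Newer NLTK releases (3.9+) ship the punkt data as a separate 'punkt_tab' resource;
# fetching it defensively here is an assumption, not part of the original setup.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')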
class LongFormTTS:
    def __init__(self):
        print("Loading TTS models...")

        # Try multiple TTS approaches for better compatibility
        self.tts_pipeline = None
        self.backup_tts = None

        # Primary: try Bark (works well on HF Spaces)
        try:
            print("Loading Bark TTS...")
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model="suno/bark-small",
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            self.tts_method = "bark"
            print("✅ Bark TTS loaded successfully!")
        except Exception as e:
            print(f"❌ Bark TTS failed: {e}")

            # Backup: try Parler TTS
            try:
                print("Loading Parler TTS...")
                self.tts_pipeline = pipeline(
                    "text-to-speech",
                    model="parler-tts/parler_tts_mini_v0.1",
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )
                self.tts_method = "parler"
                print("✅ Parler TTS loaded successfully!")
            except Exception as e:
                print(f"❌ Parler TTS failed: {e}")

                # Final backup: try FastSpeech2 via the TTS library
                try:
                    print("Loading FastSpeech2...")
                    from TTS.api import TTS
                    self.backup_tts = TTS(model_name="tts_models/en/ljspeech/fastspeech2")
                    self.tts_method = "fastspeech2"
                    print("✅ FastSpeech2 loaded successfully!")
                except Exception as e:
                    print(f"❌ All TTS models failed: {e}")
                    raise Exception("No TTS model could be loaded. Please check the requirements.")
    def preprocess_text(self, text):
        """Clean and prepare text for TTS."""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text.strip())

        # Handle common abbreviations
        abbreviations = {
            'Dr.': 'Doctor',
            'Mr.': 'Mister',
            'Mrs.': 'Missus',
            'Ms.': 'Miss',
            'Prof.': 'Professor',
            'etc.': 'etcetera',
            'vs.': 'versus',
            'e.g.': 'for example',
            'i.e.': 'that is',
            'St.': 'Street',
            'Ave.': 'Avenue',
            'Blvd.': 'Boulevard',
            'Inc.': 'Incorporated',
            'Corp.': 'Corporation',
            'Ltd.': 'Limited',
        }
        for abbr, full in abbreviations.items():
            text = text.replace(abbr, full)

        # Handle numbers (basic)
        text = re.sub(r'\b(\d+)\b', lambda m: self.number_to_words(int(m.group())), text)

        # Clean up any problematic characters
        text = re.sub(r'[^\w\s\.,!?;:\-\(\)]', '', text)

        return text
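    # Illustrative example of the preprocessing above (not from the original code):
    #   "Dr. Smith paid 45 dollars."  ->  "Doctor Smith paid forty five dollars."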
    def number_to_words(self, num):
        """Convert numbers to words (basic implementation)."""
        if num == 0:
            return "zero"
        if num > 9999:
            return str(num)  # Keep large numbers as digits

        ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
        teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                 "sixteen", "seventeen", "eighteen", "nineteen"]
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        if num < 10:
            return ones[num]
        elif num < 20:
            return teens[num - 10]
        elif num < 100:
            return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
        elif num < 1000:
            return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
        else:
            return str(num)
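    # Worked examples for the conversion above:
    #   number_to_words(7)    -> "seven"
    #   number_to_words(42)   -> "forty two"
    #   number_to_words(342)  -> "three hundred forty two"
    #   number_to_words(1234) -> "1234"  (four-digit and larger numbers stay as digits)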
    def chunk_text(self, text, max_length=200):
        """Split text into manageable chunks while preserving sentence boundaries."""
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # If a single sentence is too long, split it by clauses
            if len(sentence) > max_length:
                clauses = re.split(r'[,;:]', sentence)
                for clause in clauses:
                    clause = clause.strip()
                    if len(current_chunk + clause) > max_length:
                        if current_chunk:
                            chunks.append(current_chunk.strip())
                            current_chunk = clause
                        else:
                            # Even a single clause is too long; force a word-level split
                            words = clause.split()
                            temp_chunk = ""
                            for word in words:
                                if len(temp_chunk + word) > max_length:
                                    if temp_chunk:
                                        chunks.append(temp_chunk.strip())
                                        temp_chunk = word
                                    else:
                                        chunks.append(word)
                                else:
                                    temp_chunk += " " + word if temp_chunk else word
                            if temp_chunk:
                                current_chunk = temp_chunk
                    else:
                        current_chunk += " " + clause if current_chunk else clause
            else:
                if len(current_chunk + sentence) > max_length:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                        current_chunk = sentence
                    else:
                        chunks.append(sentence)
                else:
                    current_chunk += " " + sentence if current_chunk else sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

        return [chunk for chunk in chunks if chunk.strip()]
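    # Chunking behaviour, roughly: a long paragraph comes back as a list of chunks no
    # longer than max_length characters, split first at sentence boundaries, then at
    # commas/semicolons/colons, and only as a last resort between words mid-clause.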
    def generate_speech_chunk(self, text_chunk):
        """Generate speech for a single text chunk."""
        try:
            if self.tts_method == "bark":
                # Bark TTS
                speech = self.tts_pipeline(text_chunk)
                audio = speech["audio"]
                sampling_rate = speech["sampling_rate"]
                return audio, sampling_rate
            elif self.tts_method == "parler":
                # Parler TTS
                speech = self.tts_pipeline(text_chunk)
                audio = speech["audio"]
                sampling_rate = speech["sampling_rate"]
                return audio, sampling_rate
            elif self.tts_method == "fastspeech2":
                # FastSpeech2 via the TTS library
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                    self.backup_tts.tts_to_file(text=text_chunk, file_path=tmp_file.name)
                    audio, sr = sf.read(tmp_file.name)
                    os.unlink(tmp_file.name)
                return audio, sr
            else:
                raise Exception("No TTS method available")
        except Exception as e:
            print(f"Error generating speech for chunk: {e}")
            return None, None
    def generate_long_speech(self, text, progress_callback=None):
        """Generate speech for long text by processing it in chunks."""
        # Preprocess text
        text = self.preprocess_text(text)

        # Split into chunks (smaller chunks for better compatibility)
        chunks = self.chunk_text(text, max_length=150)
        print(f"Split text into {len(chunks)} chunks")

        if not chunks:
            return None, None

        # Generate speech for each chunk
        audio_segments = []
        sampling_rate = None
        total_chunks = len(chunks)

        for i, chunk in enumerate(chunks):
            if progress_callback:
                progress_callback(f"Processing chunk {i+1}/{total_chunks}: {chunk[:30]}...")

            audio_chunk, sr = self.generate_speech_chunk(chunk)

            if audio_chunk is not None and len(audio_chunk) > 0:
                if sampling_rate is None:
                    sampling_rate = sr

                # Ensure audio is 1-D: drop singleton dimensions, then average any
                # remaining channel axis (assumed to be the shorter one)
                audio_chunk = np.asarray(audio_chunk).squeeze()
                if audio_chunk.ndim > 1:
                    audio_chunk = audio_chunk.mean(axis=int(np.argmin(audio_chunk.shape)))

                audio_segments.append(audio_chunk)

                # Add a small pause between chunks (300 ms of silence)
                pause_duration = int(0.3 * sampling_rate)
                silence = np.zeros(pause_duration)
                audio_segments.append(silence)

            # Small delay to prevent overwhelming the system
            time.sleep(0.1)

        if not audio_segments:
            return None, None

        # Concatenate all audio segments
        final_audio = np.concatenate(audio_segments)
        return final_audio, sampling_rate
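# Standalone usage sketch for LongFormTTS (assumes at least one model loads; not part
# of the Gradio app flow below):
#   tts = LongFormTTS()
#   audio, sr = tts.generate_long_speech("Hello there. This is a quick test.")
#   sf.write("output.wav", audio, sr)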
# Initialize TTS system
print("Initializing TTS system...")
try:
    tts_system = LongFormTTS()
    print("✅ TTS system initialized successfully!")
except Exception as e:
    print(f"❌ Failed to initialize TTS system: {e}")
    tts_system = None
def text_to_speech_interface(text, progress=gr.Progress()):
    """Main interface function for Gradio."""
    if tts_system is None:
        return None, "❌ TTS system not available. Please check the logs."

    if not text.strip():
        return None, "Please enter some text to convert to speech."

    if len(text) > 10000:
        return None, "Text is too long. Please keep it under 10,000 characters for optimal performance."

    def progress_callback(message):
        progress(0.5, desc=message)

    try:
        progress(0.1, desc="Starting text-to-speech conversion...")
        audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)

        if audio is None or len(audio) == 0:
            return None, "Failed to generate audio. Please try with shorter text or check your input."

        progress(0.9, desc="Finalizing audio...")

        # Save to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            sf.write(tmp_file.name, audio, sample_rate)
            audio_path = tmp_file.name

        progress(1.0, desc="Complete!")
        duration = len(audio) / sample_rate
        return audio_path, f"✅ Successfully generated {duration:.1f} seconds of audio using {tts_system.tts_method.upper()}!"

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="🎤 Long-Form Text-to-Speech Generator",
        theme=gr.themes.Soft(),
        css="""
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        .feature-box {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 1rem;
            border-radius: 10px;
            margin: 1rem 0;
        }
        .status-box {
            background: #f8f9fa;
            border-left: 4px solid #007bff;
            padding: 1rem;
            margin: 1rem 0;
        }
        """
    ) as demo:

        gr.HTML("""
        <div class="main-header">
            <h1>🎤 Long-Form Text-to-Speech Generator</h1>
            <p>Convert any length of text to natural human-like speech using free AI models</p>
        </div>
        """)

        # Show TTS system status
        if tts_system is not None:
            status_html = f"""
            <div class="status-box">
                <h4>🟢 System Status: Ready</h4>
                <p>Using <strong>{tts_system.tts_method.upper()}</strong> TTS engine</p>
            </div>
            """
        else:
            status_html = """
            <div class="status-box" style="border-left-color: #dc3545;">
                <h4>🔴 System Status: Error</h4>
                <p>TTS system failed to initialize. Please check the logs.</p>
            </div>
            """
        gr.HTML(status_html)
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="📝 Enter your text (max 10,000 characters)",
                    placeholder="Type or paste your text here...",
                    lines=8,
                    max_lines=15
                )
                char_count = gr.HTML("Character count: 0")
                generate_btn = gr.Button(
                    "🎯 Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                gr.HTML("""
                <div class="feature-box">
                    <h3>✨ Features</h3>
                    <ul>
                        <li>📄 Long text support</li>
                        <li>🤖 Multiple TTS engines</li>
                        <li>⚡ Smart text chunking</li>
                        <li>🆓 Completely free</li>
                        <li>🔧 Auto preprocessing</li>
                        <li>📱 Mobile friendly</li>
                    </ul>
                </div>
                """)

        status_text = gr.Textbox(
            label="📊 Status",
            interactive=False,
            value="Ready to generate speech!"
        )

        audio_output = gr.Audio(
            label="🔊 Generated Speech",
            type="filepath"
        )
        # Character counter
        def update_char_count(text):
            count = len(text)
            color = "green" if count <= 10000 else "red"
            return f'<span style="color: {color};">Character count: {count}/10,000</span>'

        text_input.change(
            fn=update_char_count,
            inputs=[text_input],
            outputs=[char_count]
        )

        # Event handlers
        generate_btn.click(
            fn=text_to_speech_interface,
            inputs=[text_input],
            outputs=[audio_output, status_text]
        )

        # Example texts
        gr.Examples(
            examples=[
                ["Hello! This is a test of the text-to-speech system. It can handle longer texts by splitting them into smaller chunks."],
                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
                ["In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, but a comfortable hobbit-hole."],
                ["Welcome to our advanced text-to-speech generator. This system uses state-of-the-art AI models to convert your text into natural-sounding speech. You can input texts of various lengths, and the system will intelligently process them to create high-quality audio output."]
            ],
            inputs=[text_input]
        )
        gr.HTML("""
        <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 5px;">
            <h4>🔧 How it works:</h4>
            <ol>
                <li><strong>Multiple Engines:</strong> Tries Bark, Parler, or FastSpeech2 TTS models</li>
                <li><strong>Smart Chunking:</strong> Splits long text at natural boundaries</li>
                <li><strong>Audio Processing:</strong> Combines chunks with natural pauses</li>
                <li><strong>Quality Output:</strong> Generates high-quality WAV audio</li>
            </ol>
            <p><em>💡 Tip: For best results, use well-formatted text with proper punctuation!</em></p>
        </div>
        """)

    return demo
# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True  # Note: share=True is typically ignored when running on Hugging Face Spaces
    )