Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,42 +12,53 @@ from TTS.api import TTS
|
|
12 |
|
13 |
warnings.filterwarnings("ignore")
|
14 |
|
15 |
-
# Download required NLTK data
|
16 |
try:
|
17 |
nltk.data.find('tokenizers/punkt')
|
|
|
18 |
except LookupError:
|
19 |
-
nltk.download('punkt')
|
20 |
|
21 |
-
# Load Coqui TTS model
|
22 |
-
print("π Loading Coqui TTS models...")
|
23 |
-
try:
|
24 |
-
# This will download the model automatically if not found
|
25 |
-
tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
|
26 |
-
print("β
Coqui TTS loaded successfully!")
|
27 |
-
speakers = tts.speakers
|
28 |
-
print(f"Available Speakers: {speakers}")
|
29 |
-
except Exception as e:
|
30 |
-
print(f"β Failed to load Coqui TTS: {e}")
|
31 |
-
tts = None
|
32 |
-
speakers = []
|
33 |
|
34 |
class LongFormTTS:
|
35 |
def __init__(self):
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
def preprocess_text(self, text):
|
41 |
"""Clean and prepare text for TTS"""
|
42 |
text = re.sub(r'\s+', ' ', text.strip())
|
43 |
abbreviations = {
|
44 |
-
'Dr.': 'Doctor',
|
45 |
-
'
|
46 |
-
'
|
47 |
-
'
|
48 |
-
'
|
49 |
-
'
|
50 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
52 |
for abbr, full in abbreviations.items():
|
53 |
text = text.replace(abbr, full)
|
@@ -57,13 +68,15 @@ class LongFormTTS:
|
|
57 |
return text.strip()
|
58 |
|
59 |
def number_to_words(self, num):
|
|
|
|
|
|
|
|
|
|
|
60 |
ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
|
61 |
teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
|
62 |
"sixteen", "seventeen", "eighteen", "nineteen"]
|
63 |
-
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
|
64 |
-
"eighty", "ninety"]
|
65 |
-
if num == 0:
|
66 |
-
return "zero"
|
67 |
if num < 10:
|
68 |
return ones[num]
|
69 |
elif num < 20:
|
@@ -71,7 +84,7 @@ class LongFormTTS:
|
|
71 |
elif num < 100:
|
72 |
return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
|
73 |
elif num < 1000:
|
74 |
-
return ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100))
|
75 |
else:
|
76 |
thousands = num // 1000
|
77 |
remainder = num % 1000
|
@@ -81,14 +94,32 @@ class LongFormTTS:
|
|
81 |
return result
|
82 |
|
83 |
def chunk_text(self, text, max_length=200):
|
|
|
84 |
sentences = sent_tokenize(text)
|
85 |
chunks = []
|
86 |
current_chunk = ""
|
87 |
for sentence in sentences:
|
|
|
|
|
|
|
88 |
if len(current_chunk + " " + sentence) > max_length:
|
89 |
if current_chunk:
|
90 |
chunks.append(current_chunk.strip())
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
else:
|
93 |
current_chunk = current_chunk + " " + sentence if current_chunk else sentence
|
94 |
if current_chunk:
|
@@ -96,14 +127,15 @@ class LongFormTTS:
|
|
96 |
return [chunk for chunk in chunks if chunk.strip()]
|
97 |
|
98 |
def generate_speech_chunk(self, text_chunk, speaker):
|
|
|
99 |
try:
|
100 |
-
|
101 |
-
return audio
|
102 |
except Exception as e:
|
103 |
print(f"Error generating speech for chunk: {e}")
|
104 |
return None
|
105 |
|
106 |
def generate_long_speech(self, text, speaker=None, progress_callback=None):
|
|
|
107 |
processed_text = self.preprocess_text(text)
|
108 |
chunks = self.chunk_text(processed_text)
|
109 |
print(f"Split into {len(chunks)} chunks")
|
@@ -157,7 +189,7 @@ def text_to_speech_interface(text, speaker="p225", progress=gr.Progress()):
|
|
157 |
if audio is None:
|
158 |
return None, "β Failed to generate audio."
|
159 |
progress(0.9, desc="πΎ Saving audio file...")
|
160 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".
|
161 |
sf.write(tmp_file.name, audio, sample_rate)
|
162 |
audio_path = tmp_file.name
|
163 |
progress(1.0, desc="β
Complete!")
|
@@ -189,7 +221,7 @@ def create_interface():
|
|
189 |
<p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
|
190 |
</div>
|
191 |
""")
|
192 |
-
if tts_system:
|
193 |
gr.HTML("""
|
194 |
<div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
|
195 |
<h4>π’ System Ready</h4>
|
@@ -203,6 +235,7 @@ def create_interface():
|
|
203 |
<p>TTS system failed to initialize. Please refresh the page.</p>
|
204 |
</div>
|
205 |
""")
|
|
|
206 |
with gr.Row():
|
207 |
with gr.Column(scale=2):
|
208 |
text_input = gr.Textbox(
|
@@ -214,9 +247,9 @@ def create_interface():
|
|
214 |
)
|
215 |
char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
|
216 |
speaker_dropdown = gr.Dropdown(
|
217 |
-
choices=tts_system.speakers if tts_system else [],
|
218 |
value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
|
219 |
-
label="π£οΈ
|
220 |
)
|
221 |
generate_btn = gr.Button("π― Generate Speech", variant="primary", size="lg", scale=1)
|
222 |
with gr.Column(scale=1):
|
@@ -229,7 +262,6 @@ def create_interface():
|
|
229 |
<li>β‘ Smart text processing</li>
|
230 |
<li>π§ Auto chunking</li>
|
231 |
<li>π΅ Natural-sounding speech</li>
|
232 |
-
<li>π MP3 audio output</li>
|
233 |
</ul>
|
234 |
</div>
|
235 |
""")
|
@@ -240,7 +272,7 @@ def create_interface():
|
|
240 |
count = len(text) if text else 0
|
241 |
color = "#28a745" if count <= 50000 else "#dc3545"
|
242 |
return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
|
243 |
-
|
244 |
text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
|
245 |
|
246 |
generate_btn.click(
|
@@ -254,7 +286,7 @@ def create_interface():
|
|
254 |
examples=[
|
255 |
["Hello! Welcome to our advanced text-to-speech system.", "p225"],
|
256 |
["The quick brown fox jumps over the lazy dog.", "p226"],
|
257 |
-
["Artificial intelligence has revolutionized many aspects of our lives.", "p227"],
|
258 |
],
|
259 |
inputs=[text_input, speaker_dropdown],
|
260 |
label="π Try These Examples"
|
|
|
12 |
|
13 |
warnings.filterwarnings("ignore")
|
14 |
|
15 |
+
# Download required NLTK data including punkt_tab
|
16 |
# Probe for every tokenizer resource the app relies on; punkt_tab is checked
# as well as punkt. A miss on either lookup triggers one quiet download of both.
try:
    for resource in ('tokenizers/punkt', 'tokenizers/punkt_tab'):
        nltk.data.find(resource)
except LookupError:
    nltk.download(['punkt', 'punkt_tab'], quiet=True)
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
class LongFormTTS:
|
24 |
def __init__(self):
    """Load the Coqui VITS (VCTK) model and record its speakers.

    On success ``self.tts`` holds the loaded model and ``self.speakers``
    the speaker list it reports. On any load failure the error is printed,
    ``self.tts`` is set to ``None`` and ``self.speakers`` to an empty list.
    ``self.sample_rate`` is defined on both paths, so code that reads it
    does not hit ``AttributeError`` after a failed load.
    """
    # Assigned before the load attempt so the attribute exists whether or
    # not the model loads.
    self.sample_rate = 22050
    # NOTE(review): the leading glyph of this message is mis-encoded in this
    # copy of the file and the original emoji could not be recovered.
    print("π Loading Coqui TTS models...")
    try:
        # Load Coqui model
        self.tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
        self.speakers = self.tts.speakers
        print("✅ Coqui TTS loaded successfully!")
        print(f"Available Speakers: {self.speakers}")
    except Exception as e:
        # Deliberate best-effort: the instance stays usable as a "not ready"
        # marker instead of propagating the load error.
        print(f"❌ Failed to load Coqui TTS: {e}")
        self.tts = None
        self.speakers = []
|
37 |
+
|
38 |
|
39 |
def preprocess_text(self, text):
|
40 |
"""Clean and prepare text for TTS"""
|
41 |
text = re.sub(r'\s+', ' ', text.strip())
|
42 |
abbreviations = {
|
43 |
+
'Dr.': 'Doctor',
|
44 |
+
'Mr.': 'Mister',
|
45 |
+
'Mrs.': 'Missus',
|
46 |
+
'Ms.': 'Miss',
|
47 |
+
'Prof.': 'Professor',
|
48 |
+
'etc.': 'etcetera',
|
49 |
+
'vs.': 'versus',
|
50 |
+
'e.g.': 'for example',
|
51 |
+
'i.e.': 'that is',
|
52 |
+
'St.': 'Street',
|
53 |
+
'Ave.': 'Avenue',
|
54 |
+
'Blvd.': 'Boulevard',
|
55 |
+
'Inc.': 'Incorporated',
|
56 |
+
'Corp.': 'Corporation',
|
57 |
+
'Ltd.': 'Limited',
|
58 |
+
'U.S.': 'United States',
|
59 |
+
'U.K.': 'United Kingdom',
|
60 |
+
'Ph.D.': 'PhD',
|
61 |
+
'M.D.': 'MD',
|
62 |
}
|
63 |
for abbr, full in abbreviations.items():
|
64 |
text = text.replace(abbr, full)
|
|
|
68 |
return text.strip()
|
69 |
|
70 |
def number_to_words(self, num):
|
71 |
+
"""Convert numbers to words"""
|
72 |
+
if num == 0:
|
73 |
+
return "zero"
|
74 |
+
if num > 9999:
|
75 |
+
return str(num)
|
76 |
ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
|
77 |
teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
|
78 |
"sixteen", "seventeen", "eighteen", "nineteen"]
|
79 |
+
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
|
|
|
|
|
|
|
80 |
if num < 10:
|
81 |
return ones[num]
|
82 |
elif num < 20:
|
|
|
84 |
elif num < 100:
|
85 |
return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
|
86 |
elif num < 1000:
|
87 |
+
return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
|
88 |
else:
|
89 |
thousands = num // 1000
|
90 |
remainder = num % 1000
|
|
|
94 |
return result
|
95 |
|
96 |
def chunk_text(self, text, max_length=200):
|
97 |
+
"""Split text into manageable chunks"""
|
98 |
sentences = sent_tokenize(text)
|
99 |
chunks = []
|
100 |
current_chunk = ""
|
101 |
for sentence in sentences:
|
102 |
+
sentence = sentence.strip()
|
103 |
+
if not sentence:
|
104 |
+
continue
|
105 |
if len(current_chunk + " " + sentence) > max_length:
|
106 |
if current_chunk:
|
107 |
chunks.append(current_chunk.strip())
|
108 |
+
if len(sentence) > max_length:
|
109 |
+
words = sentence.split()
|
110 |
+
temp_chunk = ""
|
111 |
+
for word in words:
|
112 |
+
if len(temp_chunk + " " + word) > max_length:
|
113 |
+
if temp_chunk:
|
114 |
+
chunks.append(temp_chunk.strip())
|
115 |
+
temp_chunk = word
|
116 |
+
else:
|
117 |
+
chunks.append(word)
|
118 |
+
else:
|
119 |
+
temp_chunk = temp_chunk + " " + word if temp_chunk else word
|
120 |
+
current_chunk = temp_chunk
|
121 |
+
else:
|
122 |
+
current_chunk = sentence
|
123 |
else:
|
124 |
current_chunk = current_chunk + " " + sentence if current_chunk else sentence
|
125 |
if current_chunk:
|
|
|
127 |
return [chunk for chunk in chunks if chunk.strip()]
|
128 |
|
129 |
def generate_speech_chunk(self, text_chunk, speaker):
    """Generate speech for a single chunk.

    Args:
        text_chunk: Text to synthesize.
        speaker: Speaker identifier forwarded to the model.

    Returns:
        Whatever ``self.tts.tts`` returns for the chunk, or ``None`` when no
        model is loaded or synthesis raises.
    """
    # __init__ leaves self.tts as None when the model failed to load; report
    # that directly rather than through a swallowed AttributeError.
    if getattr(self, "tts", None) is None:
        print("Error generating speech for chunk: TTS model is not loaded")
        return None
    try:
        return self.tts.tts(text=text_chunk, speaker=speaker)
    except Exception as e:
        # Best-effort: a failed chunk is reported and signalled as None.
        print(f"Error generating speech for chunk: {e}")
        return None
|
136 |
|
137 |
def generate_long_speech(self, text, speaker=None, progress_callback=None):
|
138 |
+
"""Generate speech for long text"""
|
139 |
processed_text = self.preprocess_text(text)
|
140 |
chunks = self.chunk_text(processed_text)
|
141 |
print(f"Split into {len(chunks)} chunks")
|
|
|
189 |
if audio is None:
|
190 |
return None, "β Failed to generate audio."
|
191 |
progress(0.9, desc="πΎ Saving audio file...")
|
192 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
193 |
sf.write(tmp_file.name, audio, sample_rate)
|
194 |
audio_path = tmp_file.name
|
195 |
progress(1.0, desc="β
Complete!")
|
|
|
221 |
<p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
|
222 |
</div>
|
223 |
""")
|
224 |
+
if tts_system and tts_system.speakers:
|
225 |
gr.HTML("""
|
226 |
<div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
|
227 |
<h4>π’ System Ready</h4>
|
|
|
235 |
<p>TTS system failed to initialize. Please refresh the page.</p>
|
236 |
</div>
|
237 |
""")
|
238 |
+
|
239 |
with gr.Row():
|
240 |
with gr.Column(scale=2):
|
241 |
text_input = gr.Textbox(
|
|
|
247 |
)
|
248 |
char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
|
249 |
speaker_dropdown = gr.Dropdown(
|
250 |
+
choices=tts_system.speakers if tts_system and tts_system.speakers else [],
|
251 |
value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
|
252 |
+
label="π£οΈ Choose Voice"
|
253 |
)
|
254 |
generate_btn = gr.Button("π― Generate Speech", variant="primary", size="lg", scale=1)
|
255 |
with gr.Column(scale=1):
|
|
|
262 |
<li>β‘ Smart text processing</li>
|
263 |
<li>π§ Auto chunking</li>
|
264 |
<li>π΅ Natural-sounding speech</li>
|
|
|
265 |
</ul>
|
266 |
</div>
|
267 |
""")
|
|
|
272 |
count = len(text) if text else 0
|
273 |
color = "#28a745" if count <= 50000 else "#dc3545"
|
274 |
return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
|
275 |
+
|
276 |
text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
|
277 |
|
278 |
generate_btn.click(
|
|
|
286 |
examples=[
|
287 |
["Hello! Welcome to our advanced text-to-speech system.", "p225"],
|
288 |
["The quick brown fox jumps over the lazy dog.", "p226"],
|
289 |
+
["Artificial intelligence has revolutionized many aspects of our daily lives.", "p227"],
|
290 |
],
|
291 |
inputs=[text_input, speaker_dropdown],
|
292 |
label="π Try These Examples"
|