Nick021402 committed on
Commit
6d45ca0
·
verified ·
1 Parent(s): f8bacea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -39
app.py CHANGED
@@ -12,42 +12,53 @@ from TTS.api import TTS
12
 
13
  warnings.filterwarnings("ignore")
14
 
15
- # Download required NLTK data
16
  try:
17
  nltk.data.find('tokenizers/punkt')
 
18
  except LookupError:
19
- nltk.download('punkt')
20
 
21
- # Load Coqui TTS model
22
- print("🔄 Loading Coqui TTS models...")
23
- try:
24
- # This will download the model automatically if not found
25
- tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
26
- print("✅ Coqui TTS loaded successfully!")
27
- speakers = tts.speakers
28
- print(f"Available Speakers: {speakers}")
29
- except Exception as e:
30
- print(f"❌ Failed to load Coqui TTS: {e}")
31
- tts = None
32
- speakers = []
33
 
34
  class LongFormTTS:
35
  def __init__(self):
36
- self.tts = tts
37
- self.speakers = speakers or ["p225", "p226", "p227", "p228"] # fallback static list
38
- self.sample_rate = 22050 # Default sample rate for Coqui TTS
 
 
 
 
 
 
 
 
 
 
39
 
40
  def preprocess_text(self, text):
41
  """Clean and prepare text for TTS"""
42
  text = re.sub(r'\s+', ' ', text.strip())
43
  abbreviations = {
44
- 'Dr.': 'Doctor', 'Mr.': 'Mister', 'Mrs.': 'Missus',
45
- 'Ms.': 'Miss', 'Prof.': 'Professor', 'etc.': 'etcetera',
46
- 'vs.': 'versus', 'e.g.': 'for example', 'i.e.': 'that is',
47
- 'St.': 'Street', 'Ave.': 'Avenue', 'Blvd.': 'Boulevard',
48
- 'Inc.': 'Incorporated', 'Corp.': 'Corporation', 'Ltd.': 'Limited',
49
- 'U.S.': 'United States', 'U.K.': 'United Kingdom',
50
- 'Ph.D.': 'PhD', 'M.D.': 'MD'
 
 
 
 
 
 
 
 
 
 
 
 
51
  }
52
  for abbr, full in abbreviations.items():
53
  text = text.replace(abbr, full)
@@ -57,13 +68,15 @@ class LongFormTTS:
57
  return text.strip()
58
 
59
  def number_to_words(self, num):
 
 
 
 
 
60
  ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
61
  teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
62
  "sixteen", "seventeen", "eighteen", "nineteen"]
63
- tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
64
- "eighty", "ninety"]
65
- if num == 0:
66
- return "zero"
67
  if num < 10:
68
  return ones[num]
69
  elif num < 20:
@@ -71,7 +84,7 @@ class LongFormTTS:
71
  elif num < 100:
72
  return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
73
  elif num < 1000:
74
- return ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100)).strip()
75
  else:
76
  thousands = num // 1000
77
  remainder = num % 1000
@@ -81,14 +94,32 @@ class LongFormTTS:
81
  return result
82
 
83
  def chunk_text(self, text, max_length=200):
 
84
  sentences = sent_tokenize(text)
85
  chunks = []
86
  current_chunk = ""
87
  for sentence in sentences:
 
 
 
88
  if len(current_chunk + " " + sentence) > max_length:
89
  if current_chunk:
90
  chunks.append(current_chunk.strip())
91
- current_chunk = sentence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  else:
93
  current_chunk = current_chunk + " " + sentence if current_chunk else sentence
94
  if current_chunk:
@@ -96,14 +127,15 @@ class LongFormTTS:
96
  return [chunk for chunk in chunks if chunk.strip()]
97
 
98
  def generate_speech_chunk(self, text_chunk, speaker):
 
99
  try:
100
- audio = self.tts.tts(text=text_chunk, speaker=speaker)
101
- return audio
102
  except Exception as e:
103
  print(f"Error generating speech for chunk: {e}")
104
  return None
105
 
106
  def generate_long_speech(self, text, speaker=None, progress_callback=None):
 
107
  processed_text = self.preprocess_text(text)
108
  chunks = self.chunk_text(processed_text)
109
  print(f"Split into {len(chunks)} chunks")
@@ -157,7 +189,7 @@ def text_to_speech_interface(text, speaker="p225", progress=gr.Progress()):
157
  if audio is None:
158
  return None, "❌ Failed to generate audio."
159
  progress(0.9, desc="💾 Saving audio file...")
160
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
161
  sf.write(tmp_file.name, audio, sample_rate)
162
  audio_path = tmp_file.name
163
  progress(1.0, desc="✅ Complete!")
@@ -189,7 +221,7 @@ def create_interface():
189
  <p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
190
  </div>
191
  """)
192
- if tts_system:
193
  gr.HTML("""
194
  <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
195
  <h4>🟢 System Ready</h4>
@@ -203,6 +235,7 @@ def create_interface():
203
  <p>TTS system failed to initialize. Please refresh the page.</p>
204
  </div>
205
  """)
 
206
  with gr.Row():
207
  with gr.Column(scale=2):
208
  text_input = gr.Textbox(
@@ -214,9 +247,9 @@ def create_interface():
214
  )
215
  char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
216
  speaker_dropdown = gr.Dropdown(
217
- choices=tts_system.speakers if tts_system else [],
218
  value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
219
- label="🗣️ Select Voice"
220
  )
221
  generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
222
  with gr.Column(scale=1):
@@ -229,7 +262,6 @@ def create_interface():
229
  <li>⚡ Smart text processing</li>
230
  <li>🔧 Auto chunking</li>
231
  <li>🎵 Natural-sounding speech</li>
232
- <li>🔊 MP3 audio output</li>
233
  </ul>
234
  </div>
235
  """)
@@ -240,7 +272,7 @@ def create_interface():
240
  count = len(text) if text else 0
241
  color = "#28a745" if count <= 50000 else "#dc3545"
242
  return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
243
-
244
  text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
245
 
246
  generate_btn.click(
@@ -254,7 +286,7 @@ def create_interface():
254
  examples=[
255
  ["Hello! Welcome to our advanced text-to-speech system.", "p225"],
256
  ["The quick brown fox jumps over the lazy dog.", "p226"],
257
- ["Artificial intelligence has revolutionized many aspects of our lives.", "p227"],
258
  ],
259
  inputs=[text_input, speaker_dropdown],
260
  label="📚 Try These Examples"
 
12
 
13
  warnings.filterwarnings("ignore")
14
 
15
+ # Download required NLTK data including punkt_tab
16
  try:
17
  nltk.data.find('tokenizers/punkt')
18
+ nltk.data.find('tokenizers/punkt_tab') # This is the missing one!
19
  except LookupError:
20
+ nltk.download(['punkt', 'punkt_tab'], quiet=True)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  class LongFormTTS:
24
  def __init__(self):
25
+ print("🔄 Loading Coqui TTS models...")
26
+ try:
27
+ # Load Coqui model
28
+ self.tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
29
+ self.speakers = self.tts.speakers
30
+ self.sample_rate = 22050
31
+ print("✅ Coqui TTS loaded successfully!")
32
+ print(f"Available Speakers: {self.speakers}")
33
+ except Exception as e:
34
+ print(f"❌ Failed to load Coqui TTS: {e}")
35
+ self.tts = None
36
+ self.speakers = []
37
+
38
 
39
  def preprocess_text(self, text):
40
  """Clean and prepare text for TTS"""
41
  text = re.sub(r'\s+', ' ', text.strip())
42
  abbreviations = {
43
+ 'Dr.': 'Doctor',
44
+ 'Mr.': 'Mister',
45
+ 'Mrs.': 'Missus',
46
+ 'Ms.': 'Miss',
47
+ 'Prof.': 'Professor',
48
+ 'etc.': 'etcetera',
49
+ 'vs.': 'versus',
50
+ 'e.g.': 'for example',
51
+ 'i.e.': 'that is',
52
+ 'St.': 'Street',
53
+ 'Ave.': 'Avenue',
54
+ 'Blvd.': 'Boulevard',
55
+ 'Inc.': 'Incorporated',
56
+ 'Corp.': 'Corporation',
57
+ 'Ltd.': 'Limited',
58
+ 'U.S.': 'United States',
59
+ 'U.K.': 'United Kingdom',
60
+ 'Ph.D.': 'PhD',
61
+ 'M.D.': 'MD',
62
  }
63
  for abbr, full in abbreviations.items():
64
  text = text.replace(abbr, full)
 
68
  return text.strip()
69
 
70
  def number_to_words(self, num):
71
+ """Convert numbers to words"""
72
+ if num == 0:
73
+ return "zero"
74
+ if num > 9999:
75
+ return str(num)
76
  ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
77
  teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
78
  "sixteen", "seventeen", "eighteen", "nineteen"]
79
+ tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
 
 
 
80
  if num < 10:
81
  return ones[num]
82
  elif num < 20:
 
84
  elif num < 100:
85
  return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
86
  elif num < 1000:
87
+ return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
88
  else:
89
  thousands = num // 1000
90
  remainder = num % 1000
 
94
  return result
95
 
96
  def chunk_text(self, text, max_length=200):
97
+ """Split text into manageable chunks"""
98
  sentences = sent_tokenize(text)
99
  chunks = []
100
  current_chunk = ""
101
  for sentence in sentences:
102
+ sentence = sentence.strip()
103
+ if not sentence:
104
+ continue
105
  if len(current_chunk + " " + sentence) > max_length:
106
  if current_chunk:
107
  chunks.append(current_chunk.strip())
108
+ if len(sentence) > max_length:
109
+ words = sentence.split()
110
+ temp_chunk = ""
111
+ for word in words:
112
+ if len(temp_chunk + " " + word) > max_length:
113
+ if temp_chunk:
114
+ chunks.append(temp_chunk.strip())
115
+ temp_chunk = word
116
+ else:
117
+ chunks.append(word)
118
+ else:
119
+ temp_chunk = temp_chunk + " " + word if temp_chunk else word
120
+ current_chunk = temp_chunk
121
+ else:
122
+ current_chunk = sentence
123
  else:
124
  current_chunk = current_chunk + " " + sentence if current_chunk else sentence
125
  if current_chunk:
 
127
  return [chunk for chunk in chunks if chunk.strip()]
128
 
129
  def generate_speech_chunk(self, text_chunk, speaker):
130
+ """Generate speech for a single chunk"""
131
  try:
132
+ return self.tts.tts(text=text_chunk, speaker=speaker)
 
133
  except Exception as e:
134
  print(f"Error generating speech for chunk: {e}")
135
  return None
136
 
137
  def generate_long_speech(self, text, speaker=None, progress_callback=None):
138
+ """Generate speech for long text"""
139
  processed_text = self.preprocess_text(text)
140
  chunks = self.chunk_text(processed_text)
141
  print(f"Split into {len(chunks)} chunks")
 
189
  if audio is None:
190
  return None, "❌ Failed to generate audio."
191
  progress(0.9, desc="💾 Saving audio file...")
192
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
193
  sf.write(tmp_file.name, audio, sample_rate)
194
  audio_path = tmp_file.name
195
  progress(1.0, desc="✅ Complete!")
 
221
  <p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
222
  </div>
223
  """)
224
+ if tts_system and tts_system.speakers:
225
  gr.HTML("""
226
  <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
227
  <h4>🟢 System Ready</h4>
 
235
  <p>TTS system failed to initialize. Please refresh the page.</p>
236
  </div>
237
  """)
238
+
239
  with gr.Row():
240
  with gr.Column(scale=2):
241
  text_input = gr.Textbox(
 
247
  )
248
  char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
249
  speaker_dropdown = gr.Dropdown(
250
+ choices=tts_system.speakers if tts_system and tts_system.speakers else [],
251
  value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
252
+ label="🗣️ Choose Voice"
253
  )
254
  generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
255
  with gr.Column(scale=1):
 
262
  <li>⚡ Smart text processing</li>
263
  <li>🔧 Auto chunking</li>
264
  <li>🎵 Natural-sounding speech</li>
 
265
  </ul>
266
  </div>
267
  """)
 
272
  count = len(text) if text else 0
273
  color = "#28a745" if count <= 50000 else "#dc3545"
274
  return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
275
+
276
  text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
277
 
278
  generate_btn.click(
 
286
  examples=[
287
  ["Hello! Welcome to our advanced text-to-speech system.", "p225"],
288
  ["The quick brown fox jumps over the lazy dog.", "p226"],
289
+ ["Artificial intelligence has revolutionized many aspects of our daily lives.", "p227"],
290
  ],
291
  inputs=[text_input, speaker_dropdown],
292
  label="📚 Try These Examples"