Spaces:

Nick021402
/

Text2speech

Sleeping

App Files Community

Nick021402 committed on May 23

Commit

6d45ca0

verified ·

1 Parent(s): f8bacea

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -39

app.py CHANGED Viewed

@@ -12,42 +12,53 @@ from TTS.api import TTS
 warnings.filterwarnings("ignore")
-# Download required NLTK data
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
-    nltk.download('punkt')
-# Load Coqui TTS model
-print("🔄 Loading Coqui TTS models...")
-try:
-    # This will download the model automatically if not found
-    tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
-    print("✅ Coqui TTS loaded successfully!")
-    speakers = tts.speakers
-    print(f"Available Speakers: {speakers}")
-except Exception as e:
-    print(f"❌ Failed to load Coqui TTS: {e}")
-    tts = None
-    speakers = []
 class LongFormTTS:
     def __init__(self):
-        self.tts = tts
-        self.speakers = speakers or ["p225", "p226", "p227", "p228"]  # fallback static list
-        self.sample_rate = 22050  # Default sample rate for Coqui TTS
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
         text = re.sub(r'\s+', ' ', text.strip())
         abbreviations = {
-            'Dr.': 'Doctor', 'Mr.': 'Mister', 'Mrs.': 'Missus',
-            'Ms.': 'Miss', 'Prof.': 'Professor', 'etc.': 'etcetera',
-            'vs.': 'versus', 'e.g.': 'for example', 'i.e.': 'that is',
-            'St.': 'Street', 'Ave.': 'Avenue', 'Blvd.': 'Boulevard',
-            'Inc.': 'Incorporated', 'Corp.': 'Corporation', 'Ltd.': 'Limited',
-            'U.S.': 'United States', 'U.K.': 'United Kingdom',
-            'Ph.D.': 'PhD', 'M.D.': 'MD'
         }
         for abbr, full in abbreviations.items():
             text = text.replace(abbr, full)
@@ -57,13 +68,15 @@ class LongFormTTS:
         return text.strip()
     def number_to_words(self, num):
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                  "sixteen", "seventeen", "eighteen", "nineteen"]
-        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
-                "eighty", "ninety"]
-        if num == 0:
-            return "zero"
         if num < 10:
             return ones[num]
         elif num < 20:
@@ -71,7 +84,7 @@ class LongFormTTS:
         elif num < 100:
             return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
         elif num < 1000:
-            return ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100)).strip()
         else:
             thousands = num // 1000
             remainder = num % 1000
@@ -81,14 +94,32 @@ class LongFormTTS:
             return result
     def chunk_text(self, text, max_length=200):
         sentences = sent_tokenize(text)
         chunks = []
         current_chunk = ""
         for sentence in sentences:
             if len(current_chunk + " " + sentence) > max_length:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
-                current_chunk = sentence
             else:
                 current_chunk = current_chunk + " " + sentence if current_chunk else sentence
         if current_chunk:
@@ -96,14 +127,15 @@ class LongFormTTS:
         return [chunk for chunk in chunks if chunk.strip()]
     def generate_speech_chunk(self, text_chunk, speaker):
         try:
-            audio = self.tts.tts(text=text_chunk, speaker=speaker)
-            return audio
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
             return None
     def generate_long_speech(self, text, speaker=None, progress_callback=None):
         processed_text = self.preprocess_text(text)
         chunks = self.chunk_text(processed_text)
         print(f"Split into {len(chunks)} chunks")
@@ -157,7 +189,7 @@ def text_to_speech_interface(text, speaker="p225", progress=gr.Progress()):
         if audio is None:
             return None, "❌ Failed to generate audio."
         progress(0.9, desc="💾 Saving audio file...")
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
         progress(1.0, desc="✅ Complete!")
@@ -189,7 +221,7 @@ def create_interface():
             <p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
         </div>
         """)
-        if tts_system:
             gr.HTML("""
             <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
                 <h4>🟢 System Ready</h4>
@@ -203,6 +235,7 @@ def create_interface():
                 <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
             """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
@@ -214,9 +247,9 @@ def create_interface():
                 )
                 char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                 speaker_dropdown = gr.Dropdown(
-                    choices=tts_system.speakers if tts_system else [],
                     value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
-                    label="🗣️ Select Voice"
                 )
                 generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
             with gr.Column(scale=1):
@@ -229,7 +262,6 @@ def create_interface():
                         <li>⚡ Smart text processing</li>
                         <li>🔧 Auto chunking</li>
                         <li>🎵 Natural-sounding speech</li>
-                        <li>🔊 MP3 audio output</li>
                     </ul>
                 </div>
                 """)
@@ -240,7 +272,7 @@ def create_interface():
             count = len(text) if text else 0
             color = "#28a745" if count <= 50000 else "#dc3545"
             return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
         text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
         generate_btn.click(
@@ -254,7 +286,7 @@ def create_interface():
             examples=[
                 ["Hello! Welcome to our advanced text-to-speech system.", "p225"],
                 ["The quick brown fox jumps over the lazy dog.", "p226"],
-                ["Artificial intelligence has revolutionized many aspects of our lives.", "p227"],
             ],
             inputs=[text_input, speaker_dropdown],
             label="📚 Try These Examples"

 warnings.filterwarnings("ignore")
+# Make sure the NLTK tokenizer resources are present before the app starts.
+# Both 'punkt' and 'punkt_tab' are looked up; if either lookup raises
+# LookupError, both packages are downloaded (quietly).
 try:
     nltk.data.find('tokenizers/punkt')
+    nltk.data.find('tokenizers/punkt_tab')  # checked in addition to 'punkt'; previously not looked up
 except LookupError:
+    nltk.download(['punkt', 'punkt_tab'], quiet=True)
 class LongFormTTS:
     def __init__(self):
+        print("🔄 Loading Coqui TTS models...")
+        try:
+            # Load Coqui model
+            self.tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
+            self.speakers = self.tts.speakers
+            self.sample_rate = 22050
+            print("✅ Coqui TTS loaded successfully!")
+            print(f"Available Speakers: {self.speakers}")
+        except Exception as e:
+            print(f"❌ Failed to load Coqui TTS: {e}")
+            self.tts = None
+            self.speakers = []
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
         text = re.sub(r'\s+', ' ', text.strip())
         abbreviations = {
+            'Dr.': 'Doctor',
+            'Mr.': 'Mister',
+            'Mrs.': 'Missus',
+            'Ms.': 'Miss',
+            'Prof.': 'Professor',
+            'etc.': 'etcetera',
+            'vs.': 'versus',
+            'e.g.': 'for example',
+            'i.e.': 'that is',
+            'St.': 'Street',
+            'Ave.': 'Avenue',
+            'Blvd.': 'Boulevard',
+            'Inc.': 'Incorporated',
+            'Corp.': 'Corporation',
+            'Ltd.': 'Limited',
+            'U.S.': 'United States',
+            'U.K.': 'United Kingdom',
+            'Ph.D.': 'PhD',
+            'M.D.': 'MD',
         }
         for abbr, full in abbreviations.items():
             text = text.replace(abbr, full)
         return text.strip()
     def number_to_words(self, num):
+        """Convert numbers to words"""
+        if num == 0:
+            return "zero"
+        if num > 9999:
+            return str(num)
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                  "sixteen", "seventeen", "eighteen", "nineteen"]
+        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
         if num < 10:
             return ones[num]
         elif num < 20:
         elif num < 100:
             return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
         elif num < 1000:
+            return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
         else:
             thousands = num // 1000
             remainder = num % 1000
             return result
     def chunk_text(self, text, max_length=200):
+        """Split text into manageable chunks"""
         sentences = sent_tokenize(text)
         chunks = []
         current_chunk = ""
         for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
             if len(current_chunk + " " + sentence) > max_length:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
+                if len(sentence) > max_length:
+                    words = sentence.split()
+                    temp_chunk = ""
+                    for word in words:
+                        if len(temp_chunk + " " + word) > max_length:
+                            if temp_chunk:
+                                chunks.append(temp_chunk.strip())
+                                temp_chunk = word
+                            else:
+                                chunks.append(word)
+                        else:
+                            temp_chunk = temp_chunk + " " + word if temp_chunk else word
+                    current_chunk = temp_chunk
+                else:
+                    current_chunk = sentence
             else:
                 current_chunk = current_chunk + " " + sentence if current_chunk else sentence
         if current_chunk:
         return [chunk for chunk in chunks if chunk.strip()]
     def generate_speech_chunk(self, text_chunk, speaker):
+        """Generate speech for a single chunk"""
         try:
+            return self.tts.tts(text=text_chunk, speaker=speaker)
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
             return None
     def generate_long_speech(self, text, speaker=None, progress_callback=None):
+        """Generate speech for long text"""
         processed_text = self.preprocess_text(text)
         chunks = self.chunk_text(processed_text)
         print(f"Split into {len(chunks)} chunks")
         if audio is None:
             return None, "❌ Failed to generate audio."
         progress(0.9, desc="💾 Saving audio file...")
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
         progress(1.0, desc="✅ Complete!")
             <p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
         </div>
         """)
+        if tts_system and tts_system.speakers:
             gr.HTML("""
             <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
                 <h4>🟢 System Ready</h4>
                 <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
             """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
                 )
                 char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                 speaker_dropdown = gr.Dropdown(
+                    choices=tts_system.speakers if tts_system and tts_system.speakers else [],
                     value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
+                    label="🗣️ Choose Voice"
                 )
                 generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
             with gr.Column(scale=1):
                         <li>⚡ Smart text processing</li>
                         <li>🔧 Auto chunking</li>
                         <li>🎵 Natural-sounding speech</li>
                     </ul>
                 </div>
                 """)
             count = len(text) if text else 0
             color = "#28a745" if count <= 50000 else "#dc3545"
             return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
         text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
         generate_btn.click(
             examples=[
                 ["Hello! Welcome to our advanced text-to-speech system.", "p225"],
                 ["The quick brown fox jumps over the lazy dog.", "p226"],
+                ["Artificial intelligence has revolutionized many aspects of our daily lives.", "p227"],
             ],
             inputs=[text_input, speaker_dropdown],
             label="📚 Try These Examples"