drewThomasson committed on
Commit 5862afd · verified · 1 Parent(s): d6ed34d

Create app.py

Files changed (1)
  1. app.py +319 -0
app.py ADDED
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Spark-TTS by SparkAudio – Enhanced eBook Converter
Licensed under the Apache License, Version 2.0.
(See accompanying LICENSE file for details)
"""

import os
import torch
import soundfile as sf
import logging
import argparse
import platform
import subprocess
from datetime import datetime

import gradio as gr

# For eBook processing
import re
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize

# For audio combination
from pydub import AudioSegment

# Ensure NLTK sentence tokenizer is downloaded
nltk.download('punkt')

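# Note (assumption about the installed NLTK version): newer NLTK releases resolve
# sent_tokenize via the 'punkt_tab' resource rather than 'punkt'; if sentence
# tokenization fails at runtime, a likely fix is to also call
# nltk.download('punkt_tab').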
# Optional: download pretrained model from Hugging Face if not already present.
try:
    from huggingface_hub import snapshot_download
    if not os.path.exists("pretrained_models/Spark-TTS-0.5B"):
        print("Downloading pretrained model from Hugging Face...")
        snapshot_download("SparkAudio/Spark-TTS-0.5B", local_dir="pretrained_models/Spark-TTS-0.5B")
except ImportError:
    print("huggingface_hub is not installed. Make sure the pretrained model is already available.")


###########################
# Spark-TTS Core Functions
###########################

from cli.SparkTTS import SparkTTS
from sparktts.utils.token_parser import LEVELS_MAP_UI  # This maps UI slider values to model values

def initialize_model(model_dir="pretrained_models/Spark-TTS-0.5B", device=0):
    """Load the Spark-TTS model once at startup."""
    logging.info(f"Loading model from: {model_dir}")
    if platform.system() == "Darwin":
        device = torch.device(f"mps:{device}")
        logging.info(f"Using MPS device: {device}")
    elif torch.cuda.is_available():
        device = torch.device(f"cuda:{device}")
        logging.info(f"Using CUDA device: {device}")
    else:
        device = torch.device("cpu")
        logging.info("GPU acceleration not available, using CPU")
    model = SparkTTS(model_dir, device)
    return model

def run_tts(text, model, prompt_text=None, prompt_speech=None, gender=None, pitch=None, speed=None, save_dir="results"):
    """Perform TTS inference and save the generated audio fragment.
    Returns the full path of the saved .wav file."""
    logging.info(f"Saving audio to: {save_dir}")
    if prompt_text is not None and len(prompt_text) < 2:
        prompt_text = None
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    save_path = os.path.join(save_dir, f"{timestamp}.wav")
    logging.info("Starting inference...")
    with torch.no_grad():
        wav = model.inference(
            text,
            prompt_speech,
            prompt_text,
            gender,
            pitch,
            speed,
        )
    sf.write(save_path, wav, samplerate=16000)
    logging.info(f"Audio saved at: {save_path}")
    return save_path

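# Usage sketch (hypothetical paths and values, mirroring how the UI callbacks
# defined later in this file call run_tts):
#   model = initialize_model()
#   run_tts("Hello there.", model, prompt_speech="reference.wav")      # voice cloning
#   run_tts("Hello there.", model, gender="male", pitch=3, speed=3)    # voice creation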
##############################
# eBook-to-Audiobook Functions
##############################

def ensure_directory(directory_path):
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

def convert_to_epub(input_path, output_path):
    """Convert an eBook (mobi/pdf/etc.) to EPUB using Calibre's ebook-convert."""
    try:
        subprocess.run(['ebook-convert', input_path, output_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        logging.error(f"ebook-convert failed: {e}")
        return False

def save_chapters_as_text(epub_path, chapters_dir):
    """Extract HTML documents from the EPUB and save each as a text file (one per chapter)."""
    book = epub.read_epub(epub_path)
    chapter_counter = 0
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.get_content(), 'html.parser')
            text = soup.get_text()
            if text.strip():
                chapter_file = os.path.join(chapters_dir, f"chapter_{chapter_counter}.txt")
                with open(chapter_file, 'w', encoding='utf-8') as f:
                    f.write(text)
                chapter_counter += 1
    return chapter_counter

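# Note: convert_to_epub shells out to Calibre's `ebook-convert`, so Calibre must be
# installed and on PATH; the call above is equivalent to running, for example:
#   ebook-convert input.mobi output.epub
# If the tool is missing, the subprocess call will fail at conversion time.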
def create_chapter_labeled_book(ebook_file_path):
    """Convert the uploaded eBook into chapters saved as text files."""
    working_dir = os.path.join(".", "Working_files")
    ensure_directory(working_dir)
    temp_epub = os.path.join(working_dir, "temp.epub")
    chapters_dir = os.path.join(working_dir, "chapters")
    ensure_directory(chapters_dir)
    if os.path.exists(temp_epub):
        os.remove(temp_epub)
    if convert_to_epub(ebook_file_path, temp_epub):
        num_chapters = save_chapters_as_text(temp_epub, chapters_dir)
        logging.info(f"Extracted {num_chapters} chapters.")
        return chapters_dir
    else:
        raise Exception("Failed to convert ebook to EPUB.")

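# Resulting on-disk layout after create_chapter_labeled_book runs:
#   ./Working_files/temp.epub
#   ./Working_files/chapters/chapter_0.txt, chapter_1.txt, ...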
def split_long_sentence(sentence, max_length=250):
    """Split a long sentence into smaller fragments at the last space before max_length."""
    parts = []
    while len(sentence) > max_length:
        split_at = sentence.rfind(' ', 0, max_length)
        if split_at == -1:
            split_at = max_length
        parts.append(sentence[:split_at].strip())
        sentence = sentence[split_at:].strip()
    parts.append(sentence)
    return parts

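# Illustrative example (a smaller max_length than the app actually uses, to keep it short):
#   split_long_sentence("the quick brown fox jumps over the lazy dog twice", max_length=20)
#   -> ["the quick brown fox", "jumps over the lazy", "dog twice"]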
def combine_wav_files(file_list, output_file):
    """Combine a list of WAV files into one WAV file."""
    combined = AudioSegment.empty()
    for f in file_list:
        seg = AudioSegment.from_wav(f)
        combined += seg
    combined.export(output_file, format="wav")

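# Minimal usage sketch (hypothetical file names):
#   combine_wav_files(["part_0.wav", "part_1.wav"], "chapter_combined.wav")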
def convert_ebook_to_audiobook(ebook_file_path, model, gender=None, pitch=None, speed=None, prompt_text=None, prompt_speech=None):
    """Convert an entire eBook into an audiobook WAV file.
    Processes chapters, splits sentences, runs TTS for each fragment,
    and combines all fragments with brief silences between chapters."""
    # Step 1: Create chapters
    chapters_dir = create_chapter_labeled_book(ebook_file_path)
    chapter_files = sorted(
        [os.path.join(chapters_dir, f) for f in os.listdir(chapters_dir) if f.startswith("chapter_") and f.endswith(".txt")],
        key=lambda x: int(re.findall(r'\d+', os.path.basename(x))[0])
    )
    output_dir = os.path.join(".", "Audiobooks")
    ensure_directory(output_dir)
    chapter_audio_files = []
    temp_audio_dir = os.path.join(".", "Working_files", "temp_audio")
    ensure_directory(temp_audio_dir)

    # Process each chapter
    for chapter_file in chapter_files:
        with open(chapter_file, 'r', encoding='utf-8') as f:
            text = f.read()
        sentences = sent_tokenize(text)
        fragment_audio_files = []
        counter = 0
        for sentence in sentences:
            fragments = split_long_sentence(sentence)
            for frag in fragments:
                if frag:
                    # Generate audio for each fragment; save in temp_audio_dir
                    frag_wav = run_tts(frag, model, prompt_text=prompt_text, prompt_speech=prompt_speech,
                                       gender=gender, pitch=pitch, speed=speed, save_dir=temp_audio_dir)
                    # Rename for consistency
                    new_frag_wav = os.path.join(temp_audio_dir, f"{os.path.basename(chapter_file)}_{counter}.wav")
                    os.rename(frag_wav, new_frag_wav)
                    fragment_audio_files.append(new_frag_wav)
                    counter += 1
        # Combine fragment audio files for the chapter
        chapter_audio = os.path.join(temp_audio_dir, f"{os.path.basename(chapter_file)}_combined.wav")
        combine_wav_files(fragment_audio_files, chapter_audio)
        chapter_audio_files.append(chapter_audio)

    # Combine all chapters into one final audiobook (with 2 sec silence between chapters)
    silence = AudioSegment.silent(duration=2000)
    final_audio = AudioSegment.empty()
    for f in chapter_audio_files:
        seg = AudioSegment.from_wav(f)
        final_audio += seg + silence
    final_output = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file_path))[0] + ".wav")
    final_audio.export(final_output, format="wav")
    return final_output

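# End-to-end usage sketch (hypothetical file name): with a loaded model,
#   convert_ebook_to_audiobook("my_book.epub", model, gender="female", pitch=3, speed=3)
# writes the combined audiobook to ./Audiobooks/my_book.wav and returns that path.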

##########################
# Gradio UI Build Function
##########################

def build_ui(model_dir, device=0):
    # Initialize the model
    model = initialize_model(model_dir, device=device)

    # Voice Clone Tab callback
    def voice_clone(text, prompt_text, prompt_wav_upload, prompt_wav_record):
        prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
        prompt_text_clean = prompt_text if (prompt_text and len(prompt_text) >= 2) else None
        audio_output_path = run_tts(text, model, prompt_text=prompt_text_clean, prompt_speech=prompt_speech)
        return audio_output_path

    # Voice Creation Tab callback
    def voice_creation(text, gender, pitch, speed):
        # Map UI slider values via LEVELS_MAP_UI if desired; here we pass as-is.
        audio_output_path = run_tts(text, model, gender=gender, pitch=pitch, speed=speed)
        return audio_output_path

    # eBook Conversion Tab callback
    def ebook_conversion(ebook_file, gender, pitch, speed, prompt_text, prompt_wav_upload, prompt_wav_record):
        prompt_speech = prompt_wav_upload if prompt_wav_upload else prompt_wav_record
        # Gradio File component returns an object with a .name attribute
        ebook_file_path = ebook_file.name if hasattr(ebook_file, "name") else ebook_file
        audio_output_path = convert_ebook_to_audiobook(
            ebook_file_path, model, gender=gender, pitch=pitch, speed=speed,
            prompt_text=prompt_text, prompt_speech=prompt_speech
        )
        return audio_output_path

    # Build the Gradio interface with three tabs
    with gr.Blocks() as demo:
        gr.HTML('<h1 style="text-align: center;">Spark-TTS by SparkAudio – Enhanced eBook Converter</h1>')
        with gr.Tabs():
            # Voice Clone Tab
            with gr.TabItem("Voice Clone"):
                gr.Markdown("### Upload reference audio or record a prompt")
                with gr.Row():
                    prompt_wav_upload = gr.Audio(sources="upload", type="filepath",
                                                 label="Upload Prompt Audio (>=16kHz)")
                    prompt_wav_record = gr.Audio(sources="microphone", type="filepath",
                                                 label="Record Prompt Audio")
                with gr.Row():
                    text_input = gr.Textbox(label="Text", lines=3, placeholder="Enter text")
                    prompt_text_input = gr.Textbox(label="Prompt Text (Optional)", lines=3,
                                                   placeholder="Enter prompt text")
                audio_output_clone = gr.Audio(label="Generated Audio", autoplay=True, streaming=True)
                btn_clone = gr.Button("Generate Voice Clone")
                btn_clone.click(
                    voice_clone,
                    inputs=[text_input, prompt_text_input, prompt_wav_upload, prompt_wav_record],
                    outputs=audio_output_clone
                )
            # Voice Creation Tab
            with gr.TabItem("Voice Creation"):
                gr.Markdown("### Create a custom voice")
                with gr.Row():
                    gender = gr.Radio(choices=["male", "female"], value="male", label="Gender")
                    pitch = gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Pitch")
                    speed = gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Speed")
                text_input_creation = gr.Textbox(label="Input Text", lines=3,
                                                 placeholder="Enter text",
                                                 value="Generate custom voice sample.")
                audio_output_creation = gr.Audio(label="Generated Audio", autoplay=True, streaming=True)
                btn_create = gr.Button("Create Voice")
                btn_create.click(
                    voice_creation,
                    inputs=[text_input_creation, gender, pitch, speed],
                    outputs=audio_output_creation
                )
            # eBook Conversion Tab
            with gr.TabItem("eBook Conversion"):
                gr.Markdown("### Convert an eBook into an Audiobook")
                ebook_file = gr.File(label="Upload eBook File (e.g., epub, mobi, pdf)",
                                     file_types=[".epub", ".mobi", ".pdf"])
                with gr.Row():
                    gender_ebook = gr.Radio(choices=["male", "female"], value="male", label="Gender")
                    pitch_ebook = gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Pitch")
                    speed_ebook = gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Speed")
                prompt_text_ebook = gr.Textbox(label="Prompt Text (Optional)", lines=3,
                                               placeholder="Enter prompt text for voice cloning")
                with gr.Row():
                    prompt_wav_upload_ebook = gr.Audio(sources="upload", type="filepath",
                                                       label="Upload Prompt Audio (>=16kHz)")
                    prompt_wav_record_ebook = gr.Audio(sources="microphone", type="filepath",
                                                       label="Record Prompt Audio")
                audio_output_ebook = gr.Audio(label="Generated Audiobook", autoplay=True, streaming=True)
                btn_ebook = gr.Button("Convert eBook")
                btn_ebook.click(
                    ebook_conversion,
                    inputs=[ebook_file, gender_ebook, pitch_ebook, speed_ebook, prompt_text_ebook,
                            prompt_wav_upload_ebook, prompt_wav_record_ebook],
                    outputs=audio_output_ebook
                )
    return demo

def parse_arguments():
    parser = argparse.ArgumentParser(description="Spark-TTS eBook Converter")
    parser.add_argument("--model_dir", type=str, default="pretrained_models/Spark-TTS-0.5B",
                        help="Path to the model directory.")
    parser.add_argument("--device", type=int, default=0, help="GPU device id")
    parser.add_argument("--server_name", type=str, default="0.0.0.0", help="Server host")
    parser.add_argument("--server_port", type=int, default=7860, help="Server port")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_arguments()
    demo = build_ui(args.model_dir, args.device)
    demo.launch(server_name=args.server_name, server_port=args.server_port)
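
# Example invocation with the defaults spelled out explicitly:
#   python app.py --model_dir pretrained_models/Spark-TTS-0.5B --device 0 \
#       --server_name 0.0.0.0 --server_port 7860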