Update app.py
app.py CHANGED
@@ -1,8 +1,4 @@
 
-# import os
-# os.system("pip install --no-cache-dir --upgrade --force-reinstall -r requirements.txt")
-
-
 # Initalize a pipeline
 from kokoro import KPipeline
 # from IPython.display import display, Audio
@@ -177,6 +173,7 @@ def remove_silence_function(file_path,minimum_silence=50):
         combined += chunk
     combined.export(output_path, format=audio_format)
     return output_path
+
 def generate_and_save_audio(text, Language="American English",voice="af_bella", speed=1,remove_silence=False,keep_silence_up_to=0.05):
     text=clean_text(text)
     update_pipeline(Language)
@@ -205,6 +202,8 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
             audio_int16 = (audio_np * 32767).astype(np.int16)  # Scale to 16-bit range
             audio_bytes = audio_int16.tobytes()  # Convert to bytes
             # Write the audio chunk to the WAV file
+            duration_sec = len(audio_np) / 24000
+            timestamps[i]["duration"] = duration_sec
             wav_file.writeframes(audio_bytes)
     if remove_silence:
         keep_silence = int(keep_silence_up_to * 1000)
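This hunk records each generated chunk's length alongside its timestamps. Kokoro's pipeline emits 24 kHz audio, so the sample count divided by 24000 gives the duration in seconds. A minimal sketch of the same arithmetic (the array contents here are placeholders, not real model output):

```python
import numpy as np

SAMPLE_RATE = 24000  # Kokoro outputs 24 kHz audio

audio_np = np.zeros(36000, dtype=np.float32)  # stand-in for one generated chunk
duration_sec = len(audio_np) / SAMPLE_RATE    # 36000 / 24000 = 1.5 seconds
print(duration_sec)  # 1.5
```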
@@ -212,39 +211,44 @@
         return new_wave_file,timestamps
     return save_path,timestamps
 
+
+
 def adjust_timestamps(timestamp_dict):
     adjusted_timestamps = []
-
+    last_global_end = 0  # Cumulative audio timeline
 
     for segment_id in sorted(timestamp_dict.keys()):
         segment = timestamp_dict[segment_id]
         words = segment["words"]
+        chunk_duration = segment["duration"]
 
-
-
-
-
+        # If there are valid words, get last word end
+        last_word_end_in_chunk = (
+            max(w["end"] for w in words if w["end"] not in [None, 0])
+            if words else 0
+        )
 
-
-
-
+        silence_gap = chunk_duration - last_word_end_in_chunk
+        if silence_gap < 0:  # In rare cases where end > duration (due to rounding)
+            silence_gap = 0
 
-
-
+        for word in words:
+            start = word["start"] or 0
+            end = word["end"] or start
 
             adjusted_timestamps.append({
-                "word":
-                "start": round(
-                "end": round(
+                "word": word["word"],
+                "start": round(last_global_end + start, 3),
+                "end": round(last_global_end + end, 3)
             })
 
-        #
-
-        last_end_time = adjusted_timestamps[-1]["end"]
+        # Add entire chunk duration to global end
+        last_global_end += chunk_duration
 
     return adjusted_timestamps
 
 
+
 import string
 
 def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuation=True):
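The rewritten adjust_timestamps offsets every word by the running total of all previous chunks' full durations (last_global_end), so trailing silence in a chunk no longer pushes later words out of sync. A hedged round-trip, assuming adjust_timestamps as defined in the hunk above and the input shape the diff implies (the sample values are invented):

```python
timestamp_dict = {
    0: {"duration": 2.0,  # chunk length from the new bookkeeping above
        "words": [{"word": "Hello", "start": 0.10, "end": 0.55},
                  {"word": "world", "start": 0.60, "end": 1.10}]},
    1: {"duration": 1.5,
        "words": [{"word": "Again", "start": 0.05, "end": 0.50}]},
}

print(adjust_timestamps(timestamp_dict))
# Chunk 1 is offset by chunk 0's full 2.0 s duration, not by its last
# word end (1.10 s), so the 0.9 s of trailing silence stays accounted for:
# [{'word': 'Hello', 'start': 0.1, 'end': 0.55},
#  {'word': 'world', 'start': 0.6, 'end': 1.1},
#  {'word': 'Again', 'start': 2.05, 'end': 2.5}]
```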
@@ -278,6 +282,30 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuation=True):
 
 import string
 
+
+def split_line_by_char_limit(text, max_chars=30):
+    words = text.split()
+    lines = []
+    current_line = ""
+
+    for word in words:
+        if len(current_line + " " + word) <= max_chars:
+            current_line = (current_line + " " + word).strip()
+        else:
+            lines.append(current_line)
+            current_line = word
+
+    if current_line:
+        # Check if last line is a single word and there is a previous line
+        if len(current_line.split()) == 1 and len(lines) > 0:
+            # Append single word to previous line
+            lines[-1] += " " + current_line
+        else:
+            lines.append(current_line)
+
+    return "\n".join(lines)
+
+
 def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_words=8, min_pause=0.1):
     subtitles = []  # Stores subtitle blocks
     subtitle_words = []  # Temporary list for words in the current subtitle
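split_line_by_char_limit greedily packs words onto lines of at most max_chars characters, then folds a trailing single-word line into the previous one (which can push that line slightly past the limit). For example, with the function as added above:

```python
text = "This subtitle line is long enough to need wrapping badly"
print(split_line_by_char_limit(text, max_chars=30))
# This subtitle line is long
# enough to need wrapping badly
```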
@@ -343,6 +371,7 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_words=8, min_pause=0.1):
     # Write subtitles to SRT file
     with open(output_file, "w", encoding="utf-8") as f:
         for i, (start, end, text) in enumerate(subtitles, start=1):
+            text=split_line_by_char_limit(text, max_chars=30)
             f.write(f"{i}\n{format_srt_time(start)} --> {format_srt_time(end)}\n{text}\n\n")
 
     # print(f"SRT file '{output_file}' created successfully!")
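With the wrap applied just before f.write, each SRT cue now carries pre-broken text, keeping long sentences to short readable rows in players. A written cue would look roughly like this (timings illustrative):

```
1
00:00:00,100 --> 00:00:02,500
This subtitle line is long
enough to need wrapping badly
```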
@@ -591,6 +620,7 @@ import click
 @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
 @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
 def main(debug, share):
+# def main(debug=True, share=True):
     demo1 = ui()
     demo2 = tutorial()
     demo = gr.TabbedInterface([demo1, demo2],["Multilingual TTS","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
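The commented-out signature is presumably a convenience for bypassing Click, e.g. when calling main directly from a notebook; launched normally, the flags come from the command line, e.g. `python app.py --debug --share`.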
@@ -608,4 +638,4 @@ last_used_language = "a"
 pipeline = KPipeline(lang_code=last_used_language)
 temp_folder = create_audio_dir()
 if __name__ == "__main__":
-    main()
+    main()