Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -72,6 +72,14 @@ parser.add_argument("--custom_model_url", type=str,
|
|
72 |
"Examples include David Attenborough's model: "
|
73 |
"'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'. "
|
74 |
"More XTTS fine-tunes can be found on my Hugging Face at 'https://huggingface.co/drewThomasson'."))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
args = parser.parse_args()
|
76 |
|
77 |
|
@@ -580,7 +588,7 @@ from tqdm import tqdm
|
|
580 |
|
581 |
# Convert chapters to audio using XTTS
|
582 |
|
583 |
-
def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None):
|
584 |
|
585 |
if target_voice_path==None:
|
586 |
target_voice_path = default_target_voice_path
|
@@ -635,12 +643,12 @@ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, targe
|
|
635 |
print(f"Generating fragment: {fragment}...")
|
636 |
fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
|
637 |
if custom_model:
|
638 |
-
out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature
|
639 |
torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
640 |
else:
|
641 |
speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
|
642 |
language_code = language if language else default_language_code
|
643 |
-
tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code)
|
644 |
temp_count += 1
|
645 |
|
646 |
combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
|
@@ -649,7 +657,7 @@ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, targe
|
|
649 |
|
650 |
|
651 |
|
652 |
-
def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language="en"):
|
653 |
selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
|
654 |
tts = TTS(selected_tts_model, progress_bar=False).to(device)
|
655 |
|
@@ -690,7 +698,7 @@ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, tar
|
|
690 |
print(f"Generating fragment: {fragment}...")
|
691 |
fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
|
692 |
speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
|
693 |
-
tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language)
|
694 |
temp_count += 1
|
695 |
|
696 |
combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
|
@@ -700,7 +708,8 @@ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, tar
|
|
700 |
|
701 |
|
702 |
# Define the functions to be used in the Gradio interface
|
703 |
-
def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url=None, progress=gr.Progress()):
|
|
|
704 |
ebook_file_path = args.ebook if args.ebook else ebook_file.name
|
705 |
target_voice = args.voice if args.voice else target_voice_file.name if target_voice_file else None
|
706 |
custom_model = None
|
@@ -771,9 +780,9 @@ def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_m
|
|
771 |
print(f"Error updating progress: {e}")
|
772 |
|
773 |
if use_custom_model:
|
774 |
-
convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model)
|
775 |
else:
|
776 |
-
convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language)
|
777 |
|
778 |
try:
|
779 |
progress(0.9, desc="Creating M4B from chapters")
|
@@ -807,6 +816,19 @@ def download_audiobooks():
|
|
807 |
return list_audiobook_files(audiobook_output_path)
|
808 |
|
809 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
810 |
# Gradio UI setup
|
811 |
def run_gradio_interface():
|
812 |
language_options = [
|
@@ -831,18 +853,82 @@ def run_gradio_interface():
|
|
831 |
"""
|
832 |
)
|
833 |
|
834 |
-
with gr.
|
835 |
-
with gr.
|
836 |
-
|
837 |
-
|
838 |
-
|
839 |
-
|
840 |
-
|
841 |
-
|
842 |
-
|
843 |
-
|
844 |
-
|
845 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
846 |
|
847 |
convert_btn = gr.Button("Convert to Audiobook", variant="primary")
|
848 |
output = gr.Textbox(label="Conversion Status")
|
@@ -852,7 +938,11 @@ def run_gradio_interface():
|
|
852 |
|
853 |
convert_btn.click(
|
854 |
convert_ebook_to_audio,
|
855 |
-
inputs=[
|
|
|
|
|
|
|
|
|
856 |
outputs=[output, audio_player]
|
857 |
)
|
858 |
|
@@ -880,6 +970,8 @@ def run_gradio_interface():
|
|
880 |
|
881 |
|
882 |
|
|
|
|
|
883 |
# Check if running in headless mode
|
884 |
if args.headless:
|
885 |
# If args.custom_model_url was provided, use it as the custom model URL
|
@@ -915,7 +1007,7 @@ if args.headless:
|
|
915 |
|
916 |
|
917 |
# Example headless execution
|
918 |
-
convert_ebook_to_audio(ebook_file_path, target_voice, args.language, args.use_custom_model, args.custom_model, args.custom_config, args.custom_vocab, custom_model_url)
|
919 |
|
920 |
else:
|
921 |
# Launch Gradio UI
|
|
|
72 |
"Examples include David Attenborough's model: "
|
73 |
"'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'. "
|
74 |
"More XTTS fine-tunes can be found on my Hugging Face at 'https://huggingface.co/drewThomasson'."))
|
75 |
+
parser.add_argument("--temperature", type=float, default=0.65, help="Temperature for the model. Defaults to 0.65. Higher Tempatures will lead to more creative outputs IE: more Hallucinations. Lower Tempatures will be more monotone outputs IE: less Hallucinations.")
|
76 |
+
parser.add_argument("--length_penalty", type=float, default=1.0, help="A length penalty applied to the autoregressive decoder. Defaults to 1.0.")
|
77 |
+
parser.add_argument("--repetition_penalty", type=float, default=2.0, help="A penalty that prevents the autoregressive decoder from repeating itself. Defaults to 2.0.")
|
78 |
+
parser.add_argument("--top_k", type=int, default=50, help="Top-k sampling. Lower values mean more likely outputs. Defaults to 50.")
|
79 |
+
parser.add_argument("--top_p", type=float, default=0.8, help="Top-p sampling. Lower values mean more likely outputs. Defaults to 0.8.")
|
80 |
+
parser.add_argument("--speed", type=float, default=1.0, help="Speed factor for the speech generation. Defaults to 1.0.")
|
81 |
+
# NOTE: argparse's type=bool applies bool() to the raw string, so any non-empty
# value (including "False") would parse as True. Parse the text explicitly instead;
# a bare --enable_text_splitting (no value) also turns the option on.
parser.add_argument("--enable_text_splitting", type=lambda value: str(value).strip().lower() in ("1", "true", "t", "yes", "y", "on"), nargs="?", const=True, default=False, help="Enable splitting text into sentences. Defaults to False.")
|
82 |
+
|
83 |
args = parser.parse_args()
|
84 |
|
85 |
|
|
|
588 |
|
589 |
# Convert chapters to audio using XTTS
|
590 |
|
591 |
+
def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None, temperature=0.65, length_penalty=1.0, repetition_penalty=2.0, top_k=50, top_p=0.8, speed=1.0, enable_text_splitting=False):
|
592 |
|
593 |
if target_voice_path==None:
|
594 |
target_voice_path = default_target_voice_path
|
|
|
643 |
print(f"Generating fragment: {fragment}...")
|
644 |
fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
|
645 |
if custom_model:
|
646 |
+
# Pass the generation options by keyword so each value binds to the parameter of
# the same name regardless of how inference() orders its optional parameters.
# NOTE(review): XTTS appears to place do_sample/num_beams before speed, which
# would mis-bind positional speed/enable_text_splitting — confirm against the installed TTS version.
out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=temperature, length_penalty=length_penalty, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, speed=speed, enable_text_splitting=enable_text_splitting)
|
647 |
torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
648 |
else:
|
649 |
speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
|
650 |
language_code = language if language else default_language_code
|
651 |
+
# Positional arguments may not follow keyword arguments (SyntaxError), so the
# generation options are forwarded by keyword under their own names.
tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code, temperature=temperature, length_penalty=length_penalty, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, speed=speed, enable_text_splitting=enable_text_splitting)
|
652 |
temp_count += 1
|
653 |
|
654 |
combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
|
|
|
657 |
|
658 |
|
659 |
|
660 |
+
def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language="en", temperature=0.65, length_penalty=1.0, repetition_penalty=2.0, top_k=50, top_p=0.8, speed=1.0, enable_text_splitting=False):
|
661 |
selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
|
662 |
tts = TTS(selected_tts_model, progress_bar=False).to(device)
|
663 |
|
|
|
698 |
print(f"Generating fragment: {fragment}...")
|
699 |
fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
|
700 |
speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
|
701 |
+
# Positional arguments may not follow keyword arguments (SyntaxError), so the
# generation options are forwarded by keyword under their own names.
tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language, temperature=temperature, length_penalty=length_penalty, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, speed=speed, enable_text_splitting=enable_text_splitting)
|
702 |
temp_count += 1
|
703 |
|
704 |
combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
|
|
|
708 |
|
709 |
|
710 |
# Define the functions to be used in the Gradio interface
|
711 |
+
def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url=None, temperature=0.65, length_penalty=1.0, repetition_penalty=2.0, top_k=50, top_p=0.8, speed=1.0, enable_text_splitting=False, progress=gr.Progress()):
|
712 |
+
|
713 |
ebook_file_path = args.ebook if args.ebook else ebook_file.name
|
714 |
target_voice = args.voice if args.voice else target_voice_file.name if target_voice_file else None
|
715 |
custom_model = None
|
|
|
780 |
print(f"Error updating progress: {e}")
|
781 |
|
782 |
if use_custom_model:
|
783 |
+
convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
|
784 |
else:
|
785 |
+
convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
|
786 |
|
787 |
try:
|
788 |
progress(0.9, desc="Creating M4B from chapters")
|
|
|
816 |
return list_audiobook_files(audiobook_output_path)
|
817 |
|
818 |
|
819 |
+
# Gradio UI setup
|
820 |
+
def run_gradio_interface():
|
821 |
+
language_options = [
|
822 |
+
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
|
823 |
+
]
|
824 |
+
|
825 |
+
theme = gr.themes.Soft(
|
826 |
+
primary_hue="blue",
|
827 |
+
secondary_hue="blue",
|
828 |
+
neutral_hue="blue",
|
829 |
+
text_size=gr.themes.sizes.text_md,
|
830 |
+
)
|
831 |
+
|
832 |
# Gradio UI setup
|
833 |
def run_gradio_interface():
|
834 |
language_options = [
|
|
|
853 |
"""
|
854 |
)
|
855 |
|
856 |
+
with gr.Tabs(): # Create tabs for better UI organization
|
857 |
+
with gr.TabItem("Input Options"):
|
858 |
+
with gr.Row():
|
859 |
+
with gr.Column(scale=3):
|
860 |
+
ebook_file = gr.File(label="eBook File")
|
861 |
+
target_voice_file = gr.File(label="Target Voice File (Optional)")
|
862 |
+
language = gr.Dropdown(label="Language", choices=language_options, value="en")
|
863 |
+
|
864 |
+
with gr.Column(scale=3):
|
865 |
+
use_custom_model = gr.Checkbox(label="Use Custom Model")
|
866 |
+
custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False)
|
867 |
+
custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False)
|
868 |
+
custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False)
|
869 |
+
custom_model_url = gr.Textbox(label="Custom Model Zip URL (Optional)", visible=False)
|
870 |
+
|
871 |
+
with gr.TabItem("Audio Generation Preferences"): # New tab for preferences
|
872 |
+
gr.Markdown(
|
873 |
+
"""
|
874 |
+
### Customize Audio Generation Parameters
|
875 |
+
|
876 |
+
Adjust the settings below to influence how the audio is generated. You can control the creativity, speed, repetition, and more.
|
877 |
+
"""
|
878 |
+
)
|
879 |
+
temperature = gr.Slider(
|
880 |
+
label="Temperature",
|
881 |
+
minimum=0.1,
|
882 |
+
maximum=2.0,
|
883 |
+
step=0.1,
|
884 |
+
value=0.65,
|
885 |
+
info="Higher values lead to more creative, unpredictable outputs. Lower values make it more monotone."
|
886 |
+
)
|
887 |
+
length_penalty = gr.Slider(
|
888 |
+
label="Length Penalty",
|
889 |
+
minimum=0.5,
|
890 |
+
maximum=3.0,
|
891 |
+
step=0.1,
|
892 |
+
value=1.0,
|
893 |
+
info="Penalize longer sequences. Higher values produce shorter outputs."
|
894 |
+
)
|
895 |
+
repetition_penalty = gr.Slider(
|
896 |
+
label="Repetition Penalty",
|
897 |
+
minimum=1.0,
|
898 |
+
maximum=5.0,
|
899 |
+
step=0.1,
|
900 |
+
value=2.0,
|
901 |
+
info="Penalizes repeated phrases. Higher values reduce repetition."
|
902 |
+
)
|
903 |
+
top_k = gr.Slider(
|
904 |
+
label="Top-k Sampling",
|
905 |
+
minimum=10,
|
906 |
+
maximum=100,
|
907 |
+
step=1,
|
908 |
+
value=50,
|
909 |
+
info="Lower values restrict outputs to more likely words."
|
910 |
+
)
|
911 |
+
top_p = gr.Slider(
|
912 |
+
label="Top-p Sampling",
|
913 |
+
minimum=0.1,
|
914 |
+
maximum=1.0,
|
915 |
+
step=0.1,
|
916 |
+
value=0.8,
|
917 |
+
info="Controls cumulative probability for word selection. Lower values make the output more predictable."
|
918 |
+
)
|
919 |
+
speed = gr.Slider(
|
920 |
+
label="Speed",
|
921 |
+
minimum=0.5,
|
922 |
+
maximum=4.0,
|
923 |
+
step=0.1,
|
924 |
+
value=1.0,
|
925 |
+
info="Adjusts the playback speed of the generated audio."
|
926 |
+
)
|
927 |
+
enable_text_splitting = gr.Checkbox(
|
928 |
+
label="Enable Text Splitting",
|
929 |
+
value=False,
|
930 |
+
info="Splits long texts into sentences to generate audio in chunks. Useful for very long inputs."
|
931 |
+
)
|
932 |
|
933 |
convert_btn = gr.Button("Convert to Audiobook", variant="primary")
|
934 |
output = gr.Textbox(label="Conversion Status")
|
|
|
938 |
|
939 |
convert_btn.click(
|
940 |
convert_ebook_to_audio,
|
941 |
+
inputs=[
|
942 |
+
ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file,
|
943 |
+
custom_vocab_file, custom_model_url, temperature, length_penalty, repetition_penalty,
|
944 |
+
top_k, top_p, speed, enable_text_splitting
|
945 |
+
],
|
946 |
outputs=[output, audio_player]
|
947 |
)
|
948 |
|
|
|
970 |
|
971 |
|
972 |
|
973 |
+
|
974 |
+
|
975 |
# Check if running in headless mode
|
976 |
if args.headless:
|
977 |
# If args.custom_model_url was provided, use it as the custom model URL
|
|
|
1007 |
|
1008 |
|
1009 |
# Example headless execution
|
1010 |
+
convert_ebook_to_audio(ebook_file_path, target_voice, args.language, args.use_custom_model, args.custom_model, args.custom_config, args.custom_vocab, custom_model_url, args.temperature, args.length_penalty, args.repetition_penalty, args.top_k, args.top_p, args.speed, args.enable_text_splitting)
|
1011 |
|
1012 |
else:
|
1013 |
# Launch Gradio UI
|