drewThomasson committed on
Commit
bc1eafc
·
verified ·
1 Parent(s): 771c7ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -22
app.py CHANGED
@@ -72,6 +72,14 @@ parser.add_argument("--custom_model_url", type=str,
72
  "Examples include David Attenborough's model: "
73
  "'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'. "
74
  "More XTTS fine-tunes can be found on my Hugging Face at 'https://huggingface.co/drewThomasson'."))
 
 
 
 
 
 
 
 
75
  args = parser.parse_args()
76
 
77
 
@@ -580,7 +588,7 @@ from tqdm import tqdm
580
 
581
  # Convert chapters to audio using XTTS
582
 
583
- def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None):
584
 
585
  if target_voice_path==None:
586
  target_voice_path = default_target_voice_path
@@ -635,12 +643,12 @@ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, targe
635
  print(f"Generating fragment: {fragment}...")
636
  fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
637
  if custom_model:
638
- out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=0.7)
639
  torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
640
  else:
641
  speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
642
  language_code = language if language else default_language_code
643
- tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code)
644
  temp_count += 1
645
 
646
  combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
@@ -649,7 +657,7 @@ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, targe
649
 
650
 
651
 
652
- def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language="en"):
653
  selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
654
  tts = TTS(selected_tts_model, progress_bar=False).to(device)
655
 
@@ -690,7 +698,7 @@ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, tar
690
  print(f"Generating fragment: {fragment}...")
691
  fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
692
  speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
693
- tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language)
694
  temp_count += 1
695
 
696
  combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
@@ -700,7 +708,8 @@ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, tar
700
 
701
 
702
  # Define the functions to be used in the Gradio interface
703
- def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url=None, progress=gr.Progress()):
 
704
  ebook_file_path = args.ebook if args.ebook else ebook_file.name
705
  target_voice = args.voice if args.voice else target_voice_file.name if target_voice_file else None
706
  custom_model = None
@@ -771,9 +780,9 @@ def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_m
771
  print(f"Error updating progress: {e}")
772
 
773
  if use_custom_model:
774
- convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model)
775
  else:
776
- convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language)
777
 
778
  try:
779
  progress(0.9, desc="Creating M4B from chapters")
@@ -807,6 +816,19 @@ def download_audiobooks():
807
  return list_audiobook_files(audiobook_output_path)
808
 
809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
  # Gradio UI setup
811
  def run_gradio_interface():
812
  language_options = [
@@ -831,18 +853,82 @@ def run_gradio_interface():
831
  """
832
  )
833
 
834
- with gr.Row():
835
- with gr.Column(scale=3):
836
- ebook_file = gr.File(label="eBook File")
837
- target_voice_file = gr.File(label="Target Voice File (Optional)")
838
- language = gr.Dropdown(label="Language", choices=language_options, value="en")
839
-
840
- with gr.Column(scale=3):
841
- use_custom_model = gr.Checkbox(label="Use Custom Model")
842
- custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False)
843
- custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False)
844
- custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False)
845
- custom_model_url = gr.Textbox(label="Custom Model Zip URL (Optional)", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
 
847
  convert_btn = gr.Button("Convert to Audiobook", variant="primary")
848
  output = gr.Textbox(label="Conversion Status")
@@ -852,7 +938,11 @@ def run_gradio_interface():
852
 
853
  convert_btn.click(
854
  convert_ebook_to_audio,
855
- inputs=[ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url],
 
 
 
 
856
  outputs=[output, audio_player]
857
  )
858
 
@@ -880,6 +970,8 @@ def run_gradio_interface():
880
 
881
 
882
 
 
 
883
  # Check if running in headless mode
884
  if args.headless:
885
  # If args.custom_model_url is set, use it as the custom_model_url
@@ -915,7 +1007,7 @@ if args.headless:
915
 
916
 
917
  # Example headless execution
918
- convert_ebook_to_audio(ebook_file_path, target_voice, args.language, args.use_custom_model, args.custom_model, args.custom_config, args.custom_vocab, custom_model_url)
919
 
920
  else:
921
  # Launch Gradio UI
 
72
  "Examples include David Attenborough's model: "
73
  "'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'. "
74
  "More XTTS fine-tunes can be found on my Hugging Face at 'https://huggingface.co/drewThomasson'."))
75
+ parser.add_argument("--temperature", type=float, default=0.65, help="Temperature for the model. Defaults to 0.65. Higher temperatures lead to more creative outputs, i.e. more hallucinations. Lower temperatures lead to more monotone outputs, i.e. fewer hallucinations.")
76
+ parser.add_argument("--length_penalty", type=float, default=1.0, help="A length penalty applied to the autoregressive decoder. Defaults to 1.0.")
77
+ parser.add_argument("--repetition_penalty", type=float, default=2.0, help="A penalty that prevents the autoregressive decoder from repeating itself. Defaults to 2.0.")
78
+ parser.add_argument("--top_k", type=int, default=50, help="Top-k sampling. Lower values mean more likely outputs. Defaults to 50.")
79
+ parser.add_argument("--top_p", type=float, default=0.8, help="Top-p sampling. Lower values mean more likely outputs. Defaults to 0.8.")
80
+ parser.add_argument("--speed", type=float, default=1.0, help="Speed factor for the speech generation. Defaults to 1.0.")
81
+ parser.add_argument("--enable_text_splitting", type=bool, default=False, help="Enable splitting text into sentences. Defaults to False.")
82
+
83
  args = parser.parse_args()
84
 
85
 
 
588
 
589
  # Convert chapters to audio using XTTS
590
 
591
+ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting):
592
 
593
  if target_voice_path==None:
594
  target_voice_path = default_target_voice_path
 
643
  print(f"Generating fragment: {fragment}...")
644
  fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
645
  if custom_model:
646
+ out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
647
  torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
648
  else:
649
  speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
650
  language_code = language if language else default_language_code
651
+ tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
652
  temp_count += 1
653
 
654
  combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
 
657
 
658
 
659
 
660
+ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language="en", temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting):
661
  selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
662
  tts = TTS(selected_tts_model, progress_bar=False).to(device)
663
 
 
698
  print(f"Generating fragment: {fragment}...")
699
  fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
700
  speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
701
+ tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
702
  temp_count += 1
703
 
704
  combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
 
708
 
709
 
710
  # Define the functions to be used in the Gradio interface
711
+ def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url=None, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting, progress=gr.Progress()):
712
+
713
  ebook_file_path = args.ebook if args.ebook else ebook_file.name
714
  target_voice = args.voice if args.voice else target_voice_file.name if target_voice_file else None
715
  custom_model = None
 
780
  print(f"Error updating progress: {e}")
781
 
782
  if use_custom_model:
783
+ convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
784
  else:
785
+ convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting)
786
 
787
  try:
788
  progress(0.9, desc="Creating M4B from chapters")
 
816
  return list_audiobook_files(audiobook_output_path)
817
 
818
 
819
+ # Gradio UI setup
820
+ def run_gradio_interface():
821
+ language_options = [
822
+ "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
823
+ ]
824
+
825
+ theme = gr.themes.Soft(
826
+ primary_hue="blue",
827
+ secondary_hue="blue",
828
+ neutral_hue="blue",
829
+ text_size=gr.themes.sizes.text_md,
830
+ )
831
+
832
  # Gradio UI setup
833
  def run_gradio_interface():
834
  language_options = [
 
853
  """
854
  )
855
 
856
+ with gr.Tabs(): # Create tabs for better UI organization
857
+ with gr.TabItem("Input Options"):
858
+ with gr.Row():
859
+ with gr.Column(scale=3):
860
+ ebook_file = gr.File(label="eBook File")
861
+ target_voice_file = gr.File(label="Target Voice File (Optional)")
862
+ language = gr.Dropdown(label="Language", choices=language_options, value="en")
863
+
864
+ with gr.Column(scale=3):
865
+ use_custom_model = gr.Checkbox(label="Use Custom Model")
866
+ custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False)
867
+ custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False)
868
+ custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False)
869
+ custom_model_url = gr.Textbox(label="Custom Model Zip URL (Optional)", visible=False)
870
+
871
+ with gr.TabItem("Audio Generation Preferences"): # New tab for preferences
872
+ gr.Markdown(
873
+ """
874
+ ### Customize Audio Generation Parameters
875
+
876
+ Adjust the settings below to influence how the audio is generated. You can control the creativity, speed, repetition, and more.
877
+ """
878
+ )
879
+ temperature = gr.Slider(
880
+ label="Temperature",
881
+ minimum=0.1,
882
+ maximum=2.0,
883
+ step=0.1,
884
+ value=0.65,
885
+ info="Higher values lead to more creative, unpredictable outputs. Lower values make it more monotone."
886
+ )
887
+ length_penalty = gr.Slider(
888
+ label="Length Penalty",
889
+ minimum=0.5,
890
+ maximum=3.0,
891
+ step=0.1,
892
+ value=1.0,
893
+ info="Penalize longer sequences. Higher values produce shorter outputs."
894
+ )
895
+ repetition_penalty = gr.Slider(
896
+ label="Repetition Penalty",
897
+ minimum=1.0,
898
+ maximum=5.0,
899
+ step=0.1,
900
+ value=2.0,
901
+ info="Penalizes repeated phrases. Higher values reduce repetition."
902
+ )
903
+ top_k = gr.Slider(
904
+ label="Top-k Sampling",
905
+ minimum=10,
906
+ maximum=100,
907
+ step=1,
908
+ value=50,
909
+ info="Lower values restrict outputs to more likely words."
910
+ )
911
+ top_p = gr.Slider(
912
+ label="Top-p Sampling",
913
+ minimum=0.1,
914
+ maximum=1.0,
915
+ step=0.1,
916
+ value=0.8,
917
+ info="Controls cumulative probability for word selection. Lower values make the output more predictable."
918
+ )
919
+ speed = gr.Slider(
920
+ label="Speed",
921
+ minimum=0.5,
922
+ maximum=4.0,
923
+ step=0.1,
924
+ value=1.0,
925
+ info="Adjusts the playback speed of the generated audio."
926
+ )
927
+ enable_text_splitting = gr.Checkbox(
928
+ label="Enable Text Splitting",
929
+ value=False,
930
+ info="Splits long texts into sentences to generate audio in chunks. Useful for very long inputs."
931
+ )
932
 
933
  convert_btn = gr.Button("Convert to Audiobook", variant="primary")
934
  output = gr.Textbox(label="Conversion Status")
 
938
 
939
  convert_btn.click(
940
  convert_ebook_to_audio,
941
+ inputs=[
942
+ ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file,
943
+ custom_vocab_file, custom_model_url, temperature, length_penalty, repetition_penalty,
944
+ top_k, top_p, speed, enable_text_splitting
945
+ ],
946
  outputs=[output, audio_player]
947
  )
948
 
 
970
 
971
 
972
 
973
+
974
+
975
  # Check if running in headless mode
976
  if args.headless:
977
  # If args.custom_model_url is set, use it as the custom_model_url
 
1007
 
1008
 
1009
  # Example headless execution
1010
+ convert_ebook_to_audio(ebook_file_path, target_voice, args.language, args.use_custom_model, args.custom_model, args.custom_config, args.custom_vocab, custom_model_url, args.temperature, args.length_penalty, args.repetition_penalty, args.top_k, args.top_p, args.speed, args.enable_text_splitting)
1011
 
1012
  else:
1013
  # Launch Gradio UI