"""Gradio demo that converts the text of an uploaded PDF to speech.

The text is extracted with pdfminer and synthesized with a Balacoon TTS
model downloaded from the Hugging Face Hub.
"""

import logging
import os
from typing import cast

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files
from pdfminer.high_level import extract_text

logger = logging.getLogger(__name__)

# Hugging Face repository the model files are listed in and downloaded from.
MODEL_REPO_ID = "balacoon/tts"
# Only this many leading characters of the PDF text are synthesized.
MAX_TEXT_LENGTH = 1024

# Global TTS module, initialized from a selected model
tts = None
# File name of the model the global `tts` was built from (None if not loaded
# through `_ensure_model`).
_loaded_model_name = None


def read_pdf(file):
    """Return the text extracted from a PDF.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (e.g. an uploaded temp file).

    Returns:
        The text produced by ``pdfminer.high_level.extract_text``.
    """
    # A pathlib.Path also has ``.name`` (the basename only), so paths must be
    # recognised explicitly before falling back to the attribute.
    path = file if isinstance(file, (str, os.PathLike)) else file.name
    return extract_text(path)


def _ensure_model(model_name_str):
    """Make the global ``tts`` match ``model_name_str`` and return it.

    The model is downloaded and instantiated only when no model is loaded yet
    or a different one is requested. With an empty ``model_name_str`` an
    already-initialized global ``tts`` is reused as-is.

    Raises:
        ValueError: If no model name is given and none has been loaded.
    """
    global tts, _loaded_model_name
    if model_name_str and (tts is None or model_name_str != _loaded_model_name):
        logger.info("Loading TTS model %s", model_name_str)
        model_path = hf_hub_download(
            repo_id=MODEL_REPO_ID, filename=model_name_str
        )
        tts = TTS(model_path)
        _loaded_model_name = model_name_str
    if tts is None:
        raise ValueError("Select a model before generating audio.")
    return cast(TTS, tts)


def synthesize_audio(file, model_name_str, speaker_str):
    """Synthesize speech for the beginning of an uploaded PDF.

    Args:
        file: The uploaded PDF, in any form accepted by ``read_pdf``.
        model_name_str: File name of the model inside ``MODEL_REPO_ID``.
        speaker_str: Speaker to synthesize with; an empty value is passed on
            as ``""`` (presumably valid for single-speaker models -- confirm
            against the balacoon_tts documentation).

    Returns:
        A ``(sampling_rate, samples)`` tuple, the order Gradio's numpy audio
        output expects.

    Raises:
        ValueError: If no file or model is available, or the PDF yields no
            text.
    """
    if file is None:
        raise ValueError("Upload a PDF file before generating audio.")
    engine = _ensure_model(model_name_str)

    text_str = read_pdf(file)[:MAX_TEXT_LENGTH]
    if not text_str.strip():
        raise ValueError("No text could be extracted from the PDF.")

    samples = engine.synthesize(text_str, speaker_str or "")
    return engine.get_sampling_rate(), samples


def main():
    """Build the Gradio UI and serve it until the process is stopped."""
    logging.basicConfig(level=logging.INFO)

    # NOTE(review): this lists every file in the repository, not only model
    # files -- consider filtering by the model file extension.
    model_files = list_repo_files(repo_id=MODEL_REPO_ID)

    with gr.Blocks(title="PDF TO SPEECH CONVERTER") as demo:
        gr.Markdown("# PDF TO SPEECH CONVERTER")
        with gr.Row():
            file_input = gr.File(label="Upload PDF")
        with gr.Row():
            model_name_dropdown = gr.Dropdown(label="Model", choices=model_files)
            speaker_dropdown = gr.Dropdown(label="Speaker", choices=[])
        generate_button = gr.Button("Generate")
        audio = gr.Audio(label="Generated Audio", type="numpy")

        def set_model(model_name_str: str):
            """Load the chosen model and repopulate the speaker dropdown."""
            if not model_name_str:
                return gr.update(choices=[], value=None)
            speakers = _ensure_model(model_name_str).get_speakers()
            # Preselect the last speaker; a model may report no speakers.
            default_speaker = speakers[-1] if speakers else None
            return gr.update(choices=speakers, value=default_speaker)

        # Mutating ``speaker_dropdown.choices`` from a callback does not
        # refresh the rendered component; the update has to be returned as
        # the output of an event listener.
        model_name_dropdown.change(
            set_model, inputs=model_name_dropdown, outputs=speaker_dropdown
        )
        generate_button.click(
            synthesize_audio,
            inputs=[file_input, model_name_dropdown, speaker_dropdown],
            outputs=audio,
        )

    demo.launch(debug=True)


if __name__ == "__main__":
    main()