"""Gradio demo that turns an uploaded PDF into speech with Balacoon TTS."""

import logging
import os
from typing import cast

import gradio as gr
import pdfminer
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files
from pdfminer.high_level import extract_text

# global tts module, initialized from a model selected
tts = None

# Longer texts are truncated to this many characters before synthesis.
MAX_TEXT_LENGTH = 1024

# Banner shown at the top of the page. Kept at column 0 so the Markdown
# renderer never mistakes indented lines for a code block.
INTRO_MARKDOWN = """

PDF TO SPEECH CONVERTER

1. insert a pdf
2. Select the model to synthesize with
3. Select speaker
4. Hit "Generate" and listen to the result!

When you select model for the first time, it will take a little time to download it.
this project is designed to take the love of reading without the hassle of looking over.
if you want an audio book , you now got it .
"""


def read_pdf(file):
    """Return the text extracted from a PDF.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``), an object
            exposing the path as ``.name`` (e.g. an uploaded temp file), or
            ``None`` when nothing has been uploaded.

    Returns:
        The extracted text, or an empty string when ``file`` is ``None``.
    """
    if file is None:
        return ""
    # Paths are checked first because ``pathlib.Path.name`` is only the
    # basename, not a usable location.
    if isinstance(file, (str, os.PathLike)):
        path = file
    else:
        path = file.name
    return extract_text(path)


def main():
    """Build the PDF-to-speech Blocks UI and launch it."""
    logging.basicConfig(level=logging.INFO)

    with gr.Blocks() as demo:
        gr.Markdown(INTRO_MARKDOWN)

        with gr.Row(variant="panel"):
            # Only the component is created here; the PDF is read inside the
            # click handler, once a file has actually been uploaded.
            pdf_file = gr.inputs.File()

        with gr.Row():
            with gr.Column(variant="panel"):
                repo_files = list_repo_files(repo_id="balacoon/tts")
                model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

            def set_model(model_name_str: str):
                """
                gets value from `model_name`, loads model,
                re-initializes tts object, gets list of
                speakers that model supports and set them to `speaker`
                """
                model_path = hf_hub_download(
                    repo_id="balacoon/tts", filename=model_name_str
                )
                global tts
                tts = TTS(model_path)
                speakers = tts.get_speakers()
                # Guard the index: a model may report no speakers at all.
                value = speakers[-1] if speakers else None
                return gr.Dropdown.update(
                    choices=speakers, value=value, visible=True
                )

            model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(uploaded_pdf, speaker_str: str = ""):
            """
            reads the text of the uploaded PDF and the speaker name from the
            `speaker` dropdown list. speaker name might be empty for
            single-speaker models. Synthesizes the waveform and updates
            `audio` with it. Returns None (no audio) when there is no text
            or no model has been selected yet.
            """
            text_str = read_pdf(uploaded_pdf)
            if not text_str:
                logging.info("text or speaker are not provided")
                return None
            if tts is None:
                logging.info("model is not selected")
                return None
            engine = cast(TTS, tts)
            if len(text_str) > MAX_TEXT_LENGTH:
                text_str = text_str[:MAX_TEXT_LENGTH]
            # The dropdown yields None when nothing is selected.
            samples = engine.synthesize(text_str, speaker_str or "")
            return gr.Audio.update(value=(engine.get_sampling_rate(), samples))

        # Event inputs must be components, so the File component is passed
        # and its uploaded value is handed to the callback at click time.
        generate.click(synthesize_audio, inputs=[pdf_file, speaker], outputs=audio)

    demo.launch()


if __name__ == "__main__":
    main()