File size: 2,355 Bytes
33051d3
 
4e01295
c764bfb
33051d3
 
 
 
 
c764bfb
4e01295
33051d3
4e01295
 
9a433d2
 
 
 
 
33051d3
bed109a
c764bfb
33051d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fa7850
33051d3
 
 
 
 
 
 
c764bfb
33051d3
c764bfb
33051d3
7337c95
33051d3
e780472
33051d3
 
c764bfb
33051d3
 
 
 
 
c764bfb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import logging
from typing import cast
from pdfminer.high_level import extract_text
from huggingface_hub import hf_hub_download, list_repo_files
import gradio as gr
from balacoon_tts import TTS

# Global TTS module, initialized from a selected model.
# Starts as None; set_model() (defined inside main()) rebinds it to a TTS
# instance once a model is chosen, and synthesize_audio() reads it.
tts = None

def read_pdf(file) -> str:
    """Extract the text content of an uploaded PDF.

    Args:
        file: Either a filesystem path (``str``) or an object exposing the
            path through a ``name`` attribute (e.g. a temporary-file wrapper
            handed over by the upload widget).

    Returns:
        The text that ``extract_text`` pulls out of the PDF.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        # Give the user an actionable message instead of
        # "'NoneType' object has no attribute 'name'".
        raise ValueError("No PDF file was provided.")
    # A plain path string has no ``name`` attribute, so it is used as-is.
    pdf_path = getattr(file, "name", file)
    return extract_text(pdf_path)

def synthesize_audio(file, model_name_str, speaker_str, *, max_chars=1024):
    """Read a PDF and synthesize speech for the beginning of its text.

    Args:
        file: Uploaded PDF, forwarded unchanged to ``read_pdf``.
        model_name_str: Name of the selected model. Not used here: the model
            is whatever the module-level ``tts`` currently holds. The
            parameter is kept so the function matches the UI inputs.
        speaker_str: Speaker identifier passed to ``tts.synthesize``.
        max_chars: Maximum number of characters of PDF text to synthesize
            (default 1024). ``None`` disables the limit.

    Returns:
        A ``(sampling_rate, samples)`` tuple, i.e. the order the Gradio
        Audio output component expects for raw audio.

    Raises:
        RuntimeError: If no model has been loaded yet (``tts`` is ``None``).
    """
    # Check the model first so we fail fast, before parsing the PDF.
    if tts is None:
        raise RuntimeError("No TTS model is loaded; select a model first.")
    # Slicing already copes with text shorter than the limit.
    text_str = read_pdf(file)[:max_chars]
    samples = tts.synthesize(text_str, speaker_str)
    # Sample rate goes first: the original (samples, rate) order was swapped.
    return (tts.get_sampling_rate(), samples)

def main():
    """Build the PDF-to-speech Gradio UI and launch it.

    The page consists of a PDF upload widget, a model dropdown populated
    from the files of the ``balacoon/tts`` Hugging Face repository, a
    speaker dropdown that is refreshed whenever a model is chosen, a
    "Generate" button and an audio player. Pressing the button calls
    ``synthesize_audio`` with the current file, model and speaker and sends
    its return value to the audio player.

    The UI is assembled with ``gr.Blocks`` and standard event listeners
    (``change`` / ``click``); the previous version relied on
    ``set_action`` / ``add_input`` / ``add_output`` / ``run``, which are not
    part of Gradio's API.
    """
    logging.basicConfig(level=logging.INFO)
    with gr.Blocks(title="PDF TO SPEECH CONVERTER") as demo:
        gr.Markdown(
            """
            <h1 align="center">PDF TO SPEECH CONVERTER</h1>
            1. Insert a PDF 
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!
            When you select a model for the first time, it may take some time to download it.
            This project is designed to bring the joy of reading without the hassle of looking over.
            If you want an audiobook, you've got it!
            """
        )

        file_input = gr.File(label="Upload PDF")

        # NOTE(review): every file in the repo is offered as a model choice,
        # as before; presumably non-model files (README etc.) should be
        # filtered out -- confirm against the repo contents.
        model_files = list_repo_files(repo_id="balacoon/tts")
        model_name_dropdown = gr.Dropdown(label="Model", choices=model_files)
        speaker_dropdown = gr.Dropdown(label="Speaker", choices=[])

        def set_model(model_name_str: str):
            """Load the chosen model into the global ``tts`` and refresh speakers.

            Returns a Gradio update for the speaker dropdown carrying the
            speakers reported by the model, preselecting the last one.
            """
            global tts
            if not model_name_str:
                # Selection was cleared: nothing to download, empty the list.
                return gr.update(choices=[], value=None)
            model_path = hf_hub_download(repo_id="balacoon/tts", filename=model_name_str)
            tts = TTS(model_path)
            speakers = tts.get_speakers()
            # Guard against a model that reports no speakers.
            default_speaker = speakers[-1] if speakers else None
            return gr.update(choices=speakers, value=default_speaker)

        # Reload the model and repopulate the speakers on every model change.
        model_name_dropdown.change(
            set_model, inputs=model_name_dropdown, outputs=speaker_dropdown
        )

        generate_button = gr.Button("Generate")
        audio = gr.Audio(label="Generated Audio")

        generate_button.click(
            synthesize_audio,
            inputs=[file_input, model_name_dropdown, speaker_dropdown],
            outputs=audio,
        )

    # debug=True mirrors the flag the original passed when creating the UI.
    demo.launch(debug=True)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()