File size: 2,369 Bytes
4e01295
 
c764bfb
 
 
 
 
6fa7850
c764bfb
4e01295
 
6fa7850
 
4e01295
 
bed109a
 
 
 
 
 
 
 
 
 
e780472
7337c95
bed109a
c764bfb
 
d1d085b
 
1c30e88
e780472
6fa7850
d1d085b
 
c764bfb
d1d085b
 
 
 
 
 
 
 
 
6fa7850
d1d085b
 
 
6fa7850
d1d085b
 
 
c764bfb
18b6735
c764bfb
c5fd4f3
 
5b13b80
c5fd4f3
 
 
 
 
7337c95
e780472
 
 
868b522
c764bfb
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gradio as gr
from pdfminer.high_level import extract_text
import logging
from typing import cast
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# Module-level TTS engine shared by the functions in this file. It starts as
# None and is replaced by `set_model` with a TTS built from the selected model.
tts = None

def read_pdf(file):
    """
    Extracts the text of a PDF.

    `file` is an object whose `name` attribute is the path of the PDF on
    disk. The file is opened in binary mode, handed to pdfminer's
    `extract_text`, and the extracted text is returned.
    """
    pdf_path = file.name
    with open(pdf_path, "rb") as pdf_stream:
        return extract_text(pdf_stream)

def set_model(model_name_str):
    """
    Downloads the file `model_name_str` from the "balacoon/tts" hub
    repository and rebuilds the global `tts` object from it.

    Returns a pair: the speakers reported by the freshly loaded model,
    and the default selection (the first speaker, or None when the
    model reports no speakers).
    """
    global tts
    tts = TTS(hf_hub_download(repo_id="balacoon/tts", filename=model_name_str))
    available_speakers = tts.get_speakers()
    if available_speakers:
        default_speaker = available_speakers[0]
    else:
        default_speaker = None
    return available_speakers, default_speaker

def main():
    """
    Builds and launches the "PDF TO SPEECH CONVERTER" Gradio interface.

    Lists the files of the "balacoon/tts" hub repository, offers those
    whose name ends in "_cpu.addon" as selectable models, and wires a
    PDF file input plus model/speaker dropdowns to `synthesize_audio`,
    whose result is shown in an audio output.
    """
    logging.basicConfig(level=logging.INFO)
    repo_files = list_repo_files(repo_id="balacoon/tts")
    model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
    model_name_dropdown = gr.inputs.Dropdown(label="Model", choices=model_files)
    speaker_dropdown = gr.inputs.Dropdown(label="Speaker", choices=[])

    file_input = gr.inputs.File(label="Select a PDF File", type="file")

    def synthesize_audio(file, model_name_str, speaker_str):
        """
        Synthesizes speech from the text of the selected PDF `file`
        using the speaker `speaker_str`.

        `model_name_str` is received from the interface but not used
        here; the model is whatever the global `tts` currently holds.

        Returns a `(sampling_rate, samples)` tuple, or None when no file
        is selected or no model has been loaded yet.
        """
        if file is None or file.name == "":
            logging.info("No file selected.")
            return None

        # Bind the global once so the None check and the calls below
        # operate on the same object.
        engine = tts
        if engine is None:
            # Without a loaded model there is nothing to synthesize with;
            # bail out the same way as for a missing file instead of
            # failing with an AttributeError on None.
            logging.info("No model loaded.")
            return None

        # Only the first 1024 characters are synthesized; slicing is a
        # no-op for shorter texts.
        text_str = read_pdf(file)[:1024]

        samples = engine.synthesize(text_str, speaker_str)
        return (engine.get_sampling_rate(), samples)

    audio = gr.outputs.Audio(label="Generated Audio", type="numpy")

    iface = gr.Interface(
        fn=synthesize_audio,
        inputs=[file_input, model_name_dropdown, speaker_dropdown],
        outputs=audio,
        title="PDF TO SPEECH CONVERTER",
        layout="rows",
        debug=True
    )

    # Register the model-loading callback once (it was previously
    # registered twice in a row).
    # NOTE(review): confirm that the installed gradio version's Dropdown
    # actually provides `set_action` and how it consumes the
    # (speakers, value) pair returned by `set_model`.
    model_name_dropdown.set_action(set_model)

    iface.launch()


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()