File size: 2,266 Bytes
4e01295
 
c764bfb
 
 
 
 
6fa7850
c764bfb
4e01295
 
6fa7850
 
4e01295
 
bed109a
 
 
 
 
 
 
 
 
 
 
 
 
 
c764bfb
 
d1d085b
 
1c30e88
d1d085b
6fa7850
d1d085b
 
c764bfb
d1d085b
 
 
 
 
 
 
 
 
6fa7850
d1d085b
 
 
6fa7850
d1d085b
 
 
c764bfb
18b6735
c764bfb
c5fd4f3
 
bed109a
c5fd4f3
 
 
 
 
d1d085b
c764bfb
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
from pdfminer.high_level import extract_text
import logging
from typing import cast
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# Global TTS engine shared by the functions in this module.
# It starts out as None and is rebound by set_model() once a model file
# has been downloaded and loaded.
tts = None

def read_pdf(file):
    """
    Extract the text content of an uploaded PDF.

    `file` is any object exposing the on-disk path of the PDF through its
    `name` attribute. The file is opened in binary mode and handed to
    pdfminer's `extract_text`, whose result is returned unchanged.
    """
    with open(file.name, "rb") as pdf_stream:
        return extract_text(pdf_stream)

def set_model(model_name_str):
    """
    Load a model and make it the active global `tts` engine.

    Downloads `model_name_str` from the "balacoon/tts" hub repository and
    re-initializes the global `tts` object from the downloaded file.

    The speaker names are returned rather than pushed into a UI widget:
    the `speaker` dropdown is a local of `main()` and is not visible at
    module scope, so assigning to it from here raised NameError.

    Args:
        model_name_str: File name of the model inside the hub repository.

    Returns:
        List of speaker names supported by the freshly loaded model.
    """
    global tts
    model_path = hf_hub_download(repo_id="balacoon/tts", filename=model_name_str)
    tts = TTS(model_path)
    return list(tts.get_speakers())


def main():
    """
    Build the PDF-to-speech Gradio interface and launch it.

    The list of selectable models is every file in the "balacoon/tts" hub
    repository whose name ends with "_cpu.addon".
    """
    logging.basicConfig(level=logging.INFO)
    repo_files = list_repo_files(repo_id="balacoon/tts")
    model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
    model_name_dropdown = gr.inputs.Dropdown(label="Model", choices=model_files)
    # The speaker list depends on the selected model, so it is unknown when
    # the UI is built; synthesize_audio() falls back to a default speaker
    # when the submitted value is not one the loaded model supports.
    speaker = gr.inputs.Dropdown(label="Speaker", choices=[])

    file_input = gr.inputs.File(label="Select a PDF File", type="file")

    # Remembers which model the global `tts` was built from (and its
    # speakers) so the model is only reloaded when the selection changes.
    loaded = {"model": None, "speakers": []}

    def synthesize_audio(file, model_name_str, speaker_str):
        """
        Synthesize speech for the text of the selected PDF.

        Args:
            file: Uploaded PDF (object with a `name` path attribute), or None.
            model_name_str: Model file name chosen in the "Model" dropdown.
            speaker_str: Speaker chosen in the "Speaker" dropdown; replaced
                by the model's last speaker when it is not a valid choice.

        Returns:
            `(sampling_rate, samples)` on success, or None when there is
            nothing to synthesize (no file, no model, no speakers, no text).
        """
        if file is None or file.name == "":
            logging.info("No file selected.")
            return None
        if not model_name_str:
            logging.info("No model selected.")
            return None

        # Nothing else ever initializes `tts`, so load the model on demand;
        # otherwise the synthesize() call below would run on None.
        if tts is None or loaded["model"] != model_name_str:
            loaded["speakers"] = set_model(model_name_str)
            loaded["model"] = model_name_str

        speakers = loaded["speakers"]
        if not speakers:
            logging.error("Model %s reports no speakers.", model_name_str)
            return None
        if speaker_str not in speakers:
            # Default to the model's last speaker.
            speaker_str = speakers[-1]

        # Only the first 1024 characters of the document are synthesized.
        text_str = read_pdf(file)[:1024]
        if not text_str.strip():
            logging.info("No text could be extracted from the PDF.")
            return None

        engine = cast(TTS, tts)
        samples = engine.synthesize(text_str, speaker_str)
        return (engine.get_sampling_rate(), samples)

    audio = gr.outputs.Audio(label="Generated Audio", type="numpy")

    iface = gr.Interface(
        fn=synthesize_audio,
        inputs=[file_input, model_name_dropdown, speaker],
        outputs=audio,
        title="PDF TO SPEECH CONVERTER",
        layout="rows",
        debug=True
    )
    iface.launch()


# Build and launch the web UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()