File size: 2,477 Bytes
4e01295
 
c764bfb
 
 
 
 
6fa7850
c764bfb
4e01295
 
6fa7850
 
4e01295
 
c764bfb
 
6fa7850
 
 
 
 
 
 
c764bfb
6fa7850
 
 
c764bfb
6fa7850
 
 
 
 
 
 
c764bfb
6fa7850
c764bfb
6fa7850
 
c764bfb
6fa7850
c764bfb
6fa7850
 
 
c764bfb
6fa7850
 
c764bfb
6fa7850
 
c764bfb
 
6fa7850
 
c764bfb
6fa7850
c764bfb
6fa7850
c764bfb
6fa7850
 
 
 
c764bfb
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr
from pdfminer.high_level import extract_text
import logging
from typing import cast
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# Module-level TTS engine shared by the UI callbacks. It starts as None and is
# replaced with a TTS instance once a model has been selected and loaded.
tts = None

def read_pdf(file):
    """Extract the text content of an uploaded PDF.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF on disk.

    Returns:
        The result of ``extract_text`` applied to the opened file.
    """
    pdf_path = file.name
    # Open in binary mode and hand the stream to pdfminer.
    with open(pdf_path, "rb") as pdf_stream:
        return extract_text(pdf_stream)

def main():
    """Build and launch the PDF-to-speech Gradio app.

    The UI lets the user upload a PDF, pick one of the ``*_cpu.addon`` models
    listed in the ``balacoon/tts`` Hugging Face repository, pick a speaker
    supported by that model, and generate audio for the first 1024 characters
    of text extracted from the PDF.

    The app is assembled with ``gr.Blocks`` because the speaker dropdown has
    to be repopulated whenever the model changes, which requires an explicit
    event listener.
    """
    logging.basicConfig(level=logging.INFO)

    # Resolve the available models once, before building the UI.
    repo_files = list_repo_files(repo_id="balacoon/tts")
    model_files = [x for x in repo_files if x.endswith("_cpu.addon")]

    with gr.Blocks(title="PDF TO SPEECH CONVERTER") as demo:
        gr.Markdown("# PDF TO SPEECH CONVERTER")

        with gr.Row():
            file_input = gr.File(label="Select a PDF File", file_types=[".pdf"])

        with gr.Row():
            model_name = gr.Dropdown(label="Model", choices=model_files)
            speaker = gr.Dropdown(label="Speaker", choices=[])

        def set_model(model_name_str):
            """Load the selected model and refresh the speaker dropdown.

            Downloads ``model_name_str`` from the ``balacoon/tts`` repository,
            re-initializes the module-level ``tts`` object with it, and
            returns an update that sets the speaker dropdown's choices to the
            speakers reported by the model (the last one pre-selected).
            """
            global tts
            if not model_name_str:
                # Selection was cleared: nothing to load, empty the speakers.
                return gr.update(choices=[], value=None)

            model_path = hf_hub_download(
                repo_id="balacoon/tts", filename=model_name_str
            )
            tts = TTS(model_path)
            speakers = tts.get_speakers()
            default_speaker = speakers[-1] if speakers else None
            # The returned update is applied to the `outputs` component of the
            # listener below; mutating `speaker` directly would not reach the
            # browser.
            return gr.update(choices=speakers, value=default_speaker)

        model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row():
            generate = gr.Button("Generate")

        with gr.Row():
            audio = gr.Audio(label="Generated Audio")

        def synthesize_audio(file, speaker_str):
            """Synthesize speech for the text of the uploaded PDF.

            Returns a ``(sampling_rate, samples)`` tuple for the audio
            component, or ``None`` when there is nothing to synthesize
            (no file, no loaded model, no speaker, or no extractable text).
            """
            if file is None or not file.name:
                logging.info("No file selected.")
                return None
            if tts is None or not speaker_str:
                logging.info("No model or speaker selected.")
                return None

            text_str = read_pdf(file)
            if not text_str or not text_str.strip():
                logging.info("No text could be extracted from the PDF.")
                return None

            # Only the first 1024 characters are synthesized.
            text_str = text_str[:1024]

            engine = cast(TTS, tts)
            samples = engine.synthesize(text_str, speaker_str)
            return (engine.get_sampling_rate(), samples)

        generate.click(
            synthesize_audio, inputs=[file_input, speaker], outputs=audio
        )

    # Launch after the layout context has been closed.
    demo.launch(debug=True)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()