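"""Kokoro TTS served as a serverless FastAPI endpoint on Modal (CPU-only).

POST JSON like {"text": "...", "voice": "af_heart"} to the root path and the
endpoint returns a 24 kHz, 16-bit PCM WAV file. Deploy with `modal deploy`.
"""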
import io
import modal
from fastapi import FastAPI, Request, status
from fastapi.responses import Response, JSONResponse

app = modal.App("kokoro-tts-api-cpu")


image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git", "libsndfile1", "espeak-ng")
    .pip_install(
        "torch==2.3.0",
        "soundfile",
        "kokoro>=0.9.4",
        "fastapi",
        "numpy",
    )
    # Rebuild soundfile from source so it links against the apt-installed
    # libsndfile1 instead of a wheel-bundled binary.
    .run_commands("pip install --force-reinstall --no-binary soundfile soundfile")
    # Point the Hugging Face cache at the mounted volume path.
    .env({"HF_HOME": "/cache"})
)

CACHE_PATH = "/cache"
# Persistent volume that keeps downloaded model weights across containers.
hf_cache = modal.Volume.from_name("kokoro-hf-cache", create_if_missing=True)

web_app = FastAPI(
    title="Kokoro TTS API",
    description="A serverless API for generating speech from text using the Kokoro model.",
    version="1.0.0"
)

# Kokoro voice IDs begin with a one-letter language code ("af_heart" -> "a"):
# a/b = US/UK English, e=es, f=fr, h=hi, i=it, j=ja, p=pt-br, z=zh.
VOICE_PREFIX_MAP = {"en": "a", "us": "a", "gb": "b", "uk": "b", "es": "e", "fr": "f"}

def voice_to_lang(voice: str) -> str:
    prefix = voice.split("_", 1)[0].lower()
    if prefix in VOICE_PREFIX_MAP:  # longer country-style prefixes like "en" or "gb"
        return VOICE_PREFIX_MAP[prefix]
    code = prefix[:1]
    return code if code in set("abefhijpz") else "a"
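# e.g. voice_to_lang("af_heart") -> "a", voice_to_lang("bf_emma") -> "b",
# voice_to_lang("en_whatever") -> "a" (via VOICE_PREFIX_MAP).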

@app.function(
    image=image,
    volumes={CACHE_PATH: hf_cache},
    cpu=4,  # Kokoro is light enough to run acceptably on CPU.
    timeout=180,
    container_idle_timeout=300,  # keep a warm container for 5 minutes between calls
)
@modal.asgi_app()
def fastapi_app():
    """
    This function hosts our FastAPI application on Modal.
    """
    print("πŸš€ Kokoro TTS API container is starting up...")

    @web_app.post(
        "/",
        summary="Synthesize Speech",
        description="""
        Converts text to speech.
        - **text**: The string of text to synthesize.
        - **voice**: (Optional) The Kokoro voice ID to use (e.g., "af_heart", "am_adam", "bf_emma"). Defaults to "af_heart".
        """,
    )
    async def tts_endpoint(request: Request):
        try:
            body = await request.json()
            text_to_synthesize = body["text"]
            voice_id = body.get("voice", "af_heart")
        except Exception:
            return JSONResponse(
                status_code=status.HTTP_400_BAD_REQUEST,
                content={"error": "Invalid request. Body must be JSON with a 'text' key."},
            )

        print(f"Synthesizing text: '{text_to_synthesize[:50]}...' with voice: {voice_id}")

        from kokoro import KPipeline
        import soundfile as sf
        import torch
        import numpy as np

        # Keep torch.hub downloads on the persistent cache volume as well.
        torch.hub.set_dir(CACHE_PATH)
        lang = voice_to_lang(voice_id)
        # Model weights are cached under /cache, so building a pipeline per
        # request only pays the download cost on the very first call.
        pipe = KPipeline(lang_code=lang)

        
        # KPipeline yields (graphemes, phonemes, audio) tuples; collect the
        # audio chunks so they can be joined into one waveform.
        all_chunks = []
        for _, _, chunk in pipe(text_to_synthesize, voice=voice_id):
            all_chunks.append(chunk)

        if not all_chunks:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content={"error": "TTS generation failed to produce audio."},
            )

        full_audio = np.concatenate(all_chunks)

        # Encode as 16-bit PCM WAV at Kokoro's native 24 kHz sample rate.
        buffer = io.BytesIO()
        sf.write(buffer, full_audio, 24_000, format="WAV", subtype="PCM_16")
        buffer.seek(0)

        # Persist any newly downloaded model files to the shared volume.
        hf_cache.commit()
        print("Synthesis complete. Returning audio file.")

        return Response(content=buffer.getvalue(), media_type="audio/wav")

    return web_app
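
# Example client (a sketch: it assumes the `requests` package, and the URL is
# a placeholder; Modal prints the real one after `modal deploy`):
#
#   import requests
#
#   resp = requests.post(
#       "https://<workspace>--kokoro-tts-api-cpu-fastapi-app.modal.run/",
#       json={"text": "Hello from Kokoro!", "voice": "af_heart"},
#       timeout=120,
#   )
#   resp.raise_for_status()
#   with open("hello.wav", "wb") as f:
#       f.write(resp.content)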