Revanth-ml committed on
Commit
4be7077
·
verified ·
1 Parent(s): 7da4c38

Create deploy_kokora_app_cpu_modal_labs.py

Browse files
Files changed (1) hide show
  1. deploy_kokora_app_cpu_modal_labs.py +104 -0
deploy_kokora_app_cpu_modal_labs.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import modal
3
+ from fastapi import FastAPI, Request, status
4
+ from fastapi.responses import Response, JSONResponse
5
+
6
# Modal application under which the TTS web endpoint is registered.
app = modal.App("kokoro-tts-api-cpu")

# Path used both as the Hugging Face cache directory (HF_HOME) inside the
# image and as the mount point of the persistent volume declared below.
CACHE_PATH = "/cache"

# Named Modal volume; created on first use.
hf_cache = modal.Volume.from_name("kokoro-hf-cache", create_if_missing=True)

# Container image: Debian slim + system audio/phonemizer packages + Python deps.
_base_image = modal.Image.debian_slim(python_version="3.11").apt_install(
    "git",
    "libsndfile1",
    "espeak-ng",
)
image = (
    _base_image.pip_install(
        "torch==2.3.0",
        "soundfile",
        "kokoro>=0.9.4",
        "fastapi",
        "numpy",
    )
    # Reinstall soundfile without binary wheels.
    # NOTE(review): presumably so it binds to the apt-installed libsndfile1 -- confirm.
    .run_commands(
        "pip install --force-reinstall --no-binary soundfile soundfile",
    )
    .env({"HF_HOME": CACHE_PATH})
)

# FastAPI application object; the route is attached inside `fastapi_app`.
web_app = FastAPI(
    title="Kokoro TTS API",
    description="A serverless API for generating speech from text using the Kokoro model.",
    version="1.0.0",
)
31
+
32
# Human-friendly locale aliases mapped to Kokoro single-letter language codes.
VOICE_PREFIX_MAP = {"en": "a", "us": "a", "gb": "b", "uk": "b", "es": "e", "fr": "f"}

# Single-letter language codes accepted as-is. "f" is included because the
# alias map above already produces it for "fr".
_KOKORO_LANG_CODES = frozenset("abefhijpz")

# Second letter of a two-letter Kokoro voice prefix such as "af" or "bm".
_VOICE_GENDER_MARKERS = frozenset("fm")


def voice_to_lang(voice: str) -> str:
    """Derive the Kokoro language code from a voice identifier.

    Only the part before the first underscore (lower-cased) is considered.
    Resolution order:

    1. A locale alias listed in ``VOICE_PREFIX_MAP`` (``"en"``, ``"gb"``, ...).
    2. A bare language code (``"a"``, ``"b"``, ...).
    3. A two-letter voice prefix whose first letter is a language code and
       whose second letter is ``f`` or ``m`` (``"af_heart"`` -> ``"a"``,
       ``"bf_emma"`` -> ``"b"``).
    4. Otherwise the default ``"a"``.

    Args:
        voice: Voice identifier such as ``"af_heart"`` or ``"gb_voice"``.

    Returns:
        A single-letter language code.
    """
    prefix = voice.split("_", 1)[0].lower()

    alias = VOICE_PREFIX_MAP.get(prefix)
    if alias is not None:
        return alias

    # Set membership instead of a substring test: the empty string and
    # multi-character fragments like "ab" or "hi" must not be accepted.
    if prefix[:1] in _KOKORO_LANG_CODES:
        if len(prefix) == 1:
            return prefix
        if len(prefix) == 2 and prefix[1] in _VOICE_GENDER_MARKERS:
            return prefix[0]

    return "a"
36
+
37
@app.function(
    image=image,
    volumes={CACHE_PATH: hf_cache},
    cpu=4,
    timeout=180,
    # NOTE(review): newer Modal releases appear to rename this option to
    # `scaledown_window` -- confirm against the Modal version in use.
    container_idle_timeout=300,
)
@modal.asgi_app()
def fastapi_app():
    """Build and return the FastAPI app served by this Modal function.

    Registers a single ``POST /`` route on the module-level ``web_app`` that
    synthesizes speech with Kokoro and responds with a 16-bit PCM WAV file.

    Returns:
        The configured ``web_app`` instance.
    """
    print("🚀 Kokoro TTS API container is starting up...")

    # Imported here rather than at module top because these packages are
    # installed by `image`, not necessarily on the machine that deploys.
    import numpy as np
    import soundfile as sf
    import torch
    from kokoro import KPipeline

    torch.hub.set_dir(CACHE_PATH)

    # One pipeline per language code, reused across requests so the pipeline
    # is not rebuilt for every call.
    pipelines = {}
    # (lang, voice) pairs already synthesized by this container; the volume
    # is committed only the first time a pair succeeds.
    persisted_voices = set()

    def bad_request(message):
        """Return a 400 JSON response carrying *message*."""
        return JSONResponse(
            status_code=status.HTTP_400_BAD_REQUEST,
            content={"error": message},
        )

    @web_app.post(
        "/",
        summary="Synthesize Speech",
        description=(
            "Converts text to speech.\n\n"
            "- **text**: The string of text to synthesize.\n"
            '- **voice**: (Optional) The voice ID to use (e.g., "af_heart", '
            '"bf_emma", "ef_dora"). Defaults to "af_heart".\n'
        ),
    )
    async def tts_endpoint(request: Request):
        """Synthesize the posted text and return it as ``audio/wav``.

        Expects a JSON object with a string ``text`` and an optional string
        ``voice``. Responds 400 for malformed input and 500 when the pipeline
        yields no audio.
        """
        invalid_body = "Invalid request. Body must be JSON with a 'text' key."

        try:
            body = await request.json()
        except ValueError:
            # Covers JSON decode errors and undecodable request bytes.
            return bad_request(invalid_body)

        if not isinstance(body, dict):
            return bad_request(invalid_body)

        text_to_synthesize = body.get("text")
        if not isinstance(text_to_synthesize, str):
            return bad_request(invalid_body)

        voice_id = body.get("voice", "af_heart")
        if not isinstance(voice_id, str):
            return bad_request("Invalid request. 'voice' must be a string.")

        print(f"Synthesizing text: '{text_to_synthesize[:50]}...' with voice: {voice_id}")

        lang = voice_to_lang(voice_id)
        pipe = pipelines.get(lang)
        if pipe is None:
            pipe = KPipeline(lang_code=lang)
            pipelines[lang] = pipe

        # The pipeline yields 3-tuples; only the third element (audio) is kept.
        all_chunks = [chunk for _, _, chunk in pipe(text_to_synthesize, voice=voice_id)]

        if not all_chunks:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content={"error": "TTS generation failed to produce audio."},
            )

        full_audio = np.concatenate(all_chunks)

        buffer = io.BytesIO()
        sf.write(buffer, full_audio, 24_000, format="WAV", subtype="PCM_16")

        # Commit the cache volume only when this (lang, voice) pair is new to
        # the container; repeat requests add nothing to the cache.
        cache_key = (lang, voice_id)
        if cache_key not in persisted_voices:
            hf_cache.commit()
            persisted_voices.add(cache_key)

        print("Synthesis complete. Returning audio file.")

        return Response(content=buffer.getvalue(), media_type="audio/wav")

    return web_app