import io

import modal
from fastapi import FastAPI, Request, status
from fastapi.responses import Response, JSONResponse

app = modal.App("kokoro-tts-api-cpu")

image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git", "libsndfile1", "espeak-ng")
    .pip_install(
        "torch==2.3.0",
        "soundfile",
        "kokoro>=0.9.4",
        "fastapi",
        "numpy",
    )
    .run_commands(
        # Rebuild soundfile from source so it links against the apt-installed
        # libsndfile1 instead of a bundled wheel copy.
        "pip install --force-reinstall --no-binary soundfile soundfile",
    )
    .env({"HF_HOME": "/cache"})
)

CACHE_PATH = "/cache"
hf_cache = modal.Volume.from_name("kokoro-hf-cache", create_if_missing=True)

web_app = FastAPI(
    title="Kokoro TTS API",
    description="A serverless API for generating speech from text using the Kokoro model.",
    version="1.0.0",
)

# Maps common two-letter locale prefixes to Kokoro language codes.
VOICE_PREFIX_MAP = {"en": "a", "us": "a", "gb": "b", "uk": "b", "es": "e", "fr": "f"}


def voice_to_lang(voice: str) -> str:
    """Derive the Kokoro language code from a voice ID (e.g. "af_heart" -> "a")."""
    prefix = voice.split("_", 1)[0].lower()
    if prefix in VOICE_PREFIX_MAP:
        return VOICE_PREFIX_MAP[prefix]
    # Kokoro voice prefixes start with the language letter ("af" -> "a",
    # "bf" -> "b"); fall back to American English for anything unrecognized.
    return prefix[0] if prefix[:1] in "abefhijpz" else "a"


@app.function(
    image=image,
    volumes={CACHE_PATH: hf_cache},
    cpu=4,
    timeout=180,
    container_idle_timeout=300,
)
@modal.asgi_app()
def fastapi_app():
    """Hosts our FastAPI application on Modal."""
    print("🚀 Kokoro TTS API container is starting up...")

    @web_app.post(
        "/",
        summary="Synthesize Speech",
        description="""
Converts text to speech.

- **text**: The string of text to synthesize.
- **voice**: (Optional) The voice ID to use (e.g., "af_heart", "bf_emma", "ef_dora"). Defaults to "af_heart".
""",
    )
    async def tts_endpoint(request: Request):
        try:
            body = await request.json()
            text_to_synthesize = body["text"]
            voice_id = body.get("voice", "af_heart")
        except Exception:
            return JSONResponse(
                status_code=status.HTTP_400_BAD_REQUEST,
                content={"error": "Invalid request. Body must be JSON with a 'text' key."},
            )

        print(f"Synthesizing text: '{text_to_synthesize[:50]}...' with voice: {voice_id}")

        # Import the heavyweight dependencies lazily so the container boots fast.
        from kokoro import KPipeline
        import soundfile as sf
        import torch
        import numpy as np

        torch.hub.set_dir(CACHE_PATH)

        lang = voice_to_lang(voice_id)
        pipe = KPipeline(lang_code=lang)

        # KPipeline yields (graphemes, phonemes, audio) per chunk; collect the
        # audio, converting each chunk (a torch tensor) to a NumPy array.
        all_chunks = []
        for _, _, chunk in pipe(text_to_synthesize, voice=voice_id):
            all_chunks.append(np.asarray(chunk))

        if not all_chunks:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content={"error": "TTS generation failed to produce audio."},
            )

        full_audio = np.concatenate(all_chunks)

        # Kokoro outputs 24 kHz audio; encode it as 16-bit PCM WAV in memory.
        buffer = io.BytesIO()
        sf.write(buffer, full_audio, 24_000, format="WAV", subtype="PCM_16")
        buffer.seek(0)

        # Persist any newly downloaded model weights to the shared volume.
        hf_cache.commit()

        print("Synthesis complete. Returning audio file.")
        return Response(content=buffer.getvalue(), media_type="audio/wav")

    return web_app
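

# --- Usage sketch -------------------------------------------------------------
# A minimal client for the deployed endpoint, assuming the app was published
# with `modal deploy`. The URL below is a placeholder: Modal prints the real
# one at deploy time (it follows the pattern
# https://<workspace>--<app-name>-<function-name>.modal.run). Requires the
# `requests` package in the local environment; the `__main__` guard keeps this
# from running during `modal deploy`.
if __name__ == "__main__":
    import requests

    url = "https://YOUR-WORKSPACE--kokoro-tts-api-cpu-fastapi-app.modal.run"  # placeholder
    resp = requests.post(
        url,
        json={"text": "Hello from Kokoro on Modal!", "voice": "af_heart"},
        timeout=120,
    )
    resp.raise_for_status()
    with open("kokoro_output.wav", "wb") as f:
        f.write(resp.content)
    print(f"Wrote {len(resp.content)} bytes to kokoro_output.wav")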