Spaces:

Agents-MCP-Hackathon
/

SlideDeck-AI

Running

App Files Files Community

SlideDeck-AI / deploy_kokora_app_cpu_modal_labs.py

Revanth-ml

Create deploy_kokora_app_cpu_modal_labs.py

4be7077 verified 16 days ago

raw

history blame

3.13 kB

	import io
	import modal
	from fastapi import FastAPI, Request, status
	from fastapi.responses import Response, JSONResponse

	app = modal.App("kokoro-tts-api-cpu")


	image = (
	modal.Image.debian_slim(python_version="3.11")
	.apt_install("git", "libsndfile1", "espeak-ng")
	.pip_install(
	"torch==2.3.0",
	"soundfile",
	"kokoro>=0.9.4",
	"fastapi",
	"numpy"
	).run_commands(
	"pip install --force-reinstall --no-binary soundfile soundfile",)
	.env({"HF_HOME": "/cache"})
	)

	CACHE_PATH = "/cache"
	hf_cache = modal.Volume.from_name("kokoro-hf-cache", create_if_missing=True)

	web_app = FastAPI(
	title="Kokoro TTS API",
	description="A serverless API for generating speech from text using the Kokoro model.",
	version="1.0.0"
	)

	VOICE_PREFIX_MAP = {"en": "a", "us": "a", "gb": "b", "uk": "b", "es": "e", "fr": "f"}
	def voice_to_lang(voice: str) -> str:
	prefix = voice.split("_", 1)[0].lower()
	return prefix if prefix in "abehijpz" else VOICE_PREFIX_MAP.get(prefix, "a")

	@app.function(
	image=image,
	volumes={CACHE_PATH: hf_cache},
	cpu=4,
	timeout=180,
	container_idle_timeout=300,
	)
	@modal.asgi_app()
	def fastapi_app():
	"""
	This function hosts our FastAPI application on Modal.
	"""
	print("🚀 Kokoro TTS API container is starting up...")

	@web_app.post("/",
	summary="Synthesize Speech",
	description="""
	Converts text to speech.
	- text: The string of text to synthesize.
	- voice: (Optional) The voice ID to use (e.g., "a_heart", "b_female", "e_male"). Defaults to "a_heart".
	"""
	)
	async def tts_endpoint(request: Request):
	try:
	body = await request.json()
	text_to_synthesize = body["text"]
	voice_id = body.get("voice", "af_heart")
	except Exception:
	return JSONResponse(
	status_code=status.HTTP_400_BAD_REQUEST,
	content={"error": "Invalid request. Body must be JSON with a 'text' key."},
	)

	print(f"Synthesizing text: '{text_to_synthesize[:50]}...' with voice: {voice_id}")

	from kokoro import KPipeline
	import soundfile as sf
	import torch
	import numpy as np

	torch.hub.set_dir(CACHE_PATH)
	lang = voice_to_lang(voice_id)
	pipe = KPipeline(lang_code=lang)


	all_chunks = []
	for _, _, chunk in pipe(text_to_synthesize, voice=voice_id):
	all_chunks.append(chunk)

	if not all_chunks:
	return JSONResponse(
	status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
	content={"error": "TTS generation failed to produce audio."},
	)

	full_audio = np.concatenate(all_chunks)

	buffer = io.BytesIO()
	sf.write(buffer, full_audio, 24_000, format="WAV", subtype="PCM_16")

	buffer.seek(0)

	hf_cache.commit()
	print("Synthesis complete. Returning audio file.")

	return Response(content=buffer.getvalue(), media_type="audio/wav")

	return web_app