Update handler.py
handler.py CHANGED  (+125 -36)
@@ -1,40 +1,129 @@
-# handler.py
 import os
-import
+import re
+import time
+import asyncio
+import numpy as np
 import torch
-from
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from snac import SNAC
+from livekit import rtc, api
 
-
-def
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-    # generate raw PCM chunks
-    pcm_chunks = model.generate_speech(prompt=text)
-    pcm_bytes = b"".join(pcm_chunks)
-
-    # wrap in a 24 kHz 16-bit WAV header
-    import io, wave
-    buf = io.BytesIO()
-    with wave.open(buf, "wb") as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(24000)
-        wf.writeframes(pcm_bytes)
-    wav = buf.getvalue()
-    b64 = base64.b64encode(wav).decode("utf-8")
-    return { "audio_base64": b64 }
+class EndpointHandler:
+    def __init__(self, path: str = ""):
+        # Load the Orpheus TTS model and tokenizer from the given path (Hub repository).
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            path, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        )
+        self.model.to(self.device)
+        self.model.eval()
+        # Load the SNAC audio codec model for decoding audio tokens (24 kHz speech model).
+        self.audio_codec = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(self.device)
+        self.audio_codec.eval()
+        # Store LiveKit credentials from environment (if provided).
+        self.livekit_url = os.getenv("LIVEKIT_URL")
+        self.livekit_api_key = os.getenv("LIVEKIT_API_KEY")
+        self.livekit_api_secret = os.getenv("LIVEKIT_API_SECRET")
+        self.livekit_room = os.getenv("LIVEKIT_ROOM")  # default room name (optional)
 
+    def __call__(self, data: dict) -> list:
+        # Extract input text and optional voice and LiveKit parameters.
+        text_input = data.get("inputs") or data.get("text") or ""
+        if not isinstance(text_input, str) or text_input.strip() == "":
+            raise ValueError("No text input provided for TTS")
+        voice = data.get("voice", "tara")  # default voice (e.g., "tara")
+        # Format prompt with voice name (Orpheus expects prompts like "voice: text").
+        prompt = f"{voice}: {text_input}"
+
+        # Encode prompt and generate output tokens with the TTS model.
+        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
+        generate_kwargs = {
+            "max_new_tokens": 1024,  # allow sufficient tokens for audio output
+            "do_sample": True,
+            "temperature": 0.8,
+            "top_p": 0.95,
+            "repetition_penalty": 1.1,  # >= 1.1 for stable speech generation
+            "pad_token_id": self.tokenizer.eos_token_id,
+        }
+        output_ids = self.model.generate(input_ids, **generate_kwargs)
+        # The generated sequence includes the prompt; isolate newly generated tokens.
+        generated_tokens = output_ids[0, input_ids.size(1):]
+        output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=False)
+
+        # Extract audio token IDs (SNAC codec tokens) from the model output.
+        audio_token_ids = [int(m) for m in re.findall(r"<custom_token_(\d+)>", output_text)]
+        if not audio_token_ids:
+            return [{"error": "TTS model produced no audio tokens"}]
+
+        # Convert list of token IDs into SNAC codec input tensors (7 tokens per audio frame).
+        # If the number of tokens is not a multiple of 7, pad with zeros (silence) to complete the last frame.
+        if len(audio_token_ids) % 7 != 0:
+            pad_len = 7 - (len(audio_token_ids) % 7)
+            audio_token_ids.extend([0] * pad_len)
+        audio_ids = torch.tensor(audio_token_ids, dtype=torch.int32, device=self.device).reshape(-1, 7)
+        # Separate hierarchical codec codebooks: coarse (level 0), mid (level 1), fine (level 2).
+        codes_0 = audio_ids[:, 0].unsqueeze(0)  # shape (1, N_frames)
+        codes_1 = torch.stack((audio_ids[:, 1], audio_ids[:, 4]), dim=1).flatten().unsqueeze(0)
+        codes_2 = torch.stack((audio_ids[:, 2], audio_ids[:, 3], audio_ids[:, 5], audio_ids[:, 6]), dim=1).flatten().unsqueeze(0)
+
+        # Decode audio tokens to waveform audio using the SNAC codec model.
+        with torch.inference_mode():
+            audio_wave = self.audio_codec.decode([codes_0, codes_1, codes_2])
+        audio_wave = audio_wave.squeeze().cpu().numpy()  # shape: (num_samples,)
+        # Convert waveform from float (-1.0 to 1.0) to 16-bit PCM samples.
+        audio_pcm = (audio_wave * 32767.0).astype(np.int16)
+        sample_rate = 24000  # Hz (SNAC 24 kHz model output)
+        num_channels = 1
+
+        # Determine LiveKit connection info (from request or env).
+        lk_url = data.get("livekit_url", self.livekit_url)
+        lk_token = data.get("livekit_token", None)
+        room_name = data.get("livekit_room", self.livekit_room)
+        identity = data.get("livekit_identity", f"tts-agent-{int(time.time())}")
+        participant_name = data.get("livekit_name", "TTS Agent")
+        if not lk_token:
+            # If no direct token is provided, generate one using the API key/secret.
+            if not (lk_url and self.livekit_api_key and self.livekit_api_secret and room_name):
+                return [{"error": "LiveKit connection information missing"}]
+            token_builder = api.AccessToken(self.livekit_api_key, self.livekit_api_secret)
+            token_builder.with_identity(identity).with_name(participant_name)
+            token_builder.with_grants(api.VideoGrants(room_join=True, room=room_name))
+            lk_token = token_builder.to_jwt()
+
+        # Asynchronous function to connect to LiveKit and stream audio frames.
+        async def stream_audio():
+            room = rtc.Room()
+            try:
+                await room.connect(lk_url, lk_token, options=rtc.RoomOptions(auto_subscribe=True))
+            except Exception as e:
+                return f"Failed to connect to LiveKit: {e}"
+            # Create an audio track for streaming the TTS output.
+            source = rtc.AudioSource(sample_rate, num_channels)
+            track = rtc.LocalAudioTrack.create_audio_track("tts-audio", source)
+            await room.local_participant.publish_track(track, rtc.TrackPublishOptions(name="TTS Audio"))
+            # Stream the audio in chunks for real-time playback.
+            frame_duration = 0.05  # 50 ms per frame
+            frame_samples = int(sample_rate * frame_duration)
+            total_samples = len(audio_pcm)
+            for start in range(0, total_samples, frame_samples):
+                end = min(start + frame_samples, total_samples)
+                chunk = audio_pcm[start:end]
+                # Create an AudioFrame and copy the PCM chunk into it.
+                frame = rtc.AudioFrame.create(sample_rate=sample_rate, num_channels=num_channels,
+                                              samples_per_channel=len(chunk) // num_channels)
+                frame_buffer = np.frombuffer(frame.data, dtype=np.int16)
+                np.copyto(frame_buffer[:len(chunk)], chunk)
+                await source.capture_frame(frame)
+                # Sleep to maintain real-time pace (synchronized with the frame duration).
+                await asyncio.sleep(frame_duration)
+            # Disconnect from the room after streaming is finished.
+            await room.disconnect()
+            return None
+
+        # Run the streaming coroutine and wait for completion.
+        error = asyncio.run(stream_audio())
+        if error:
+            return [{"error": error}]
+        # Return a success status (audio is delivered via LiveKit, not in the HTTP response).
+        return [{"status": "success"}]
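A minimal local smoke test of the new handler could look like the sketch below. It only uses the interface defined in this commit; the `path="."` argument, the room name, and the identity are illustrative assumptions, and the LIVEKIT_URL / LIVEKIT_API_KEY / LIVEKIT_API_SECRET (or a pre-made token) must already point at a reachable LiveKit deployment for the streaming step to succeed.

# sketch only: assumes handler.py and the Orpheus weights are in the current directory
# and that the LIVEKIT_* environment variables are set.
from handler import EndpointHandler

handler = EndpointHandler(path=".")          # path would normally be the Hub repo checkout
result = handler({
    "inputs": "Hello from the Orpheus endpoint.",
    "voice": "tara",                         # optional; defaults to "tara"
    "livekit_room": "demo-room",             # optional override of LIVEKIT_ROOM (hypothetical name)
    "livekit_identity": "tts-agent-demo",    # optional participant identity (hypothetical name)
})
print(result)  # e.g. [{"status": "success"}] or [{"error": "..."}]

Note that the synthesized audio is published to the LiveKit room rather than returned in the response, so verifying the output requires a participant subscribed to that room.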