"""FastAPI service exposing Coqui TTS (XTTS v2) with a C-3PO fine-tuned voice.

On import the module tries to download the ``Borcherding/XTTS-v2_C3PO``
checkpoint from Hugging Face and load it; if that fails it falls back to the
stock ``xtts_v2`` model. The HTTP endpoints return generated speech as WAV
file downloads.
"""
import logging
import os
import tempfile
import uuid
from typing import Optional

import torch
from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import FileResponse
from huggingface_hub import snapshot_download
from pydantic import BaseModel
from TTS.api import TTS

# Set environment variables for Coqui TTS
os.environ["COQUI_TOS_AGREED"] = "1"

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

C3PO_REPO_ID = "Borcherding/XTTS-v2_C3PO"
C3PO_LOCAL_DIR = "./models/XTTS-v2_C3PO"
FALLBACK_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
MIN_TEXT_LENGTH = 2
MAX_TEXT_LENGTH = 500

# File names probed (in order) inside the model directory for a reference clip.
REFERENCE_AUDIO_CANDIDATES = (
    "reference.wav",
    "speaker.wav",
    "c3po.wav",
    "sample.wav",
    "reference_audio.wav",
)

app = FastAPI(
    title="Coqui TTS C-3PO API",
    description="Text-to-Speech API using Coqui TTS with C-3PO fine-tuned voice model",
    version="1.0.0",
)


class TTSRequest(BaseModel):
    """JSON body accepted by the ``/tts-json`` endpoint."""

    text: str
    language: str = "en"


def _remove_quietly(path: Optional[str]) -> None:
    """Delete *path* if it exists; log (never raise) on any other OS error."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        logger.warning(f"Could not remove temporary file {path}: {exc}")


class CoquiTTSService:
    """Loads a Coqui TTS model once and synthesizes speech to temporary files.

    Attributes:
        device: ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
        tts: The loaded ``TTS`` instance.
        model_path: Directory of the downloaded C-3PO model, or ``None`` when
            the fallback model was loaded after a failure.
        default_speaker: First preset speaker reported by the model, or
            ``None`` when the model exposes none.
    """

    def __init__(self):
        """Load the C-3PO model, falling back to stock XTTS v2 on any error.

        Raises:
            Exception: Whatever the fallback model load raised, when both the
                primary and the fallback load fail.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        try:
            self._load_c3po_model()
            self.default_speaker = self._detect_default_speaker()
        except Exception as e:
            logger.error(f"Failed to load C-3PO model: {e}")
            logger.info("Falling back to standard XTTS v2 model...")
            try:
                self.tts = TTS(FALLBACK_MODEL_NAME).to(self.device)
            except Exception as fallback_error:
                logger.error(f"Fallback model also failed: {fallback_error}")
                raise
            self.model_path = None
            logger.info("Fallback XTTS v2 model loaded!")
            # Without a preset speaker the fallback cannot synthesize unless
            # the caller uploads a reference clip, so probe for one here too.
            try:
                self.default_speaker = self._detect_default_speaker()
            except Exception as speaker_error:
                logger.warning(f"Could not inspect fallback speakers: {speaker_error}")
                self.default_speaker = None

    def _load_c3po_model(self) -> None:
        """Download the C-3PO checkpoint and set ``self.tts``/``self.model_path``."""
        logger.info("Downloading C-3PO fine-tuned XTTS model from Hugging Face...")
        model_path = snapshot_download(
            repo_id=C3PO_REPO_ID,
            local_dir=C3PO_LOCAL_DIR,
            local_dir_use_symlinks=False,
        )
        logger.info(f"Model downloaded to: {model_path}")

        config_path = os.path.join(model_path, "config.json")
        if os.path.exists(config_path):
            logger.info("Loading C-3PO fine-tuned model...")
            self.tts = TTS(
                model_path=model_path,
                config_path=config_path,
                progress_bar=False,
                gpu=torch.cuda.is_available(),
            ).to(self.device)
            logger.info("C-3PO fine-tuned model loaded successfully!")
        else:
            # Fallback to using the model by name if config not found
            logger.info("Config not found, trying to load by repo ID...")
            self.tts = TTS(FALLBACK_MODEL_NAME).to(self.device)
            logger.info("Fallback XTTS v2 model loaded!")

        # Store model path for reference audio
        self.model_path = model_path

    def _detect_default_speaker(self) -> Optional[str]:
        """Return the first preset speaker of the loaded model, if it has any."""
        speakers = getattr(self.tts, "speakers", None)
        if speakers:
            logger.info(f"Available speakers: {len(speakers)}")
            return speakers[0]
        logger.info("No preset speakers available - voice cloning mode")
        return None

    def get_c3po_reference_audio(self):
        """Return the path of a C-3PO reference clip in the model dir, or None."""
        if not self.model_path:
            return None
        for ref_file in REFERENCE_AUDIO_CANDIDATES:
            ref_path = os.path.join(self.model_path, ref_file)
            if os.path.exists(ref_path):
                logger.info(f"Found C-3PO reference audio: {ref_path}")
                return ref_path
        return None

    def generate_speech(
        self,
        text: str,
        speaker_wav_path: Optional[str] = None,
        language: str = "en",
        use_c3po_voice: bool = True,
    ) -> str:
        """Synthesize *text* to a WAV file in the temp directory.

        Speaker selection, in priority order: ``speaker_wav_path`` (voice
        cloning), the C-3PO reference clip when ``use_c3po_voice`` is true and
        one exists, the model's default preset speaker, and finally no speaker.

        Args:
            text: Text to speak; must be 2-500 characters long.
            speaker_wav_path: Optional reference audio file to clone.
            language: Language code passed to the model.
            use_c3po_voice: Use the C-3PO reference clip when no
                ``speaker_wav_path`` is given.

        Returns:
            Path of the generated WAV file. The caller owns the file and is
            responsible for deleting it.

        Raises:
            HTTPException: 400 for invalid text length, 500 when synthesis
                fails or produces no file.
        """
        if len(text) < MIN_TEXT_LENGTH:
            raise HTTPException(status_code=400, detail="Text too short")
        if len(text) > MAX_TEXT_LENGTH:
            raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")

        # Generate unique output filename
        output_path = os.path.join(
            tempfile.gettempdir(), f"c3po_tts_output_{uuid.uuid4().hex}.wav"
        )

        # If no speaker provided and C-3PO voice requested, try the reference audio
        final_speaker_wav = speaker_wav_path
        if not final_speaker_wav and use_c3po_voice:
            c3po_ref = self.get_c3po_reference_audio()
            if c3po_ref:
                final_speaker_wav = c3po_ref
                logger.info("Using C-3PO reference audio for voice synthesis")

        try:
            if final_speaker_wav:
                # Let the library write the file so the WAV header carries the
                # synthesizer's own output sample rate instead of a guessed one.
                logger.info("Generating speech with voice cloning...")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=final_speaker_wav,
                    language=language,
                    file_path=output_path,
                )
            elif self.default_speaker:
                logger.info(f"Generating speech with preset speaker: {self.default_speaker}")
                self.tts.tts_to_file(
                    text=text,
                    speaker=self.default_speaker,
                    language=language,
                    file_path=output_path,
                )
            else:
                # Try without speaker (some models support this)
                logger.info("Generating speech without specific speaker...")
                self.tts.tts_to_file(
                    text=text,
                    language=language,
                    file_path=output_path,
                )
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(f"Error generating speech: {e}")
            # Do not leave a partially written file behind.
            _remove_quietly(output_path)
            raise HTTPException(
                status_code=500, detail=f"Speech generation failed: {str(e)}"
            ) from e

        if not os.path.exists(output_path):
            logger.error("Error generating speech: no output file was produced")
            raise HTTPException(status_code=500, detail="Failed to generate audio file")

        logger.info(f"Speech generated successfully: {output_path}")
        return output_path


# Initialize TTS service
logger.info("Initializing Coqui TTS service...")
try:
    tts_service = CoquiTTSService()
    logger.info("TTS service initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize TTS service: {e}")
    tts_service = None


def _require_service() -> CoquiTTSService:
    """Return the global service or raise 503 when it failed to initialize."""
    if not tts_service:
        raise HTTPException(status_code=503, detail="TTS service not available")
    return tts_service


def _require_text(text: str) -> None:
    """Raise 400 when *text* is empty or whitespace only."""
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")


def _audio_response(output_path: str, download_name: str) -> FileResponse:
    """Build a WAV download response that deletes *output_path* once sent.

    No explicit ``Content-Disposition`` header is passed: ``FileResponse``
    derives ``attachment; filename="..."`` from ``filename``, and an explicit
    header would take precedence and drop the file name.
    """
    cleanup = BackgroundTasks()
    cleanup.add_task(_remove_quietly, output_path)
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=download_name,
        background=cleanup,
    )


@app.get("/")
async def root():
    """Root endpoint with API information"""
    return {
        "message": "Coqui TTS C-3PO API",
        "status": "healthy" if tts_service else "error",
        "model": "XTTS v2",
        "voice_cloning": True,
    }


@app.get("/health")
async def health_check():
    """Health check endpoint; responds 503 when the service failed to load."""
    service = _require_service()
    c3po_ref_available = service.get_c3po_reference_audio() is not None
    return {
        "status": "healthy",
        "device": service.device,
        "model": "C-3PO Fine-tuned XTTS v2 (Coqui TTS)",
        "default_speaker": service.default_speaker,
        "voice_cloning_available": True,
        "c3po_voice_available": c3po_ref_available,
        "model_path": getattr(service, "model_path", None),
    }


@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("en"),
    speaker_file: UploadFile = File(None),
    use_c3po_voice: bool = Form(True),
):
    """
    Convert text to speech using Coqui TTS

    - **text**: Text to convert to speech (2-500 characters)
    - **language**: Language code (default: "en")
    - **speaker_file**: Reference audio file for voice cloning (optional)
    - **use_c3po_voice**: Use C-3PO voice if no speaker file provided (default: True)
    """
    service = _require_service()
    _require_text(text)

    speaker_temp_path = None
    try:
        # Handle speaker file if provided
        if speaker_file is not None:
            content_type = speaker_file.content_type
            if not content_type or not content_type.startswith("audio/"):
                raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

            # Save uploaded file temporarily
            speaker_temp_path = os.path.join(
                tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav"
            )
            content = await speaker_file.read()
            with open(speaker_temp_path, "wb") as buffer:
                buffer.write(content)
            logger.info(f"Speaker file saved: {speaker_temp_path}")

        output_path = service.generate_speech(text, speaker_temp_path, language, use_c3po_voice)
    except HTTPException as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise
    except Exception as e:
        logger.exception(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The uploaded clip is only needed during synthesis, success or not.
        _remove_quietly(speaker_temp_path)

    if speaker_file is not None:
        voice_type = "custom"
    else:
        voice_type = "c3po" if use_c3po_voice else "default"
    return _audio_response(output_path, f"c3po_tts_{voice_type}_{uuid.uuid4().hex}.wav")


@app.post("/tts-c3po")
async def text_to_speech_c3po(
    text: str = Form(...),
    language: str = Form("en"),
):
    """
    Convert text to speech using C-3PO voice specifically

    - **text**: Text to convert to speech (2-500 characters)
    - **language**: Language code (default: "en")
    """
    service = _require_service()
    _require_text(text)

    # Check if C-3PO voice is available
    if not service.get_c3po_reference_audio():
        raise HTTPException(status_code=503, detail="C-3PO reference audio not available")

    try:
        output_path = service.generate_speech(text, None, language, use_c3po_voice=True)
    except HTTPException as e:
        logger.error(f"Error in C-3PO TTS endpoint: {e}")
        raise
    except Exception as e:
        logger.exception(f"Error in C-3PO TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    return _audio_response(output_path, f"c3po_voice_{uuid.uuid4().hex}.wav")


@app.post("/tts-json")
async def text_to_speech_json(request: TTSRequest):
    """
    Convert text to speech using JSON request with C-3PO voice

    - **request**: TTSRequest containing text and language
    """
    service = _require_service()
    _require_text(request.text)

    try:
        # Generate speech with C-3PO voice by default
        output_path = service.generate_speech(
            request.text, None, request.language, use_c3po_voice=True
        )
    except HTTPException as e:
        logger.error(f"Error in TTS JSON endpoint: {e}")
        raise
    except Exception as e:
        logger.exception(f"Error in TTS JSON endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    return _audio_response(
        output_path, f"c3po_tts_{request.language}_{uuid.uuid4().hex}.wav"
    )


@app.get("/models")
async def list_models():
    """List available TTS models (first 20)."""
    try:
        # Create a temporary TTS instance to list models
        models = TTS().list_models()
        # Some TTS releases return a manager object rather than a list of
        # names; ask that object for the actual list.
        if not isinstance(models, (list, tuple)) and hasattr(models, "list_models"):
            models = models.list_models()
        return {"models": list(models)[:20]}  # Return first 20 models
    except Exception as e:
        logger.error(f"Error listing models: {e}")
        raise HTTPException(status_code=500, detail="Failed to list models") from e


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)