Spaces:
Running
Running
Avinyaa
commited on
Commit
·
9a88d9c
1
Parent(s):
c53fcc3
new
Browse files- README.md +109 -1
- app.py +169 -0
- client_example.py +73 -0
- dockerfile +28 -0
- requirements.txt +9 -0
- test.py +10 -0
README.md
CHANGED
@@ -7,4 +7,112 @@ sdk: docker
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
+
# TTS API
|
11 |
+
|
12 |
+
A FastAPI-based Text-to-Speech API using XTTS-v2 for voice cloning.
|
13 |
+
|
14 |
+
## Features
|
15 |
+
|
16 |
+
- Convert text to speech using voice cloning
|
17 |
+
- Upload reference speaker audio files
|
18 |
+
- Support for multiple languages
|
19 |
+
- RESTful API with automatic documentation
|
20 |
+
- Docker support
|
21 |
+
|
22 |
+
## Setup
|
23 |
+
|
24 |
+
### Local Development
|
25 |
+
|
26 |
+
1. Install dependencies:
|
27 |
+
```bash
|
28 |
+
pip install -r requirements.txt
|
29 |
+
```
|
30 |
+
|
31 |
+
2. Run the API:
|
32 |
+
```bash
|
33 |
+
uvicorn app:app --host 0.0.0.0 --port 8000
|
34 |
+
```
|
35 |
+
|
36 |
+
The API will be available at `http://localhost:8000`
|
37 |
+
|
38 |
+
### Using Docker
|
39 |
+
|
40 |
+
1. Build the Docker image:
|
41 |
+
```bash
|
42 |
+
docker build -t tts-api .
|
43 |
+
```
|
44 |
+
|
45 |
+
2. Run the container:
|
46 |
+
```bash
|
47 |
+
docker run -p 8000:7860 tts-api
|
48 |
+
```
|
49 |
+
|
50 |
+
## API Endpoints
|
51 |
+
|
52 |
+
### Health Check
|
53 |
+
- **GET** `/health` - Check API status
|
54 |
+
|
55 |
+
### Text-to-Speech
|
56 |
+
- **POST** `/tts` - Convert text to speech with uploaded speaker file
|
57 |
+
- **Parameters:**
|
58 |
+
- `text` (form): Text to convert to speech
|
59 |
+
- `language` (form): Language code (default: "en")
|
60 |
+
- `speaker_file` (file): Reference speaker audio file
|
61 |
+
|
62 |
+
### API Documentation
|
63 |
+
- **GET** `/docs` - Interactive API documentation (Swagger UI)
|
64 |
+
- **GET** `/redoc` - Alternative API documentation
|
65 |
+
|
66 |
+
## Usage Examples
|
67 |
+
|
68 |
+
### Using Python requests
|
69 |
+
|
70 |
+
```python
|
71 |
+
import requests
|
72 |
+
|
73 |
+
# Prepare the request
|
74 |
+
url = "http://localhost:8000/tts"
|
75 |
+
data = {
|
76 |
+
"text": "Hello, this is a test of voice cloning!",
|
77 |
+
"language": "en"
|
78 |
+
}
|
79 |
+
files = {
|
80 |
+
"speaker_file": open("path/to/speaker.wav", "rb")
|
81 |
+
}
|
82 |
+
|
83 |
+
# Make the request
|
84 |
+
response = requests.post(url, data=data, files=files)
|
85 |
+
|
86 |
+
# Save the generated audio
|
87 |
+
if response.status_code == 200:
|
88 |
+
with open("output.wav", "wb") as f:
|
89 |
+
f.write(response.content)
|
90 |
+
print("Speech generated successfully!")
|
91 |
+
```
|
92 |
+
|
93 |
+
### Using curl
|
94 |
+
|
95 |
+
```bash
|
96 |
+
curl -X POST "http://localhost:8000/tts" \
|
97 |
+
-F "text=Hello, this is a test!" \
|
98 |
+
-F "language=en" \
|
99 |
+
-F "speaker_file=@path/to/speaker.wav" \
|
100 |
+
--output generated_speech.wav
|
101 |
+
```
|
102 |
+
|
103 |
+
### Using the provided client example
|
104 |
+
|
105 |
+
```bash
|
106 |
+
python client_example.py
|
107 |
+
```
|
108 |
+
|
109 |
+
## Requirements
|
110 |
+
|
111 |
+
- Python 3.8+
|
112 |
+
- CUDA-compatible GPU (recommended for faster processing)
|
113 |
+
- Audio file in supported format (WAV, MP3, etc.) for speaker reference
|
114 |
+
|
115 |
+
## Model
|
116 |
+
|
117 |
+
This API uses the XTTS-v2_C3PO model for voice cloning, which is automatically downloaded when building the Docker image.
|
118 |
+
|
app.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
|
2 |
+
from fastapi.responses import FileResponse
|
3 |
+
from pydantic import BaseModel
|
4 |
+
from TTS.api import TTS
|
5 |
+
import os
|
6 |
+
import tempfile
|
7 |
+
import uuid
|
8 |
+
import torch
|
9 |
+
from typing import Optional
|
10 |
+
import logging
|
11 |
+
|
12 |
+
# Send log records at INFO level and above to the root handler; this module
# logs through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The ASGI application object that uvicorn serves ("app:app").
app = FastAPI(
    title="TTS API",
    description="Text-to-Speech API using XTTS-v2",
    version="1.0.0",
)
|
17 |
+
|
18 |
+
class TTSRequest(BaseModel):
    """Request body for the URL-based TTS endpoint."""

    # Text to convert to speech.
    text: str
    # Language code for synthesis; defaults to English.
    language: str = "en"
|
21 |
+
|
22 |
+
class TTSService:
    """Load an XTTS model once and synthesize speech in a reference voice.

    The custom XTTS-v2_C3PO checkpoint is preferred. When its config file is
    missing, or loading it fails, the stock XTTS-v2 model is used instead.
    """

    # Location of the custom checkpoint.
    MODEL_DIR = "/app/XTTS-v2_C3PO"
    CONFIG_PATH = "/app/XTTS-v2_C3PO/config.json"
    # Model name used whenever the custom checkpoint is unavailable.
    DEFAULT_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

    def __init__(self):
        """Pick the device and load the custom model, falling back to the default.

        Raises:
            Exception: Whatever the TTS library raises when the default model
                cannot be loaded either.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        if not os.path.exists(self.CONFIG_PATH):
            logger.warning(f"Custom model config not found at {self.CONFIG_PATH}")
            # List the model directory to make a broken image build easy to spot.
            if os.path.exists(self.MODEL_DIR):
                logger.info(f"Contents of {self.MODEL_DIR}: {os.listdir(self.MODEL_DIR)}")
            else:
                logger.warning(f"Model directory {self.MODEL_DIR} does not exist")
            self.tts = self._load_default_model()
            return

        try:
            self.tts = TTS(
                model_path=self.MODEL_DIR + "/",
                config_path=self.CONFIG_PATH,
                progress_bar=False,
                gpu=torch.cuda.is_available(),
            ).to(self.device)
            logger.info("Custom TTS model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load custom TTS model: {e}")
            self.tts = self._load_default_model()

    def _load_default_model(self):
        """Load the stock XTTS-v2 model onto ``self.device`` and return it."""
        logger.info("Falling back to default XTTS model")
        try:
            model = TTS(self.DEFAULT_MODEL_NAME).to(self.device)
        except Exception as e:
            logger.error(f"Failed to load default TTS model: {e}")
            raise
        logger.info("Default TTS model loaded successfully")
        return model

    def generate_speech(self, text: str, speaker_wav_path: str, language: str = "en") -> str:
        """Synthesize ``text`` in the voice of the reference recording.

        Args:
            text: Text to convert to speech.
            speaker_wav_path: Path to the reference speaker audio file.
            language: Language code handed to the model (default "en").

        Returns:
            Path of the generated WAV file in the system temp directory. The
            file is not deleted here; the caller owns it.

        Raises:
            HTTPException: Status 500 when synthesis fails.
        """
        # Unique name so parallel requests never write to the same file.
        output_filename = f"output_{uuid.uuid4().hex}.wav"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        try:
            self.tts.tts_to_file(
                text=text,
                file_path=output_path,
                speaker_wav=speaker_wav_path,
                language=language,
            )
        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            # Do not leave a partially written file behind.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}") from e

        return output_path
|
89 |
+
|
90 |
+
# Module-level service instance shared by the endpoints; constructing it
# loads the model when this module is imported.
tts_service = TTSService()
|
92 |
+
|
93 |
+
@app.get("/")
async def root():
    """Landing endpoint: report that the service is up."""
    payload = {"message": "TTS API is running", "status": "healthy"}
    return payload
|
96 |
+
|
97 |
+
@app.get("/health")
async def health_check():
    """Report service health together with the device selected for the model."""
    return dict(status="healthy", device=tts_service.device)
|
100 |
+
|
101 |
+
@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("en"),
    speaker_file: UploadFile = File(...)
):
    """
    Convert text to speech using a reference speaker voice

    - **text**: The text to convert to speech
    - **language**: Language code (default: "en")
    - **speaker_file**: Audio file containing the reference speaker voice
    """
    # Local import: only this endpoint needs it, for post-response cleanup.
    from fastapi import BackgroundTasks

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # content_type is None when the client sends no Content-Type for the part;
    # treat that as "not audio" instead of crashing with AttributeError.
    if not (speaker_file.content_type or "").startswith("audio/"):
        raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

    # Unique per-request path in the temp directory. A fixed filename would let
    # concurrent requests overwrite each other's reference audio.
    speaker_temp_path = os.path.join(
        tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav"
    )

    try:
        # Save uploaded speaker file temporarily
        with open(speaker_temp_path, "wb") as buffer:
            buffer.write(await speaker_file.read())

        # NOTE: synthesis is synchronous and blocks the event loop while it runs.
        output_path = tts_service.generate_speech(text, speaker_temp_path, language)
    except HTTPException:
        # Already carries the right status code and detail; pass it through.
        raise
    except Exception as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The reference audio is only needed during synthesis.
        if os.path.exists(speaker_temp_path):
            os.remove(speaker_temp_path)

    # Delete the generated file once the response has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, output_path)

    # No explicit Content-Disposition header: FileResponse derives
    # 'attachment; filename="..."' from `filename`, and a hand-written header
    # would take precedence and drop the filename.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=f"tts_output_{uuid.uuid4().hex}.wav",
        background=cleanup,
    )
|
149 |
+
|
150 |
+
@app.post("/tts-with-url")
async def text_to_speech_with_url(request: TTSRequest, speaker_wav_url: str):
    """
    Convert text to speech using a reference speaker voice from URL

    Not implemented yet: every request with non-empty text gets a 501.

    - **request**: TTSRequest containing text and language
    - **speaker_wav_url**: URL to the reference speaker audio file
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # TODO: validate the URL, download the file, then reuse the /tts flow.
    # Raised outside any try/except so the client really receives the 501;
    # a surrounding `except Exception` would convert it into a generic 500.
    raise HTTPException(status_code=501, detail="URL-based speaker input not implemented yet")
|
client_example.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import os
|
3 |
+
|
4 |
+
def test_tts_api():
    """Send one sample request to the /tts endpoint and save the returned audio."""
    endpoint = "http://localhost:8000/tts"
    sample_text = "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."

    # Path to your speaker reference audio file
    speaker_file_path = "/path/to/target/speaker.wav"  # Update this path

    # Bail out early when the reference recording is missing.
    if not os.path.exists(speaker_file_path):
        print(f"Error: Speaker file not found at {speaker_file_path}")
        print("Please update the speaker_file_path variable with a valid audio file path")
        return

    form_fields = {"text": sample_text, "language": "en"}

    # The context manager closes the upload handle whatever happens below.
    with open(speaker_file_path, "rb") as speaker_audio:
        try:
            print("Sending request to TTS API...")
            response = requests.post(
                endpoint,
                data=form_fields,
                files={"speaker_file": speaker_audio},
            )

            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                print(response.text)
            else:
                # Save the generated audio
                output_filename = "generated_speech.wav"
                with open(output_filename, "wb") as f:
                    f.write(response.content)
                print(f"Success! Generated speech saved as {output_filename}")

        except requests.exceptions.ConnectionError:
            print("Error: Could not connect to the API. Make sure the server is running on http://localhost:8000")
        except Exception as e:
            print(f"Error: {e}")
|
52 |
+
|
53 |
+
def check_api_health():
    """Query the /health endpoint and print whether the API is reachable."""
    try:
        # Bounded wait: without a timeout the call can block indefinitely when
        # the server accepts the connection but never answers.
        response = requests.get("http://localhost:8000/health", timeout=5)
    except requests.exceptions.Timeout:
        print("API health check timed out")
        return
    except requests.exceptions.ConnectionError:
        # app.py has no script entry point, so the server is started via uvicorn.
        print("API is not running. Start it with: uvicorn app:app --port 8000")
        return

    if response.status_code == 200:
        print("API is healthy:", response.json())
    else:
        print("API health check failed:", response.status_code)
|
63 |
+
|
64 |
+
if __name__ == "__main__":
    # Banner with an underline.
    for banner_line in ("TTS API Client Example", "=" * 30):
        print(banner_line)

    # Check that the API answers before sending the real request.
    check_api_health()
    print()

    # Exercise the TTS endpoint.
    test_tts_api()
|
dockerfile
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.11

WORKDIR /app

# Install git and git-lfs (needed to clone the model repository below)
RUN apt-get update && apt-get install -y git git-lfs && rm -rf /var/lib/apt/lists/*

# Initialize git lfs
RUN git lfs install

COPY requirements.txt .

RUN pip install uv
RUN uv pip install --no-cache-dir -r requirements.txt --system

# Clone the XTTS-v2_C3PO model and verify it.
# The echo is part of the RUN chain: a bare `echo` line is not a Dockerfile
# instruction and aborts the build. The -name tests are grouped so that
# `-type f` applies to all three patterns, not only to "*.json".
RUN echo "Cloning the XTTS-v2_C3PO model..." && \
    git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO && \
    ls -la XTTS-v2_C3PO/ && \
    echo "Model directory contents:" && \
    find XTTS-v2_C3PO/ -type f \( -name "*.json" -o -name "*.pth" -o -name "*.pt" \) | head -10

COPY . .

# Expose the port uvicorn listens on (must match --port in CMD)
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
coqui-tts
|
2 |
+
pandas
|
3 |
+
scikit-learn
|
4 |
+
fastapi
|
5 |
+
uvicorn[standard]
|
6 |
+
python-multipart
|
7 |
+
torch
|
8 |
+
torchaudio
|
9 |
+
requests
|
test.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
from TTS.api import TTS

# Use the GPU when one is available. There is no `self` at module level, so
# the device is computed here instead of being read from an instance.
device = "cuda" if torch.cuda.is_available() else "cpu"

tts = TTS(
    model_path="XTTS-v2_C3PO/",
    config_path="XTTS-v2_C3PO/config.json",
    progress_bar=False,
    gpu=torch.cuda.is_available(),
).to(device)

# generate speech by cloning a voice using default settings
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav="/path/to/target/speaker.wav",
    language="en",
)
|