Commit 9acb9c3 by Avinyaa · Parent: bbaf488 · Message: "new"

Files changed:
- .gitignore +24 -1
- Dockerfile +2 -8
- README.md +123 -37
- XTTS-v2_C3PO +0 -1
- app.py +82 -90
- client_example.py +90 -32
- requirements.txt +3 -6
- test.py +21 -8
- test_kokoro_install.py +1 -0
.gitignore
CHANGED
@@ -1 +1,24 @@
-
+# Generated audio files
+*.wav
+*.mp3
+
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+
+# Temporary files
+*.tmp
+*.temp
+
+# Environment files
+.env
+.venv/
+
+# IDE files
+.vscode/
+.idea/
+
+# OS files
+.DS_Store
+Thumbs.db
Dockerfile
CHANGED
@@ -6,8 +6,8 @@ WORKDIR /app
 ENV NUMBA_CACHE_DIR=/tmp/numba_cache
 ENV NUMBA_DISABLE_JIT=1
 
-# Install git and
-RUN apt-get update && apt-get install -y git git-lfs && rm -rf /var/lib/apt/lists/*
+# Install git, git-lfs, and espeak-ng for Kokoro TTS
+RUN apt-get update && apt-get install -y git git-lfs espeak-ng && rm -rf /var/lib/apt/lists/*
 
 # Initialize git lfs
 RUN git lfs install
@@ -17,12 +17,6 @@ COPY requirements.txt .
 RUN pip install uv
 RUN uv pip install --no-cache-dir -r requirements.txt --system
 
-# Clone the XTTS-v2_C3PO model and verify it
-RUN git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO && \
-    ls -la XTTS-v2_C3PO/ && \
-    echo "Model directory contents:" && \
-    find XTTS-v2_C3PO/ -type f -name "*.json" -o -name "*.pth" -o -name "*.pt" | head -10
-
 COPY . .
 
 # Expose the port
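Kokoro's phonemization relies on the espeak-ng package installed in the Dockerfile above. A minimal sanity check (a sketch, not part of this commit) that the runtime environment has both the binary and the Python package before starting the API:

```python
# Sketch: verify the Dockerfile's runtime dependencies are present.
# Not part of this commit; intended to be run inside the built image.
import shutil

# espeak-ng is installed via apt-get in the Dockerfile above
assert shutil.which("espeak-ng") is not None, "espeak-ng binary not found on PATH"

# kokoro comes from requirements.txt; importing KPipeline confirms it resolves
from kokoro import KPipeline
print("espeak-ng and kokoro are both available")
```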
README.md
CHANGED
@@ -1,103 +1,175 @@ (new file content)
---
title: Kokoro TTS API
emoji: 🎤
colorFrom: indigo
colorTo: yellow
sdk: docker
pinned: false
---

# Kokoro TTS API

A FastAPI-based Text-to-Speech API using Kokoro, an open-weight TTS model with 82 million parameters.

## Features

- Convert text to speech using Kokoro TTS
- Multiple voice options (af_heart, af_sky, af_bella, etc.)
- Automatic language detection
- RESTful API with automatic documentation
- Docker support
- Lightweight and fast processing
- Apache-licensed weights

## About Kokoro

[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.

## Setup

### Local Development

1. Install system dependencies:
```bash
# On Ubuntu/Debian
sudo apt-get install espeak-ng

# On macOS
brew install espeak
```

2. Install Python dependencies:
```bash
pip install -r requirements.txt
```

3. Run the API:
```bash
uvicorn app:app --host 0.0.0.0 --port 7860
```

The API will be available at `http://localhost:7860`

### Using Docker

1. Build the Docker image:
```bash
docker build -t kokoro-tts-api .
```

2. Run the container:
```bash
docker run -p 7860:7860 kokoro-tts-api
```

## API Endpoints

### Health Check
- **GET** `/health` - Check API status and device information

### Available Voices
- **GET** `/voices` - Get list of available voices

### Text-to-Speech (Form Data)
- **POST** `/tts` - Convert text to speech using form data
- **Parameters:**
  - `text` (form): Text to convert to speech
  - `voice` (form): Voice to use (default: "af_heart")
  - `lang_code` (form): Language code (default: "a" for auto-detect)

### Text-to-Speech (JSON)
- **POST** `/tts-json` - Convert text to speech using JSON request body
- **Body:** JSON object with `text`, `voice`, and `lang_code` fields

### API Documentation
- **GET** `/docs` - Interactive API documentation (Swagger UI)
- **GET** `/redoc` - Alternative API documentation

## Available Voices

- `af_heart` - Female voice (Heart)
- `af_sky` - Female voice (Sky)
- `af_bella` - Female voice (Bella)
- `af_sarah` - Female voice (Sarah)
- `af_nicole` - Female voice (Nicole)
- `am_adam` - Male voice (Adam)
- `am_michael` - Male voice (Michael)
- `am_edward` - Male voice (Edward)
- `am_lewis` - Male voice (Lewis)

## Usage Examples

### Using Python requests (Form Data)

```python
import requests

# Prepare the request
url = "http://localhost:7860/tts"
data = {
    "text": "Hello, this is Kokoro TTS in action!",
    "voice": "af_heart",
    "lang_code": "a"
}

# Make the request
response = requests.post(url, data=data)

# Save the generated audio
if response.status_code == 200:
    with open("kokoro_output.wav", "wb") as f:
        f.write(response.content)
    print("Speech generated successfully!")
```

### Using Python requests (JSON)

```python
import requests

# Prepare the JSON request
url = "http://localhost:7860/tts-json"
data = {
    "text": "Kokoro delivers high-quality speech synthesis!",
    "voice": "af_bella",
    "lang_code": "a"
}

headers = {"Content-Type": "application/json"}

# Make the request
response = requests.post(url, json=data, headers=headers)

# Save the generated audio
if response.status_code == 200:
    with open("kokoro_json_output.wav", "wb") as f:
        f.write(response.content)
    print("Speech generated successfully!")
```

### Using curl (Form Data)

```bash
curl -X POST "http://localhost:7860/tts" \
  -F "text=Hello from Kokoro TTS!" \
  -F "voice=af_heart" \
  -F "lang_code=a" \
  --output kokoro_speech.wav
```

### Using curl (JSON)

```bash
curl -X POST "http://localhost:7860/tts-json" \
  -H "Content-Type: application/json" \
  -d '{"text":"Hello from Kokoro TTS!","voice":"af_heart","lang_code":"a"}' \
  --output kokoro_speech.wav
```

### Get Available Voices

```bash
curl http://localhost:7860/voices
```

### Using the provided client example

@@ -108,11 +180,25 @@ python client_example.py (new file content, continued)

## Requirements

- Python 3.11+
- espeak-ng system package
- CUDA-compatible GPU (optional, for faster processing)

## Model Information

This API uses Kokoro TTS, which:
- Has 82 million parameters
- Supports multiple voices and languages
- Provides fast, high-quality speech synthesis
- Uses Apache-licensed weights
- Requires minimal system resources compared to larger models

## Testing

Run the standalone test:
```bash
python test.py
```

This will generate audio files demonstrating Kokoro's capabilities.
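A small end-to-end sketch that ties the documented endpoints together: query `/voices`, then request `/tts` with one of the returned voices. This assumes the server from the Setup section is running on localhost:7860; it is an illustration, not part of the commit.

```python
import requests

BASE = "http://localhost:7860"

# Pick a voice from the /voices endpoint documented above
voices = requests.get(f"{BASE}/voices").json()["voices"]
voice = voices[0]  # e.g. "af_heart"

# Request speech from the form-data /tts endpoint
resp = requests.post(
    f"{BASE}/tts",
    data={"text": "End-to-end check.", "voice": voice, "lang_code": "a"},
)
resp.raise_for_status()

with open("end_to_end_check.wav", "wb") as f:
    f.write(resp.content)
print(f"Wrote end_to_end_check.wav using voice {voice}")
```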
XTTS-v2_C3PO
DELETED
@@ -1 +0,0 @@
-Subproject commit 4a9c0315b5b82f33bced654b0773e74832f2bb9a
app.py
CHANGED
@@ -1,169 +1,161 @@ (new file content)
from fastapi import FastAPI, HTTPException, Form
from fastapi.responses import FileResponse
from pydantic import BaseModel
from kokoro import KPipeline
import soundfile as sf
import torch
import os
import tempfile
import uuid
import logging
from typing import Optional

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Kokoro TTS API", description="Text-to-Speech API using Kokoro", version="1.0.0")

class TTSRequest(BaseModel):
    text: str
    voice: str = "af_heart"
    lang_code: str = "a"

class KokoroTTSService:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        try:
            # Initialize Kokoro pipeline with default language
            self.pipeline = KPipeline(lang_code='a')
            logger.info("Kokoro TTS pipeline loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load Kokoro TTS pipeline: {e}")
            raise e

    def generate_speech(self, text: str, voice: str = "af_heart", lang_code: str = "a") -> str:
        """Generate speech and return the path to the output file"""
        try:
            # Create a unique filename for the output
            output_filename = f"kokoro_output_{uuid.uuid4().hex}.wav"
            output_path = os.path.join(tempfile.gettempdir(), output_filename)

            # Update pipeline language if different
            if self.pipeline.lang_code != lang_code:
                self.pipeline = KPipeline(lang_code=lang_code)

            # Generate speech using Kokoro
            generator = self.pipeline(text, voice=voice)

            # Get the first (and typically only) audio output
            for i, (gs, ps, audio) in enumerate(generator):
                logger.info(f"Generated audio segment {i}: gs={gs}, ps={ps}")
                # Save the audio to file
                sf.write(output_path, audio, 24000)
                break  # Take the first generated audio

            return output_path
        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")

    def get_available_voices(self):
        """Return list of available voices"""
        # Common Kokoro voices - you may want to expand this list
        return [
            "af_heart", "af_sky", "af_bella", "af_sarah", "af_nicole",
            "am_adam", "am_michael", "am_edward", "am_lewis"
        ]

# Initialize Kokoro TTS service
tts_service = KokoroTTSService()

@app.get("/")
async def root():
    return {"message": "Kokoro TTS API is running", "status": "healthy"}

@app.get("/health")
async def health_check():
    return {"status": "healthy", "device": tts_service.device}

@app.get("/voices")
async def get_voices():
    """Get list of available voices"""
    return {"voices": tts_service.get_available_voices()}

@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    voice: str = Form("af_heart"),
    lang_code: str = Form("a")
):
    """
    Convert text to speech using Kokoro TTS

    - **text**: The text to convert to speech
    - **voice**: Voice to use (default: "af_heart")
    - **lang_code**: Language code (default: "a" for auto-detect)
    """

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # Validate voice
    available_voices = tts_service.get_available_voices()
    if voice not in available_voices:
        raise HTTPException(
            status_code=400,
            detail=f"Voice '{voice}' not available. Available voices: {available_voices}"
        )

    try:
        # Generate speech
        output_path = tts_service.generate_speech(text, voice, lang_code)

        # Return the generated audio file
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"kokoro_tts_{voice}_{uuid.uuid4().hex}.wav",
            headers={"Content-Disposition": "attachment"}
        )

    except Exception as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/tts-json")
async def text_to_speech_json(request: TTSRequest):
    """
    Convert text to speech using JSON request body

    - **request**: TTSRequest containing text, voice, and lang_code
    """

    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # Validate voice
    available_voices = tts_service.get_available_voices()
    if request.voice not in available_voices:
        raise HTTPException(
            status_code=400,
            detail=f"Voice '{request.voice}' not available. Available voices: {available_voices}"
        )

    try:
        # Generate speech
        output_path = tts_service.generate_speech(request.text, request.voice, request.lang_code)

        # Return the generated audio file
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"kokoro_tts_{request.voice}_{uuid.uuid4().hex}.wav",
            headers={"Content-Disposition": "attachment"}
        )

    except Exception as e:
        logger.error(f"Error in TTS JSON endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e))
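The service class above can also be exercised directly, without the HTTP layer. A minimal sketch, assuming the file is saved as app.py and its dependencies are installed (this is an illustration, not part of the commit):

```python
# Sketch: use KokoroTTSService directly, bypassing FastAPI.
from app import KokoroTTSService

service = KokoroTTSService()
wav_path = service.generate_speech("Direct call through the service class.", voice="af_heart", lang_code="a")
print("Audio written to:", wav_path)  # a temp-dir path such as kokoro_output_<hex>.wav
```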
client_example.py
CHANGED
@@ -1,41 +1,70 @@ (new file content)
import requests
import json

def test_kokoro_tts_api():
    """Example of how to use the Kokoro TTS API"""

    # API endpoint
    url = "http://localhost:7860/tts"

    # Text to convert to speech (using the example from the user's request)
    text = """
    [Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
    """

    # Prepare the request data
    data = {
        "text": text,
        "voice": "af_heart",  # Available voices: af_heart, af_sky, af_bella, etc.
        "lang_code": "a"  # 'a' for auto-detect
    }

    try:
        print("Sending request to Kokoro TTS API...")
        response = requests.post(url, data=data)

        if response.status_code == 200:
            # Save the generated audio
            output_filename = "kokoro_generated_speech.wav"
            with open(output_filename, "wb") as f:
                f.write(response.content)
            print(f"Success! Generated speech saved as {output_filename}")
        else:
            print(f"Error: {response.status_code}")
            print(response.text)

    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
    except Exception as e:
        print(f"Error: {e}")

def test_kokoro_tts_json_api():
    """Example of using the JSON endpoint"""

    # API endpoint
    url = "http://localhost:7860/tts-json"

    # Text to convert to speech
    text = "Hello, this is a test of the Kokoro TTS system using the JSON API endpoint."

    # Prepare the JSON request
    data = {
        "text": text,
        "voice": "af_bella",
        "lang_code": "a"
    }

    headers = {
        "Content-Type": "application/json"
    }

    try:
        print("Sending JSON request to Kokoro TTS API...")
        response = requests.post(url, json=data, headers=headers)

        if response.status_code == 200:
            # Save the generated audio
            output_filename = "kokoro_json_speech.wav"
            with open(output_filename, "wb") as f:
                f.write(response.content)
            print(f"Success! Generated speech saved as {output_filename}")
@@ -44,30 +73,59 @@ def test_tts_api(): (new file content, continued)
            print(response.text)

    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
    except Exception as e:
        print(f"Error: {e}")

def get_available_voices():
    """Get list of available voices"""
    try:
        response = requests.get("http://localhost:7860/voices")
        if response.status_code == 200:
            voices = response.json()
            print("Available voices:", voices["voices"])
            return voices["voices"]
        else:
            print("Failed to get voices:", response.status_code)
            return []
    except requests.exceptions.ConnectionError:
        print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
        return []

def check_api_health():
    """Check if the API is running"""
    try:
        response = requests.get("http://localhost:7860/health")
        if response.status_code == 200:
            print("API is healthy:", response.json())
            return True
        else:
            print("API health check failed:", response.status_code)
            return False
    except requests.exceptions.ConnectionError:
        print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
        return False

if __name__ == "__main__":
    print("Kokoro TTS API Client Example")
    print("=" * 35)

    # First check if API is running
    if check_api_health():
        print()

        # Get available voices
        voices = get_available_voices()
        print()

        # Test the TTS functionality with form data
        print("Testing form-data endpoint...")
        test_kokoro_tts_api()
        print()

        # Test the TTS functionality with JSON
        print("Testing JSON endpoint...")
        test_kokoro_tts_json_api()
    else:
        print("\nPlease start the API server first:")
        print("uvicorn app:app --host 0.0.0.0 --port 7860")
requirements.txt
CHANGED
@@ -1,10 +1,7 @@ (new file content)
kokoro>=0.9.2
soundfile
fastapi
uvicorn[standard]
python-multipart
torch
torchaudio
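A quick way to confirm that the dependencies listed above actually resolved after installation (a sketch, not part of the repo):

```python
# Sketch: report installed versions of the packages listed in requirements.txt.
from importlib.metadata import version, PackageNotFoundError

for pkg in ["kokoro", "soundfile", "fastapi", "uvicorn", "python-multipart", "torch", "torchaudio"]:
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: NOT INSTALLED")
```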
test.py
CHANGED
@@ -1,10 +1,23 @@ (new file content)
from kokoro import KPipeline
import soundfile as sf
import torch

# Initialize Kokoro pipeline
pipeline = KPipeline(lang_code='a')

# Text to convert to speech
text = '''
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
'''

# Generate speech using Kokoro
generator = pipeline(text, voice='af_heart')

# Process and save the generated audio
for i, (gs, ps, audio) in enumerate(generator):
    print(f"Segment {i}: gs={gs}, ps={ps}")
    # Save each segment as a separate file
    sf.write(f'{i}.wav', audio, 24000)
    print(f"Saved segment {i} as {i}.wav")

print("Speech generation completed!")
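test.py writes one WAV file per generated segment; for longer texts the Kokoro pipeline may yield several segments. A variant that stitches them into a single file, under the same assumptions as test.py (a sketch, not part of the commit):

```python
# Sketch: collect every segment from the Kokoro generator and write one combined WAV.
import numpy as np
import soundfile as sf
from kokoro import KPipeline

pipeline = KPipeline(lang_code='a')
segments = [
    np.asarray(audio)
    for _, _, audio in pipeline("A longer passage that Kokoro may split into several segments.", voice='af_heart')
]
sf.write("combined.wav", np.concatenate(segments), 24000)
print(f"Wrote combined.wav from {len(segments)} segment(s)")
```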
test_kokoro_install.py
ADDED
@@ -0,0 +1 @@
+
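test_kokoro_install.py is added essentially empty in this commit. A hypothetical install check it might eventually hold (purely illustrative, not the author's content) could look like this:

```python
# Hypothetical content for test_kokoro_install.py (the committed file is empty).
# Checks that the Kokoro stack imports and can synthesize a short utterance.
import soundfile as sf
from kokoro import KPipeline

pipeline = KPipeline(lang_code='a')
for _, _, audio in pipeline("Kokoro install check.", voice='af_heart'):
    sf.write("install_check.wav", audio, 24000)
    break
print("Kokoro TTS is installed and working; wrote install_check.wav")
```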