Avinyaa committed on
Commit
9a88d9c
·
1 Parent(s): c53fcc3
Files changed (6) hide show
  1. README.md +109 -1
  2. app.py +169 -0
  3. client_example.py +73 -0
  4. dockerfile +28 -0
  5. requirements.txt +9 -0
  6. test.py +10 -0
README.md CHANGED
@@ -7,4 +7,112 @@ sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  pinned: false
8
  ---
9
 
10
+ # TTS API
11
+
12
+ A FastAPI-based Text-to-Speech API using XTTS-v2 for voice cloning.
13
+
14
+ ## Features
15
+
16
+ - Convert text to speech using voice cloning
17
+ - Upload reference speaker audio files
18
+ - Support for multiple languages
19
+ - RESTful API with automatic documentation
20
+ - Docker support
21
+
22
+ ## Setup
23
+
24
+ ### Local Development
25
+
26
+ 1. Install dependencies:
27
+ ```bash
28
+ pip install -r requirements.txt
29
+ ```
30
+
31
+ 2. Run the API:
32
+ ```bash
33
+ uvicorn app:app --host 0.0.0.0 --port 8000
34
+ ```
35
+
36
+ The API will be available at `http://localhost:8000`
37
+
38
+ ### Using Docker
39
+
40
+ 1. Build the Docker image:
41
+ ```bash
42
+ docker build -t tts-api .
43
+ ```
44
+
45
+ 2. Run the container:
46
+ ```bash
47
+ docker run -p 8000:7860 tts-api
48
+ ```
49
+
50
+ ## API Endpoints
51
+
52
+ ### Health Check
53
+ - **GET** `/health` - Check API status
54
+
55
+ ### Text-to-Speech
56
+ - **POST** `/tts` - Convert text to speech with uploaded speaker file
57
+ - **Parameters:**
58
+ - `text` (form): Text to convert to speech
59
+ - `language` (form): Language code (default: "en")
60
+ - `speaker_file` (file): Reference speaker audio file
61
+
62
+ ### API Documentation
63
+ - **GET** `/docs` - Interactive API documentation (Swagger UI)
64
+ - **GET** `/redoc` - Alternative API documentation
65
+
66
+ ## Usage Examples
67
+
68
+ ### Using Python requests
69
+
70
+ ```python
71
+ import requests
72
+
73
+ # Prepare the request
74
+ url = "http://localhost:8000/tts"
75
+ data = {
76
+ "text": "Hello, this is a test of voice cloning!",
77
+ "language": "en"
78
+ }
79
+ files = {
80
+ "speaker_file": open("path/to/speaker.wav", "rb")
81
+ }
82
+
83
+ # Make the request
84
+ response = requests.post(url, data=data, files=files)
85
+
86
+ # Save the generated audio
87
+ if response.status_code == 200:
88
+ with open("output.wav", "wb") as f:
89
+ f.write(response.content)
90
+ print("Speech generated successfully!")
91
+ ```
92
+
93
+ ### Using curl
94
+
95
+ ```bash
96
+ curl -X POST "http://localhost:8000/tts" \
97
+ -F "text=Hello, this is a test!" \
98
+ -F "language=en" \
99
+ -F "speaker_file=@path/to/speaker.wav" \
100
+ --output generated_speech.wav
101
+ ```
102
+
103
+ ### Using the provided client example
104
+
105
+ ```bash
106
+ python client_example.py
107
+ ```
108
+
109
+ ## Requirements
110
+
111
+ - Python 3.8+
112
+ - CUDA-compatible GPU (recommended for faster processing)
113
+ - Audio file in supported format (WAV, MP3, etc.) for speaker reference
114
+
115
+ ## Model
116
+
117
+ This API uses the XTTS-v2_C3PO model for voice cloning, which is automatically downloaded when building the Docker image.
118
+
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form
2
+ from fastapi.responses import FileResponse
3
+ from pydantic import BaseModel
4
+ from TTS.api import TTS
5
+ import os
6
+ import tempfile
7
+ import uuid
8
+ import torch
9
+ from typing import Optional
10
+ import logging
11
+
12
# Send INFO-and-above records through the root handler; this module logs
# through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ASGI application object; the route decorators below register onto it.
app = FastAPI(
    title="TTS API",
    description="Text-to-Speech API using XTTS-v2",
    version="1.0.0",
)
17
+
18
class TTSRequest(BaseModel):
    # JSON request body accepted by the /tts-with-url endpoint.
    # (Plain comments rather than a docstring so the generated schema is unchanged.)

    # Text to synthesize.
    text: str
    # Language code passed to the model; English when omitted.
    language: str = "en"
21
+
22
class TTSService:
    """Load an XTTS model once and synthesize speech that clones a reference voice.

    The custom model under ``MODEL_DIR`` is preferred. When its ``config.json``
    is missing, or loading it raises, the stock model named by
    ``DEFAULT_MODEL_NAME`` is loaded instead.
    """

    MODEL_DIR = "/app/XTTS-v2_C3PO"
    DEFAULT_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

    def __init__(self):
        """Select the device and load the custom model, falling back to the default.

        Raises:
            Exception: whatever the TTS library raises if the default model
                cannot be loaded either.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Absolute paths so loading does not depend on the working directory.
        model_path = self.MODEL_DIR + "/"
        config_path = os.path.join(self.MODEL_DIR, "config.json")

        if not os.path.exists(config_path):
            logger.warning(f"Custom model config not found at {config_path}")
            # Log what is actually on disk to make a broken image easy to diagnose.
            if os.path.exists(self.MODEL_DIR):
                logger.info(f"Contents of {self.MODEL_DIR}: {os.listdir(self.MODEL_DIR)}")
            else:
                logger.warning(f"Model directory {self.MODEL_DIR} does not exist")
            self.tts = self._load_default_model()
            return

        try:
            self.tts = TTS(
                model_path=model_path,
                config_path=config_path,
                progress_bar=False,
                gpu=torch.cuda.is_available(),
            ).to(self.device)
            logger.info("Custom TTS model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load custom TTS model: {e}")
            self.tts = self._load_default_model()

    def _load_default_model(self):
        """Load the stock XTTS model onto ``self.device`` and return it."""
        logger.info("Falling back to default XTTS model")
        try:
            model = TTS(self.DEFAULT_MODEL_NAME).to(self.device)
        except Exception as e:
            logger.error(f"Failed to load default TTS model: {e}")
            raise  # bare raise keeps the original traceback
        logger.info("Default TTS model loaded successfully")
        return model

    def generate_speech(self, text: str, speaker_wav_path: str, language: str = "en") -> str:
        """Synthesize ``text`` in the reference speaker's voice.

        Args:
            text: Text to speak.
            speaker_wav_path: Path to the reference speaker audio file.
            language: Language code forwarded to the model.

        Returns:
            Path of the generated WAV file inside the system temp directory.
            The caller owns the file and is responsible for deleting it.

        Raises:
            HTTPException: status 500 when synthesis fails.
        """
        # Unique name so concurrent requests never write to the same file.
        output_path = os.path.join(
            tempfile.gettempdir(), f"output_{uuid.uuid4().hex}.wav"
        )
        try:
            self.tts.tts_to_file(
                text=text,
                file_path=output_path,
                speaker_wav=speaker_wav_path,
                language=language,
            )
        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            # Do not leave a partially written file behind.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise HTTPException(
                status_code=500, detail=f"Failed to generate speech: {str(e)}"
            ) from e
        return output_path
89
+
90
+ # Initialize TTS service
91
+ tts_service = TTSService()
92
+
93
@app.get("/")
async def root():
    # Landing route: a fixed payload that shows the service is up.
    payload = {"message": "TTS API is running", "status": "healthy"}
    return payload
96
+
97
@app.get("/health")
async def health_check():
    # Liveness probe; also reports which device the model was placed on.
    return dict(status="healthy", device=tts_service.device)
100
+
101
def _remove_files(*paths):
    """Best-effort removal of temporary files; a file that is already gone is fine."""
    for path in paths:
        if not path:
            continue
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError as exc:
            logger.warning(f"Could not remove temporary file {path}: {exc}")


@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("en"),
    speaker_file: UploadFile = File(...),
):
    """
    Convert text to speech using a reference speaker voice

    - **text**: The text to convert to speech
    - **language**: Language code (default: "en")
    - **speaker_file**: Audio file containing the reference speaker voice
    """
    # Imported locally so this block does not depend on edits to the file's imports.
    from fastapi import BackgroundTasks

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # content_type is None when the client sends no Content-Type for the part;
    # treat that as "not audio" instead of crashing with AttributeError.
    if not (speaker_file.content_type or "").startswith("audio/"):
        raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

    # Per-request unique path: a shared fixed file would let concurrent
    # requests overwrite each other's reference audio.
    speaker_temp_path = os.path.join(
        tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav"
    )

    try:
        with open(speaker_temp_path, "wb") as buffer:
            buffer.write(await speaker_file.read())

        output_path = tts_service.generate_speech(text, speaker_temp_path, language)
    except HTTPException:
        # Already a well-formed HTTP error from the service; pass it through.
        _remove_files(speaker_temp_path)
        raise
    except Exception as e:
        _remove_files(speaker_temp_path)
        logger.error(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Delete both temp files only after the response body has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(_remove_files, speaker_temp_path, output_path)

    # No explicit Content-Disposition header: FileResponse derives
    # `attachment; filename="..."` from `filename`, and a hand-set header
    # would take precedence and drop the filename.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=f"tts_output_{uuid.uuid4().hex}.wav",
        background=cleanup,
    )
149
+
150
@app.post("/tts-with-url")
async def text_to_speech_with_url(request: TTSRequest, speaker_wav_url: str):
    """
    Convert text to speech using a reference speaker voice from URL

    - **request**: TTSRequest containing text and language
    - **speaker_wav_url**: URL to the reference speaker audio file

    Not implemented yet: always responds 501 for non-empty text.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # Raised outside any broad `except Exception`, which would otherwise catch
    # this HTTPException and turn the intended 501 into a 500.
    # TODO: validate and download `speaker_wav_url`, then reuse the /tts flow.
    raise HTTPException(
        status_code=501, detail="URL-based speaker input not implemented yet"
    )
client_example.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+
4
def test_tts_api():
    """Example of how to use the TTS API.

    Posts a fixed sentence plus a reference speaker file to ``/tts`` and saves
    the returned audio as ``generated_speech.wav``. Prints a message and
    returns early when the reference file does not exist.
    """
    # API endpoint
    url = "http://localhost:8000/tts"

    # Text to convert to speech
    text = "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."

    # Path to your speaker reference audio file
    speaker_file_path = "/path/to/target/speaker.wav"  # Update this path

    if not os.path.exists(speaker_file_path):
        print(f"Error: Speaker file not found at {speaker_file_path}")
        print("Please update the speaker_file_path variable with a valid audio file path")
        return

    data = {"text": text, "language": "en"}

    try:
        print("Sending request to TTS API...")
        # `with` closes the handle on every path. The timeout is
        # (connect, read); synthesis can be slow, so the read limit is
        # generous but finite.
        with open(speaker_file_path, "rb") as speaker_file:
            response = requests.post(
                url,
                data=data,
                files={"speaker_file": speaker_file},
                timeout=(5, 300),
            )

        if response.status_code == 200:
            # Save the generated audio
            output_filename = "generated_speech.wav"
            with open(output_filename, "wb") as f:
                f.write(response.content)
            print(f"Success! Generated speech saved as {output_filename}")
        else:
            print(f"Error: {response.status_code}")
            print(response.text)

    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API. Make sure the server is running on http://localhost:8000")
    except requests.exceptions.Timeout:
        print("Error: The API did not respond in time")
    except Exception as e:
        print(f"Error: {e}")
52
+
53
def check_api_health():
    """Check if the API is running and print the result."""
    try:
        # Finite timeout so an unresponsive server cannot hang the script.
        response = requests.get("http://localhost:8000/health", timeout=5)
    except requests.exceptions.ConnectionError:
        # app.py has no __main__ runner, so the server is started through uvicorn.
        print("API is not running. Start it with: uvicorn app:app --port 8000")
        return
    except requests.exceptions.RequestException as e:
        print("API health check failed:", e)
        return

    if response.status_code == 200:
        print("API is healthy:", response.json())
    else:
        print("API health check failed:", response.status_code)
63
+
64
if __name__ == "__main__":
    # Banner: title line followed by a 30-character rule.
    print("TTS API Client Example", "=" * 30, sep="\n")

    # Probe the server first, then leave a blank line before the real request.
    check_api_health()
    print()

    # Exercise the TTS endpoint.
    test_tts_api()
dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

WORKDIR /app

# Install git and git-lfs (used below to clone the model repository)
RUN apt-get update && apt-get install -y git git-lfs && rm -rf /var/lib/apt/lists/*

# Initialize git lfs
RUN git lfs install

COPY requirements.txt .

RUN pip install uv
RUN uv pip install --no-cache-dir -r requirements.txt --system

# Clone the XTTS-v2_C3PO model and verify it.
# The echo must be part of a RUN instruction: a bare `echo` line is not a
# Dockerfile instruction and aborts the build. The find predicates are grouped
# so that `-type f` applies to every pattern, not only the first.
RUN echo "Cloning the XTTS-v2_C3PO model..." && \
    git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO && \
    ls -la XTTS-v2_C3PO/ && \
    echo "Model directory contents:" && \
    find XTTS-v2_C3PO/ -type f \( -name "*.json" -o -name "*.pth" -o -name "*.pt" \) | head -10

COPY . .

# Expose the port the server actually listens on (see CMD)
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ coqui-tts
2
+ pandas
3
+ scikit-learn
4
+ fastapi
5
+ uvicorn[standard]
6
+ python-multipart
7
+ torch
8
+ torchaudio
9
+ requests
test.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from TTS.api import TTS

# Smoke-test script: load the custom XTTS model and clone a voice into output.wav.

# Use the GPU when one is available, otherwise fall back to the CPU.
# (The original referenced `self.device` at module level, which raises NameError.)
device = "cuda" if torch.cuda.is_available() else "cpu"

tts = TTS(
    model_path="XTTS-v2_C3PO/",
    config_path="XTTS-v2_C3PO/config.json",
    progress_bar=False,
    gpu=(device == "cuda"),
).to(device)

# generate speech by cloning a voice using default settings
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav="/path/to/target/speaker.wav",
    language="en",
)