# Spaces: Running
from flask import Flask, request, jsonify | |
import whisper | |
import tempfile | |
import os | |
from flask_cors import CORS | |
from werkzeug.utils import secure_filename | |
import logging | |
from datetime import datetime | |
# --- Logging -----------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Flask application -------------------------------------------------------
app = Flask(__name__)
CORS(app)

# Reject request bodies larger than 100 MB.
app.config.update(MAX_CONTENT_LENGTH=100 * 1024 * 1024)

# File extensions (lower-case, without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {
    'wav', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'webm', 'flac',
}

# --- Whisper model -----------------------------------------------------------
# Model size to load; one of: tiny, base, small, medium, large.
MODEL_SIZE = "base"

# The model is loaded once at import time and shared by every request.
logger.info(f"Loading Whisper model: {MODEL_SIZE}")
model = whisper.load_model(MODEL_SIZE)
logger.info("Whisper model loaded successfully")
def allowed_file(filename):
    """Return True if *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared
    case-insensitively. A missing filename (``None`` or ``""``) or a name
    without a dot is rejected instead of raising.
    """
    if not filename:
        # Uploads may arrive without a filename; treat that as "not allowed"
        # rather than failing on the string operations below.
        return False
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
def format_timestamp(seconds):
    """Convert a duration in seconds to an ``HH:MM:SS.mmm`` string.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so a value such as 59.9996 rolls over to ``00:01:00.000``
    instead of producing an invalid ``00:00:60.000``. Negative inputs are
    clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
@app.route('/', methods=['GET'])
def health_check():
    """Health check endpoint (GET /).

    Returns a JSON object with a fixed "healthy" status, a human-readable
    message, the configured Whisper model size and the current server time
    in ISO 8601 format.
    """
    return jsonify({
        "status": "healthy",
        "message": "Whisper Transcription API is running",
        "model": MODEL_SIZE,
        "timestamp": datetime.now().isoformat()
    })
def _extract_word_segments(result):
    """Flatten the per-segment word lists of a Whisper result into one list."""
    word_segments = []
    for segment in result.get("segments", []):
        for word_data in segment.get("words") or []:
            start = word_data.get("start", 0)
            end = word_data.get("end", 0)
            word_segments.append({
                "word": word_data.get("word", "").strip(),
                "start": start,
                "end": end,
                "start_formatted": format_timestamp(start),
                "end_formatted": format_timestamp(end),
                "confidence": word_data.get("probability", 0)
            })
    return word_segments


def _remove_temp_file(path):
    """Delete *path*, logging (not raising) if the file cannot be removed."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        logger.warning("Could not remove temporary file %s: %s", path, exc)


def _transcribe_saved_file(temp_path, original_filename, language, task):
    """Run Whisper on *temp_path* and build the JSON response for the client."""
    try:
        # Transcribe with word-level timestamps
        result = model.transcribe(
            temp_path,
            language=language,
            task=task,
            word_timestamps=True,
            verbose=False
        )
        word_segments = _extract_word_segments(result)
        segments = result.get("segments") or []

        response_data = {
            "success": True,
            "filename": secure_filename(original_filename),
            "language": result.get("language", "unknown"),
            "task": task,
            # The end time of the last segment is used as the duration.
            "duration": segments[-1].get("end", 0) if segments else 0,
            "text": result.get("text", ""),
            "word_count": len(word_segments),
            "segments": segments,
            "words": word_segments,
            "model_used": MODEL_SIZE,
            "processing_time": None  # You can add timing if needed
        }
        logger.info(
            "Successfully transcribed %d words from %s",
            len(word_segments), original_filename,
        )
        return jsonify(response_data)
    except Exception as e:
        logger.exception("Transcription error: %s", e)
        return jsonify({'error': f'Transcription failed: {str(e)}'}), 500


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """
    Transcribe audio file and return word-level timestamps (POST /transcribe)

    Expected form data:
    - audio_file: The audio file to transcribe
    - language (optional): Language code (e.g., 'en', 'es', 'fr');
      a missing or blank value is passed to the model as None
    - task (optional): 'transcribe' or 'translate' (default: transcribe)

    Returns a JSON body with the full text, the model's segments and a flat
    list of words with start/end times. Validation failures return 400,
    transcription failures and unexpected errors return 500. HTTP errors
    raised by Flask/Werkzeug while reading the request (for example 413 for
    an oversized upload) are re-raised so the framework answers with the
    proper status code instead of a generic 500.
    """
    # Imported locally so this block does not depend on a file-level import.
    from werkzeug.exceptions import HTTPException

    try:
        # Check if audio file is present
        if 'audio_file' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        file = request.files['audio_file']
        if not file.filename:
            # Covers both an empty string and a missing (None) filename.
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            supported = ", ".join(sorted(ALLOWED_EXTENSIONS))
            return jsonify({
                'error': f'File type not allowed. Supported formats: {supported}'
            }), 400

        # Get optional parameters; a blank language becomes None.
        language = (request.form.get('language') or '').strip() or None
        task = request.form.get('task', 'transcribe')
        if task not in ['transcribe', 'translate']:
            return jsonify({'error': 'Task must be either "transcribe" or "translate"'}), 400

        # Reserve a temporary path that keeps the upload's extension, and
        # close our handle before writing so the file is not opened twice.
        extension = file.filename.rsplit('.', 1)[1].lower()
        fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
        os.close(fd)
        try:
            file.save(temp_path)
            logger.info("Processing file: %s", file.filename)
            return _transcribe_saved_file(temp_path, file.filename, language, task)
        finally:
            # Clean up the temporary file even if saving the upload failed.
            _remove_temp_file(temp_path)
    except HTTPException:
        raise
    except Exception as e:
        logger.exception("API error: %s", e)
        return jsonify({'error': f'Server error: {str(e)}'}), 500
@app.route('/models', methods=['GET'])
def available_models():
    """Get information about available Whisper models (GET /models).

    Returns a JSON object with the currently loaded model size, a static
    table describing each model size, and a static list of language codes.
    """
    models_info = {
        "current_model": MODEL_SIZE,
        "available_models": {
            "tiny": {"size": "~39 MB", "speed": "~32x", "accuracy": "lowest"},
            "base": {"size": "~74 MB", "speed": "~16x", "accuracy": "low"},
            "small": {"size": "~244 MB", "speed": "~6x", "accuracy": "medium"},
            "medium": {"size": "~769 MB", "speed": "~2x", "accuracy": "high"},
            "large": {"size": "~1550 MB", "speed": "~1x", "accuracy": "highest"}
        },
        "supported_languages": [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl",
            "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro",
            "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy",
            "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu",
            "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km",
            "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo",
            "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg",
            "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
        ]
    }
    return jsonify(models_info)
@app.errorhandler(413)
def too_large(e):
    """Return a JSON error body for HTTP 413 (request entity too large)."""
    return jsonify({'error': 'File too large. Maximum size is 100MB'}), 413
@app.errorhandler(404)
def not_found(e):
    """Return a JSON error body for HTTP 404 (unknown endpoint)."""
    return jsonify({'error': 'Endpoint not found'}), 404
@app.errorhandler(500)
def internal_error(e):
    """Return a JSON error body for HTTP 500 (unhandled server error)."""
    return jsonify({'error': 'Internal server error'}), 500
if __name__ == '__main__':
    # Print a short startup banner describing the service.
    print(f"""
Whisper Transcription API Server
================================
Model: {MODEL_SIZE}
Endpoints:
- GET / : Health check
- POST /transcribe : Transcribe audio file
- GET /models : Available models info
Supported formats: {', '.join(sorted(ALLOWED_EXTENSIONS))}
Max file size: 100MB
""")
    # The server listens on all interfaces, so the Werkzeug interactive
    # debugger (which allows arbitrary code execution) must not be enabled
    # by default. Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in {
        '1', 'true', 'yes', 'on',
    }
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)