from flask import Flask, request, jsonify
import whisper
import tempfile
import os
from flask_cors import CORS
from werkzeug.utils import secure_filename
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__)
CORS(app)
# Configuration
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024 # 100MB max file size
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'webm', 'flac'}
# Load Whisper model (you can change the model size: tiny, base, small, medium, large)
MODEL_SIZE = "base" # Change this to your preferred model size
logger.info(f"Loading Whisper model: {MODEL_SIZE}")
model = whisper.load_model(MODEL_SIZE)
logger.info("Whisper model loaded successfully")
def allowed_file(filename):
"""Check if the file extension is allowed"""
return '.' in filename and \
filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
def format_timestamp(seconds):
"""Convert seconds to HH:MM:SS.mmm format"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = seconds % 60
return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
@app.route('/', methods=['GET'])
def health_check():
"""Health check endpoint"""
return jsonify({
"status": "healthy",
"message": "Whisper Transcription API is running",
"model": MODEL_SIZE,
"timestamp": datetime.now().isoformat()
})
@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
"""
Transcribe audio file and return word-level timestamps
Expected form data:
- audio_file: The audio file to transcribe
- language (optional): Language code (e.g., 'en', 'es', 'fr')
- task (optional): 'transcribe' or 'translate' (default: transcribe)
"""
try:
# Check if audio file is present
if 'audio_file' not in request.files:
return jsonify({'error': 'No audio file provided'}), 400
file = request.files['audio_file']
if file.filename == '':
return jsonify({'error': 'No file selected'}), 400
if not allowed_file(file.filename):
return jsonify({
'error': f'File type not allowed. Supported formats: {", ".join(ALLOWED_EXTENSIONS)}'
}), 400
# Get optional parameters
language = request.form.get('language', None)
task = request.form.get('task', 'transcribe')
if task not in ['transcribe', 'translate']:
return jsonify({'error': 'Task must be either "transcribe" or "translate"'}), 400
# Save uploaded file temporarily
with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file.filename.rsplit('.', 1)[1].lower()}") as tmp_file:
file.save(tmp_file.name)
temp_path = tmp_file.name
logger.info(f"Processing file: {file.filename}")
try:
# Transcribe with word-level timestamps
result = model.transcribe(
temp_path,
language=language,
task=task,
word_timestamps=True,
verbose=False
)
# Extract word-level data
word_segments = []
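            # Each collected entry has the shape (values illustrative):
            #   {"word": "hello", "start": 0.0, "end": 0.42,
            #    "start_formatted": "00:00:00.000", "end_formatted": "00:00:00.420",
            #    "confidence": 0.98}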
for segment in result.get("segments", []):
if "words" in segment:
for word_data in segment["words"]:
word_segments.append({
"word": word_data.get("word", "").strip(),
"start": word_data.get("start", 0),
"end": word_data.get("end", 0),
"start_formatted": format_timestamp(word_data.get("start", 0)),
"end_formatted": format_timestamp(word_data.get("end", 0)),
"confidence": word_data.get("probability", 0)
})
# Prepare response
response_data = {
"success": True,
"filename": secure_filename(file.filename),
"language": result.get("language", "unknown"),
"task": task,
"duration": result.get("segments", [{}])[-1].get("end", 0) if result.get("segments") else 0,
"text": result.get("text", ""),
"word_count": len(word_segments),
"segments": result.get("segments", []),
"words": word_segments,
"model_used": MODEL_SIZE,
"processing_time": None # You can add timing if needed
}
logger.info(f"Successfully transcribed {len(word_segments)} words from {file.filename}")
return jsonify(response_data)
except Exception as e:
logger.error(f"Transcription error: {str(e)}")
return jsonify({'error': f'Transcription failed: {str(e)}'}), 500
finally:
# Clean up temporary file
if os.path.exists(temp_path):
os.unlink(temp_path)
except Exception as e:
logger.error(f"API error: {str(e)}")
return jsonify({'error': f'Server error: {str(e)}'}), 500
@app.route('/models', methods=['GET'])
def available_models():
"""Get information about available Whisper models"""
models_info = {
"current_model": MODEL_SIZE,
"available_models": {
"tiny": {"size": "~39 MB", "speed": "~32x", "accuracy": "lowest"},
"base": {"size": "~74 MB", "speed": "~16x", "accuracy": "low"},
"small": {"size": "~244 MB", "speed": "~6x", "accuracy": "medium"},
"medium": {"size": "~769 MB", "speed": "~2x", "accuracy": "high"},
"large": {"size": "~1550 MB", "speed": "~1x", "accuracy": "highest"}
},
"supported_languages": [
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl",
"ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro",
"da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy",
"sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu",
"is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km",
"sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo",
"uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg",
"as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
]
}
return jsonify(models_info)
@app.errorhandler(413)
def too_large(e):
return jsonify({'error': 'File too large. Maximum size is 100MB'}), 413
@app.errorhandler(404)
def not_found(e):
return jsonify({'error': 'Endpoint not found'}), 404
@app.errorhandler(500)
def internal_error(e):
return jsonify({'error': 'Internal server error'}), 500
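# ---------------------------------------------------------------------------
# Example client usage (a sketch, not part of the server; assumes the server
# is running locally on port 7860 and that "speech.wav" is an existing file):
#
#   import requests
#
#   with open("speech.wav", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/transcribe",
#           files={"audio_file": f},
#           data={"language": "en", "task": "transcribe"},
#       )
#   resp.raise_for_status()
#   data = resp.json()
#   for w in data["words"]:
#       print(w["start_formatted"], w["end_formatted"], w["word"])
# ---------------------------------------------------------------------------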
if __name__ == '__main__':
print(f"""
Whisper Transcription API Server
================================
Model: {MODEL_SIZE}
Endpoints:
- GET / : Health check
- POST /transcribe : Transcribe audio file
- GET /models : Available models info
Supported formats: {', '.join(ALLOWED_EXTENSIONS)}
Max file size: 100MB
""")
    app.run(debug=True, host='0.0.0.0', port=7860)