"""Flask API that transcribes uploaded audio with Whisper.

Endpoints:
    GET  /            Health check.
    POST /transcribe  Transcribe an uploaded audio file with word timestamps.
    GET  /models      Static information about model sizes and languages.
"""

import logging
import os
import tempfile
import time
from datetime import datetime

import whisper
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.exceptions import HTTPException
from werkzeug.utils import secure_filename

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
CORS(app)

# Configuration
MAX_UPLOAD_MB = 100  # single source of truth for the upload limit and its messages
app.config['MAX_CONTENT_LENGTH'] = MAX_UPLOAD_MB * 1024 * 1024
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'webm', 'flac'}
VALID_TASKS = ('transcribe', 'translate')

# Language codes reported by the /models endpoint.
SUPPORTED_LANGUAGES = [
    "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca",
    "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms",
    "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la",
    "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn",
    "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw",
    "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be",
    "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn",
    "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha",
    "ba", "jw", "su",
]

# Load Whisper model (you can change the model size: tiny, base, small, medium, large)
MODEL_SIZE = "base"  # Change this to your preferred model size
logger.info("Loading Whisper model: %s", MODEL_SIZE)
model = whisper.load_model(MODEL_SIZE)
logger.info("Whisper model loaded successfully")


def _file_extension(filename):
    """Return the lower-cased extension of *filename* (no dot), or '' if none."""
    if not filename or '.' not in filename:
        return ''
    return filename.rsplit('.', 1)[1].lower()


def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    return _file_extension(filename) in ALLOWED_EXTENSIONS


def format_timestamp(seconds):
    """Convert a number of seconds to ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so an input such as 59.9996 becomes ``00:01:00.000`` rather than
    the invalid ``00:00:60.000``. Negative input is clamped to zero.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


def _extract_words(segments):
    """Flatten the per-segment ``words`` lists into one list of word dicts."""
    words = []
    for segment in segments:
        for word_data in segment.get("words", []):
            start = word_data.get("start", 0)
            end = word_data.get("end", 0)
            words.append({
                "word": word_data.get("word", "").strip(),
                "start": start,
                "end": end,
                "start_formatted": format_timestamp(start),
                "end_formatted": format_timestamp(end),
                "confidence": word_data.get("probability", 0),
            })
    return words


def _remove_temp_file(path):
    """Delete *path*; a missing file is fine, other failures are only logged."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError:
        logger.warning("Could not remove temporary file %s", path, exc_info=True)


@app.route('/', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        "status": "healthy",
        "message": "Whisper Transcription API is running",
        "model": MODEL_SIZE,
        "timestamp": datetime.now().isoformat()
    })


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """
    Transcribe audio file and return word-level timestamps

    Expected form data:
    - audio_file: The audio file to transcribe
    - language (optional): Language code (e.g., 'en', 'es', 'fr');
      omitted or empty means the language is left for the model to detect
    - task (optional): 'transcribe' or 'translate' (default: transcribe)

    Returns a JSON body with the transcript, segments and per-word timings,
    or a JSON ``{"error": ...}`` body with status 400 (bad request data) or
    500 (transcription / server failure).
    """
    try:
        # Check if audio file is present
        if 'audio_file' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        file = request.files['audio_file']

        # filename may be '' or None when the form field was submitted empty
        if not file.filename:
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            # sorted() keeps the message stable; set order varies between runs
            supported = ", ".join(sorted(ALLOWED_EXTENSIONS))
            return jsonify({
                'error': f'File type not allowed. Supported formats: {supported}'
            }), 400

        # Get optional parameters; an empty language field means "auto-detect"
        language = (request.form.get('language') or '').strip() or None
        task = request.form.get('task', 'transcribe')

        if task not in VALID_TASKS:
            return jsonify({'error': 'Task must be either "transcribe" or "translate"'}), 400

        # Save uploaded file temporarily. The suffix keeps the original
        # extension on the temporary path.
        tmp_file = tempfile.NamedTemporaryFile(
            delete=False, suffix=f".{_file_extension(file.filename)}"
        )
        temp_path = tmp_file.name
        try:
            # Write through the already-open handle (re-opening by name fails
            # on Windows) and close it before the model reads the file.
            with tmp_file:
                file.save(tmp_file)

            # %r escapes control characters in the client-supplied name
            logger.info("Processing file: %r", file.filename)

            try:
                started = time.perf_counter()
                # Transcribe with word-level timestamps
                result = model.transcribe(
                    temp_path,
                    language=language,
                    task=task,
                    word_timestamps=True,
                    verbose=False
                )
                elapsed = time.perf_counter() - started

                segments = result.get("segments") or []
                word_segments = _extract_words(segments)

                # Prepare response
                response_data = {
                    "success": True,
                    "filename": secure_filename(file.filename),
                    "language": result.get("language", "unknown"),
                    "task": task,
                    # end time of the last segment, 0 when nothing was produced
                    "duration": segments[-1].get("end", 0) if segments else 0,
                    "text": result.get("text", ""),
                    "word_count": len(word_segments),
                    "segments": segments,
                    "words": word_segments,
                    "model_used": MODEL_SIZE,
                    "processing_time": round(elapsed, 3)  # seconds spent in model.transcribe
                }

                logger.info(
                    "Successfully transcribed %d words from %r",
                    len(word_segments), file.filename,
                )
                return jsonify(response_data)

            except Exception as e:
                logger.exception("Transcription error: %s", e)
                return jsonify({'error': f'Transcription failed: {str(e)}'}), 500
        finally:
            # Clean up temporary file, including when saving the upload failed
            _remove_temp_file(temp_path)

    except HTTPException:
        # Let Flask route HTTP errors (e.g. 413 raised while parsing the
        # upload) to the registered error handlers instead of masking them
        # as a generic 500.
        raise
    except Exception as e:
        logger.exception("API error: %s", e)
        return jsonify({'error': f'Server error: {str(e)}'}), 500


@app.route('/models', methods=['GET'])
def available_models():
    """Get information about available Whisper models"""
    models_info = {
        "current_model": MODEL_SIZE,
        "available_models": {
            "tiny": {"size": "~39 MB", "speed": "~32x", "accuracy": "lowest"},
            "base": {"size": "~74 MB", "speed": "~16x", "accuracy": "low"},
            "small": {"size": "~244 MB", "speed": "~6x", "accuracy": "medium"},
            "medium": {"size": "~769 MB", "speed": "~2x", "accuracy": "high"},
            "large": {"size": "~1550 MB", "speed": "~1x", "accuracy": "highest"}
        },
        "supported_languages": SUPPORTED_LANGUAGES
    }
    return jsonify(models_info)


@app.errorhandler(413)
def too_large(e):
    """Return a JSON body for uploads exceeding MAX_CONTENT_LENGTH."""
    return jsonify({'error': f'File too large. Maximum size is {MAX_UPLOAD_MB}MB'}), 413


@app.errorhandler(404)
def not_found(e):
    """Return a JSON body for unknown routes."""
    return jsonify({'error': 'Endpoint not found'}), 404


@app.errorhandler(500)
def internal_error(e):
    """Return a JSON body for unhandled server errors."""
    return jsonify({'error': 'Internal server error'}), 500


if __name__ == '__main__':
    print(f"""
    Whisper Transcription API Server
    ================================

    Model: {MODEL_SIZE}
    Endpoints:
    - GET  /           : Health check
    - POST /transcribe : Transcribe audio file
    - GET  /models     : Available models info

    Supported formats: {', '.join(sorted(ALLOWED_EXTENSIONS))}
    Max file size: {MAX_UPLOAD_MB}MB
    """)

    # The interactive debugger allows arbitrary code execution, and the server
    # binds to every interface, so debug mode must be an explicit opt-in
    # (set FLASK_DEBUG=1). Leaving it off also avoids the reloader loading the
    # model a second time.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)