"""Flask API exposing OpenAI Whisper transcription with word-level timestamps.

Designed to run on Hugging Face Spaces: the Whisper model cache is redirected
to a writable directory, and several fallbacks handle the commonly-conflicting
'whisper' PyPI packages.
"""

from flask_cors import CORS
from flask import Flask, request, jsonify
import tempfile
import os
from werkzeug.utils import secure_filename
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
CORS(app)

# Configuration
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100MB max file size
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'webm', 'flac'}

# Initialize model variable
model = None
# Use 'tiny' for faster loading on Hugging Face Spaces; change to 'base',
# 'small', 'medium', or 'large' as needed.
MODEL_SIZE = "medium"


def setup_cache_directory():
    """Create a writable Whisper cache directory and point cache env vars at it.

    Hugging Face Spaces may not allow writes to the default ~/.cache location,
    so we use a directory under the current working directory instead.

    Returns:
        str: Absolute path of the cache directory.
    """
    cache_dir = os.path.join(os.getcwd(), ".whisper_cache")
    os.makedirs(cache_dir, exist_ok=True)

    # Whisper honours XDG_CACHE_HOME when picking its download location.
    os.environ['XDG_CACHE_HOME'] = cache_dir
    os.environ['WHISPER_CACHE'] = cache_dir

    logger.info(f"Cache directory set to: {cache_dir}")
    return cache_dir


def load_whisper_model():
    """Load the Whisper model into the module-level ``model`` variable.

    Handles the two import names the package may be installed under, detects
    the wrong 'whisper' PyPI package (which lacks ``load_model``), and falls
    back to alternative cache directories on permission errors.

    Returns:
        bool: True if the model loaded successfully, False otherwise.
    """
    global model
    try:
        # Setup cache directory first
        cache_dir = setup_cache_directory()

        # Resolve the whisper module under either of its import names.
        whisper_module = None
        try:
            import whisper as whisper_module
        except ImportError:
            pass
        if whisper_module is None:
            try:
                import openai_whisper as whisper_module
            except ImportError:
                pass

        if whisper_module is None:
            logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper")
            return False

        # A different PyPI package is also named 'whisper'; it has no load_model.
        if not hasattr(whisper_module, 'load_model'):
            logger.error("Whisper module found but missing 'load_model' function")
            logger.error("This suggests you have the wrong 'whisper' package installed")
            logger.error("Solution:")
            logger.error("1. pip uninstall whisper")
            logger.error("2. pip uninstall openai-whisper (if exists)")
            logger.error("3. pip install openai-whisper")
            logger.error("4. pip install torch torchaudio")
            return False

        logger.info(f"Loading Whisper model: {MODEL_SIZE}")
        logger.info(f"Using cache directory: {cache_dir}")

        # Load model with explicit download root when supported.
        try:
            model = whisper_module.load_model(MODEL_SIZE, download_root=cache_dir)
        except TypeError:
            # Fallback if the download_root parameter is not supported.
            model = whisper_module.load_model(MODEL_SIZE)

        logger.info("Whisper model loaded successfully")
        return True

    except ImportError as e:
        logger.error(f"Import error: {e}")
        logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper torch torchaudio")
        return False
    except AttributeError as e:
        logger.error(f"Whisper import error: {e}")
        logger.error("Make sure you have the correct whisper package installed:")
        logger.error("Solution:")
        logger.error("1. pip uninstall whisper")
        logger.error("2. pip install openai-whisper torch torchaudio")
        return False
    except PermissionError as e:
        logger.error(f"Permission error: {e}")
        logger.error("Cannot write to cache directory. This might be a Hugging Face Spaces limitation.")
        logger.error("Trying alternative cache locations...")

        # Try alternative cache locations until one is writable.
        alternative_dirs = [
            "/tmp/.whisper_cache",
            os.path.expanduser("~/.whisper_cache"),
            "./whisper_models",
        ]
        for alt_dir in alternative_dirs:
            try:
                os.makedirs(alt_dir, exist_ok=True)
                os.environ['XDG_CACHE_HOME'] = alt_dir
                os.environ['WHISPER_CACHE'] = alt_dir
                logger.info(f"Trying alternative cache: {alt_dir}")

                import whisper
                model = whisper.load_model(MODEL_SIZE, download_root=alt_dir)
                logger.info(f"Successfully loaded model with cache: {alt_dir}")
                return True
            except Exception as alt_e:
                logger.warning(f"Alternative cache {alt_dir} failed: {alt_e}")
                continue

        logger.error("All cache directory attempts failed")
        return False
    except Exception as e:
        logger.error(f"Error loading Whisper model: {e}")
        logger.error("This could be due to:")
        logger.error("- Insufficient memory")
        logger.error("- Missing PyTorch/CUDA dependencies")
        logger.error("- Network issues downloading the model")
        logger.error("- Hugging Face Spaces limitations")
        return False


# Try to load the model at startup
model_loaded = load_whisper_model()


def allowed_file(filename):
    """Check if the file extension is allowed."""
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def format_timestamp(seconds):
    """Convert seconds to HH:MM:SS.mmm format."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = seconds % 60
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"


@app.route('/', methods=['GET'])
def health_check():
    """Health check endpoint."""
    return jsonify({
        "status": "healthy" if model_loaded else "unhealthy",
        "message": "Whisper Transcription API is running" if model_loaded else "Whisper model failed to load",
        "model": MODEL_SIZE if model_loaded else "none",
        "model_loaded": model_loaded,
        "timestamp": datetime.now().isoformat()
    })


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe an audio file and return word-level timestamps.

    Expected form data:
        - audio_file: The audio file to transcribe
        - language (optional): Language code (e.g., 'en', 'es', 'fr')
        - task (optional): 'transcribe' or 'translate' (default: transcribe)
    """
    try:
        # Check if model is loaded
        if not model_loaded or model is None:
            return jsonify({
                'error': 'Whisper model not loaded. Please check server logs and ensure openai-whisper is installed correctly.'
            }), 503

        # Check if audio file is present
        if 'audio_file' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        file = request.files['audio_file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({
                'error': f'File type not allowed. Supported formats: {", ".join(ALLOWED_EXTENSIONS)}'
            }), 400

        # Get optional parameters
        language = request.form.get('language', None)
        task = request.form.get('task', 'transcribe')
        if task not in ['transcribe', 'translate']:
            return jsonify({'error': 'Task must be either "transcribe" or "translate"'}), 400

        # Save the upload to a temp file. delete=False because Whisper reopens
        # the path itself; temp_path starts as None so the finally-cleanup is
        # safe even if saving fails partway through (previously the file
        # leaked when file.save() raised before temp_path was assigned).
        temp_path = None
        try:
            suffix = f".{file.filename.rsplit('.', 1)[1].lower()}"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                temp_path = tmp_file.name
                file.save(tmp_file.name)

            logger.info(f"Processing file: {file.filename}")

            # Transcribe with word-level timestamps
            result = model.transcribe(
                temp_path,
                language=language,
                task=task,
                word_timestamps=True,
                verbose=False
            )

            # Extract word-level data
            word_segments = []
            for segment in result.get("segments", []):
                if "words" in segment:
                    for word_data in segment["words"]:
                        word_segments.append({
                            "word": word_data.get("word", "").strip(),
                            "start": word_data.get("start", 0),
                            "end": word_data.get("end", 0),
                            "start_formatted": format_timestamp(word_data.get("start", 0)),
                            "end_formatted": format_timestamp(word_data.get("end", 0)),
                            "confidence": word_data.get("probability", 0)
                        })

            # Prepare response
            response_data = {
                "success": True,
                "filename": secure_filename(file.filename),
                "language": result.get("language", "unknown"),
                "task": task,
                # Duration is the end timestamp of the last segment.
                "duration": result.get("segments", [{}])[-1].get("end", 0) if result.get("segments") else 0,
                "text": result.get("text", ""),
                "word_count": len(word_segments),
                "segments": result.get("segments", []),
                "words": word_segments,
                "model_used": MODEL_SIZE,
                "processing_time": None  # You can add timing if needed
            }

            logger.info(f"Successfully transcribed {len(word_segments)} words from {file.filename}")
            return jsonify(response_data)

        except Exception as e:
            logger.error(f"Transcription error: {str(e)}")
            return jsonify({'error': f'Transcription failed: {str(e)}'}), 500
        finally:
            # Clean up temporary file (guard against save having failed early).
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)

    except Exception as e:
        logger.error(f"API error: {str(e)}")
        return jsonify({'error': f'Server error: {str(e)}'}), 500


@app.route('/models', methods=['GET'])
def available_models():
    """Get information about available Whisper models."""
    models_info = {
        "current_model": MODEL_SIZE if model_loaded else "none",
        "model_loaded": model_loaded,
        "available_models": {
            "tiny": {"size": "~39 MB", "speed": "~32x", "accuracy": "lowest"},
            "base": {"size": "~74 MB", "speed": "~16x", "accuracy": "low"},
            "small": {"size": "~244 MB", "speed": "~6x", "accuracy": "medium"},
            "medium": {"size": "~769 MB", "speed": "~2x", "accuracy": "high"},
            "large": {"size": "~1550 MB", "speed": "~1x", "accuracy": "highest"}
        },
        "supported_languages": [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
            "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
            "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
            "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
            "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
            "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
            "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
            "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
            "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
        ],
        "installation_help": {
            "error": "Whisper model not loaded" if not model_loaded else None,
            "install_command": "pip install openai-whisper torch torchaudio",
            "uninstall_conflicts": "pip uninstall whisper (if you have conflicting whisper package)"
        }
    }
    return jsonify(models_info)


@app.errorhandler(413)
def too_large(e):
    """Return JSON for oversized uploads (MAX_CONTENT_LENGTH exceeded)."""
    return jsonify({'error': 'File too large. Maximum size is 100MB'}), 413


@app.errorhandler(404)
def not_found(e):
    """Return JSON for unknown endpoints."""
    return jsonify({'error': 'Endpoint not found'}), 404


@app.errorhandler(500)
def internal_error(e):
    """Return JSON for unhandled server errors."""
    return jsonify({'error': 'Internal server error'}), 500


if __name__ == '__main__':
    if not model_loaded:
        print(f"""
⚠️  WHISPER MODEL LOADING FAILED ⚠️
===================================
The Whisper model could not be loaded. Please check:

1. Install the correct package:
   pip install openai-whisper torch torchaudio

2. If you have conflicts, uninstall the wrong whisper package:
   pip uninstall whisper
   pip install openai-whisper

3. Make sure you have sufficient disk space for the model

The server will start but transcription will not work until the model is loaded.
""")
    else:
        print(f"""
Whisper Transcription API Server
================================
Model: {MODEL_SIZE}
✅ Status: Ready

Endpoints:
- GET  /           : Health check
- POST /transcribe : Transcribe audio file
- GET  /models     : Available models info

Supported formats: {', '.join(ALLOWED_EXTENSIONS)}
Max file size: 100MB
""")

    # SECURITY: debug=False — the Werkzeug interactive debugger enables
    # arbitrary code execution and must never be exposed on 0.0.0.0.
    app.run(debug=False, host='0.0.0.0', port=7860)