# Whisper Transcription API — Flask app intended for deployment on Hugging Face Spaces.
from flask_cors import CORS | |
from flask import Flask, request, jsonify | |
import tempfile | |
import os | |
from werkzeug.utils import secure_filename | |
import logging | |
from datetime import datetime | |
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger shared by all handlers

app = Flask(__name__)
CORS(app)  # allow cross-origin requests from browser front-ends

# Configuration
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100MB max file size
# Upload extensions accepted by allowed_file()
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'webm', 'flac'}

# Initialize model variable (set by load_whisper_model at startup)
model = None
MODEL_SIZE = "medium"  # Use 'tiny' for faster loading on Hugging Face Spaces, change to 'base', 'small', 'medium', or 'large' as needed
# Set cache directory for Hugging Face Spaces | |
def setup_cache_directory():
    """Create and register a writable cache directory for Whisper.

    Hugging Face Spaces' default home directory is often not writable, so
    the cache lives under the current working directory and is exported
    through the environment variables Whisper consults.

    Returns:
        str: Absolute path of the cache directory.
    """
    cache_dir = os.path.join(os.getcwd(), ".whisper_cache")
    os.makedirs(cache_dir, exist_ok=True)
    # Point every cache lookup Whisper performs at the writable directory.
    for env_var in ('XDG_CACHE_HOME', 'WHISPER_CACHE'):
        os.environ[env_var] = cache_dir
    logger.info(f"Cache directory set to: {cache_dir}")
    return cache_dir
def load_whisper_model():
    """Load the OpenAI Whisper model into the module-global ``model``.

    Tries both known import names for the openai-whisper package, detects
    the unrelated ``whisper`` PyPI package (which lacks ``load_model``),
    and retries alternative cache directories when the default one is not
    writable — a common Hugging Face Spaces limitation.

    Fix over the original: the old "Strategy 3" was a byte-identical
    duplicate of Strategy 1 (``import whisper``) plus unused ``sys`` /
    ``importlib.util`` imports; that dead code has been removed.

    Returns:
        bool: True when the model is ready for transcription, False otherwise.
    """
    global model
    try:
        # Setup cache directory first — Whisper downloads its weights there.
        cache_dir = setup_cache_directory()

        # Try both plausible import names for openai-whisper.
        whisper_module = None
        try:
            import whisper as whisper_module
        except ImportError:
            pass
        if whisper_module is None:
            try:
                import openai_whisper as whisper_module
            except ImportError:
                pass
        if whisper_module is None:
            logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper")
            return False

        # The unrelated `whisper` package on PyPI has no load_model; fail
        # with an actionable message instead of an AttributeError later.
        if not hasattr(whisper_module, 'load_model'):
            logger.error("Whisper module found but missing 'load_model' function")
            logger.error("This suggests you have the wrong 'whisper' package installed")
            logger.error("Solution:")
            logger.error("1. pip uninstall whisper")
            logger.error("2. pip uninstall openai-whisper (if exists)")
            logger.error("3. pip install openai-whisper")
            logger.error("4. pip install torch torchaudio")
            return False

        logger.info(f"Loading Whisper model: {MODEL_SIZE}")
        logger.info(f"Using cache directory: {cache_dir}")
        # Older openai-whisper releases do not accept download_root,
        # hence the TypeError fallback.
        try:
            model = whisper_module.load_model(MODEL_SIZE, download_root=cache_dir)
        except TypeError:
            model = whisper_module.load_model(MODEL_SIZE)
        logger.info("Whisper model loaded successfully")
        return True
    except ImportError as e:
        logger.error(f"Import error: {e}")
        logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper torch torchaudio")
        return False
    except AttributeError as e:
        logger.error(f"Whisper import error: {e}")
        logger.error("Make sure you have the correct whisper package installed:")
        logger.error("Solution:")
        logger.error("1. pip uninstall whisper")
        logger.error("2. pip install openai-whisper torch torchaudio")
        return False
    except PermissionError as e:
        logger.error(f"Permission error: {e}")
        logger.error("Cannot write to cache directory. This might be a Hugging Face Spaces limitation.")
        logger.error("Trying alternative cache locations...")
        # Retry with cache locations that are usually writable on Spaces.
        alternative_dirs = [
            "/tmp/.whisper_cache",
            os.path.expanduser("~/.whisper_cache"),
            "./whisper_models"
        ]
        for alt_dir in alternative_dirs:
            try:
                os.makedirs(alt_dir, exist_ok=True)
                os.environ['XDG_CACHE_HOME'] = alt_dir
                os.environ['WHISPER_CACHE'] = alt_dir
                logger.info(f"Trying alternative cache: {alt_dir}")
                import whisper
                model = whisper.load_model(MODEL_SIZE, download_root=alt_dir)
                logger.info(f"Successfully loaded model with cache: {alt_dir}")
                return True
            except Exception as alt_e:
                logger.warning(f"Alternative cache {alt_dir} failed: {alt_e}")
                continue
        logger.error("All cache directory attempts failed")
        return False
    except Exception as e:
        logger.error(f"Error loading Whisper model: {e}")
        logger.error("This could be due to:")
        logger.error("- Insufficient memory")
        logger.error("- Missing PyTorch/CUDA dependencies")
        logger.error("- Network issues downloading the model")
        logger.error("- Hugging Face Spaces limitations")
        return False
# Try to load the model at startup; endpoints consult this flag before use.
model_loaded = load_whisper_model()
def allowed_file(filename):
    """Return True if *filename* has an extension in ALLOWED_EXTENSIONS."""
    _stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def format_timestamp(seconds):
    """Convert a duration in seconds to ``HH:MM:SS.mmm`` format.

    Fix over the original: rounding is applied to whole milliseconds
    *before* splitting into fields, so a value like 59.9996 renders as
    "00:01:00.000" instead of the invalid "00:00:60.000" the old
    ``f"{secs:06.3f}"`` formatting could produce.

    Args:
        seconds: Non-negative duration (int or float).

    Returns:
        str: Zero-padded timestamp string, e.g. "01:01:01.500".
    """
    total_ms = round(seconds * 1000)
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs = remainder_ms / 1000
    return f"{int(hours):02d}:{int(minutes):02d}:{secs:06.3f}"
@app.route('/', methods=['GET'])
def health_check():
    """Health check endpoint (GET /).

    Fix over the original: the function was never registered with the
    Flask app even though the startup banner advertises "GET /"; the
    ``@app.route`` decorator restores that registration.

    Returns:
        JSON describing whether the Whisper model loaded at startup.
    """
    return jsonify({
        "status": "healthy" if model_loaded else "unhealthy",
        "message": "Whisper Transcription API is running" if model_loaded else "Whisper model failed to load",
        "model": MODEL_SIZE if model_loaded else "none",
        "model_loaded": model_loaded,
        "timestamp": datetime.now().isoformat()
    })
@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """
    Transcribe audio file and return word-level timestamps

    Fix over the original: the handler was never registered with the Flask
    app although the startup banner advertises "POST /transcribe"; the
    ``@app.route`` decorator restores that registration.

    Expected form data:
    - audio_file: The audio file to transcribe
    - language (optional): Language code (e.g., 'en', 'es', 'fr')
    - task (optional): 'transcribe' or 'translate' (default: transcribe)
    """
    try:
        # Check if model is loaded
        if not model_loaded or model is None:
            return jsonify({
                'error': 'Whisper model not loaded. Please check server logs and ensure openai-whisper is installed correctly.'
            }), 503
        # Check if audio file is present
        if 'audio_file' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400
        file = request.files['audio_file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400
        if not allowed_file(file.filename):
            return jsonify({
                'error': f'File type not allowed. Supported formats: {", ".join(ALLOWED_EXTENSIONS)}'
            }), 400
        # Get optional parameters
        language = request.form.get('language', None)
        task = request.form.get('task', 'transcribe')
        if task not in ['transcribe', 'translate']:
            return jsonify({'error': 'Task must be either "transcribe" or "translate"'}), 400
        # Save the upload to a temp file; keeping the real extension lets
        # ffmpeg/whisper sniff the container format correctly.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file.filename.rsplit('.', 1)[1].lower()}") as tmp_file:
            file.save(tmp_file.name)
            temp_path = tmp_file.name
        logger.info(f"Processing file: {file.filename}")
        try:
            # Transcribe with word-level timestamps
            result = model.transcribe(
                temp_path,
                language=language,
                task=task,
                word_timestamps=True,
                verbose=False
            )
            # Flatten per-segment word lists into one response array.
            word_segments = []
            for segment in result.get("segments", []):
                if "words" in segment:
                    for word_data in segment["words"]:
                        word_segments.append({
                            "word": word_data.get("word", "").strip(),
                            "start": word_data.get("start", 0),
                            "end": word_data.get("end", 0),
                            "start_formatted": format_timestamp(word_data.get("start", 0)),
                            "end_formatted": format_timestamp(word_data.get("end", 0)),
                            "confidence": word_data.get("probability", 0)
                        })
            # Prepare response; duration comes from the last segment's end.
            response_data = {
                "success": True,
                "filename": secure_filename(file.filename),
                "language": result.get("language", "unknown"),
                "task": task,
                "duration": result.get("segments", [{}])[-1].get("end", 0) if result.get("segments") else 0,
                "text": result.get("text", ""),
                "word_count": len(word_segments),
                "segments": result.get("segments", []),
                "words": word_segments,
                "model_used": MODEL_SIZE,
                "processing_time": None  # You can add timing if needed
            }
            logger.info(f"Successfully transcribed {len(word_segments)} words from {file.filename}")
            return jsonify(response_data)
        except Exception as e:
            logger.error(f"Transcription error: {str(e)}")
            return jsonify({'error': f'Transcription failed: {str(e)}'}), 500
        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except Exception as e:
        logger.error(f"API error: {str(e)}")
        return jsonify({'error': f'Server error: {str(e)}'}), 500
@app.route('/models', methods=['GET'])
def available_models():
    """Get information about available Whisper models (GET /models).

    Fix over the original: the handler was never registered with the Flask
    app although the startup banner advertises "GET /models"; the
    ``@app.route`` decorator restores that registration.

    Returns:
        JSON with the current model, all model sizes, supported languages,
        and installation troubleshooting hints.
    """
    models_info = {
        "current_model": MODEL_SIZE if model_loaded else "none",
        "model_loaded": model_loaded,
        "available_models": {
            "tiny": {"size": "~39 MB", "speed": "~32x", "accuracy": "lowest"},
            "base": {"size": "~74 MB", "speed": "~16x", "accuracy": "low"},
            "small": {"size": "~244 MB", "speed": "~6x", "accuracy": "medium"},
            "medium": {"size": "~769 MB", "speed": "~2x", "accuracy": "high"},
            "large": {"size": "~1550 MB", "speed": "~1x", "accuracy": "highest"}
        },
        "supported_languages": [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl",
            "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro",
            "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy",
            "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu",
            "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km",
            "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo",
            "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg",
            "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
        ],
        "installation_help": {
            "error": "Whisper model not loaded" if not model_loaded else None,
            "install_command": "pip install openai-whisper torch torchaudio",
            "uninstall_conflicts": "pip uninstall whisper (if you have conflicting whisper package)"
        }
    }
    return jsonify(models_info)
@app.errorhandler(413)
def too_large(e):
    """Return JSON (not Flask's HTML page) when an upload exceeds
    MAX_CONTENT_LENGTH (100MB). Fix: the original was never registered
    via ``@app.errorhandler(413)``, so Flask never invoked it."""
    return jsonify({'error': 'File too large. Maximum size is 100MB'}), 413
@app.errorhandler(404)
def not_found(e):
    """Return JSON (not Flask's HTML page) for unknown routes. Fix: the
    original was never registered via ``@app.errorhandler(404)``."""
    return jsonify({'error': 'Endpoint not found'}), 404
@app.errorhandler(500)
def internal_error(e):
    """Return JSON (not Flask's HTML page) for unhandled server errors.
    Fix: the original was never registered via ``@app.errorhandler(500)``."""
    return jsonify({'error': 'Internal server error'}), 500
if __name__ == '__main__':
    if not model_loaded:
        # Server still starts without a model so the health endpoint can
        # report the failure instead of the Space appearing dead.
        print("""
⚠️ WHISPER MODEL LOADING FAILED ⚠️
===================================
The Whisper model could not be loaded. Please check:
1. Install the correct package:
pip install openai-whisper torch torchaudio
2. If you have conflicts, uninstall the wrong whisper package:
pip uninstall whisper
pip install openai-whisper
3. Make sure you have sufficient disk space for the model
The server will start but transcription will not work until the model is loaded.
""")
    else:
        print(f"""
Whisper Transcription API Server
================================
Model: {MODEL_SIZE} ✅
Status: Ready
Endpoints:
- GET / : Health check
- POST /transcribe : Transcribe audio file
- GET /models : Available models info
Supported formats: {', '.join(ALLOWED_EXTENSIONS)}
Max file size: 100MB
""")
    # Fix: debug=True exposes the Werkzeug interactive debugger (arbitrary
    # code execution) on a publicly reachable host — never enable it on a
    # deployed Space. Port 7860 is the Hugging Face Spaces default.
    app.run(debug=False, host='0.0.0.0', port=7860)