# Whisper-api / main.py
from flask_cors import CORS
from flask import Flask, request, jsonify
import tempfile
import os
from werkzeug.utils import secure_filename
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__)
CORS(app)
# Configuration
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024 # 100MB max file size
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'webm', 'flac'}
# Initialize model variable
model = None
MODEL_SIZE = "medium"  # Options: 'tiny', 'base', 'small', 'medium', 'large'; 'tiny' loads fastest on Hugging Face Spaces
# Set cache directory for Hugging Face Spaces
def setup_cache_directory():
"""Setup cache directory for Hugging Face Spaces"""
# Create a writable cache directory in the current working directory
cache_dir = os.path.join(os.getcwd(), ".whisper_cache")
os.makedirs(cache_dir, exist_ok=True)
# Set environment variables for Whisper cache
os.environ['XDG_CACHE_HOME'] = cache_dir
os.environ['WHISPER_CACHE'] = cache_dir
logger.info(f"Cache directory set to: {cache_dir}")
return cache_dir
def load_whisper_model():
"""Load Whisper model with proper error handling"""
global model
try:
# Setup cache directory first
cache_dir = setup_cache_directory()
# Try multiple import strategies for openai-whisper
whisper_module = None
# Strategy 1: Direct import (most common)
try:
import whisper as whisper_module
except ImportError:
pass
# Strategy 2: Try importing as openai_whisper
if whisper_module is None:
try:
import openai_whisper as whisper_module
except ImportError:
pass
# Strategy 3: Try importing with explicit path
if whisper_module is None:
try:
import sys
import importlib.util
# This is a fallback - usually not needed
import whisper as whisper_module
except ImportError:
pass
if whisper_module is None:
logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper")
return False
# Check if the module has the load_model function
if not hasattr(whisper_module, 'load_model'):
logger.error("Whisper module found but missing 'load_model' function")
logger.error("This suggests you have the wrong 'whisper' package installed")
logger.error("Solution:")
logger.error("1. pip uninstall whisper")
logger.error("2. pip uninstall openai-whisper (if exists)")
logger.error("3. pip install openai-whisper")
logger.error("4. pip install torch torchaudio")
return False
logger.info(f"Loading Whisper model: {MODEL_SIZE}")
logger.info(f"Using cache directory: {cache_dir}")
# Load model with explicit download root
try:
model = whisper_module.load_model(MODEL_SIZE, download_root=cache_dir)
except TypeError:
# Fallback if download_root parameter is not supported
model = whisper_module.load_model(MODEL_SIZE)
logger.info("Whisper model loaded successfully")
return True
except ImportError as e:
logger.error(f"Import error: {e}")
logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper torch torchaudio")
return False
except AttributeError as e:
logger.error(f"Whisper import error: {e}")
logger.error("Make sure you have the correct whisper package installed:")
logger.error("Solution:")
logger.error("1. pip uninstall whisper")
logger.error("2. pip install openai-whisper torch torchaudio")
return False
except PermissionError as e:
logger.error(f"Permission error: {e}")
logger.error("Cannot write to cache directory. This might be a Hugging Face Spaces limitation.")
logger.error("Trying alternative cache locations...")
# Try alternative cache locations
alternative_dirs = [
"/tmp/.whisper_cache",
os.path.expanduser("~/.whisper_cache"),
"./whisper_models"
]
for alt_dir in alternative_dirs:
try:
os.makedirs(alt_dir, exist_ok=True)
os.environ['XDG_CACHE_HOME'] = alt_dir
os.environ['WHISPER_CACHE'] = alt_dir
logger.info(f"Trying alternative cache: {alt_dir}")
import whisper
model = whisper.load_model(MODEL_SIZE, download_root=alt_dir)
logger.info(f"Successfully loaded model with cache: {alt_dir}")
return True
except Exception as alt_e:
logger.warning(f"Alternative cache {alt_dir} failed: {alt_e}")
continue
logger.error("All cache directory attempts failed")
return False
except Exception as e:
logger.error(f"Error loading Whisper model: {e}")
logger.error("This could be due to:")
logger.error("- Insufficient memory")
logger.error("- Missing PyTorch/CUDA dependencies")
logger.error("- Network issues downloading the model")
logger.error("- Hugging Face Spaces limitations")
return False
# Try to load the model at startup
model_loaded = load_whisper_model()
def allowed_file(filename):
"""Check if the file extension is allowed"""
return '.' in filename and \
filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
def format_timestamp(seconds):
"""Convert seconds to HH:MM:SS.mmm format"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = seconds % 60
return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
@app.route('/', methods=['GET'])
def health_check():
"""Health check endpoint"""
return jsonify({
"status": "healthy" if model_loaded else "unhealthy",
"message": "Whisper Transcription API is running" if model_loaded else "Whisper model failed to load",
"model": MODEL_SIZE if model_loaded else "none",
"model_loaded": model_loaded,
"timestamp": datetime.now().isoformat()
})
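# Example response body (illustrative; values depend on whether the model loaded):
#   {"status": "healthy", "message": "Whisper Transcription API is running",
#    "model": "medium", "model_loaded": true, "timestamp": "2025-01-01T12:00:00.000000"}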
@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
"""
Transcribe audio file and return word-level timestamps
Expected form data:
- audio_file: The audio file to transcribe
- language (optional): Language code (e.g., 'en', 'es', 'fr')
- task (optional): 'transcribe' or 'translate' (default: transcribe)
"""
    try:
        # Check if the model is loaded
        if not model_loaded or model is None:
            return jsonify({
                'error': 'Whisper model not loaded. Please check server logs and ensure openai-whisper is installed correctly.'
            }), 503

        # Check if an audio file is present
        if 'audio_file' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        file = request.files['audio_file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({
                'error': f'File type not allowed. Supported formats: {", ".join(ALLOWED_EXTENSIONS)}'
            }), 400

        # Get optional parameters
        language = request.form.get('language', None)
        task = request.form.get('task', 'transcribe')
        if task not in ['transcribe', 'translate']:
            return jsonify({'error': 'Task must be either "transcribe" or "translate"'}), 400

        # Save the uploaded file temporarily, keeping its original extension
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file.filename.rsplit('.', 1)[1].lower()}") as tmp_file:
            file.save(tmp_file.name)
            temp_path = tmp_file.name

        logger.info(f"Processing file: {file.filename}")

        try:
            # Transcribe with word-level timestamps
            result = model.transcribe(
                temp_path,
                language=language,
                task=task,
                word_timestamps=True,
                verbose=False
            )

            # Extract word-level data
            word_segments = []
            for segment in result.get("segments", []):
                if "words" in segment:
                    for word_data in segment["words"]:
                        word_segments.append({
                            "word": word_data.get("word", "").strip(),
                            "start": word_data.get("start", 0),
                            "end": word_data.get("end", 0),
                            "start_formatted": format_timestamp(word_data.get("start", 0)),
                            "end_formatted": format_timestamp(word_data.get("end", 0)),
                            "confidence": word_data.get("probability", 0)
                        })
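            # Shape of each entry (illustrative, based on openai-whisper's
            # word_timestamps output; the values are made-up examples):
            #   {"word": "hello", "start": 0.0, "end": 0.42,
            #    "start_formatted": "00:00:00.000", "end_formatted": "00:00:00.420",
            #    "confidence": 0.91}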
            # Prepare the response
            response_data = {
                "success": True,
                "filename": secure_filename(file.filename),
                "language": result.get("language", "unknown"),
                "task": task,
                "duration": result.get("segments", [{}])[-1].get("end", 0) if result.get("segments") else 0,
                "text": result.get("text", ""),
                "word_count": len(word_segments),
                "segments": result.get("segments", []),
                "words": word_segments,
                "model_used": MODEL_SIZE,
                "processing_time": None  # Timing could be added here if needed
            }

            logger.info(f"Successfully transcribed {len(word_segments)} words from {file.filename}")
            return jsonify(response_data)

        except Exception as e:
            logger.error(f"Transcription error: {str(e)}")
            return jsonify({'error': f'Transcription failed: {str(e)}'}), 500

        finally:
            # Clean up the temporary file
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    except Exception as e:
        logger.error(f"API error: {str(e)}")
        return jsonify({'error': f'Server error: {str(e)}'}), 500
@app.route('/models', methods=['GET'])
def available_models():
"""Get information about available Whisper models"""
models_info = {
"current_model": MODEL_SIZE if model_loaded else "none",
"model_loaded": model_loaded,
"available_models": {
"tiny": {"size": "~39 MB", "speed": "~32x", "accuracy": "lowest"},
"base": {"size": "~74 MB", "speed": "~16x", "accuracy": "low"},
"small": {"size": "~244 MB", "speed": "~6x", "accuracy": "medium"},
"medium": {"size": "~769 MB", "speed": "~2x", "accuracy": "high"},
"large": {"size": "~1550 MB", "speed": "~1x", "accuracy": "highest"}
},
"supported_languages": [
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl",
"ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro",
"da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy",
"sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu",
"is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km",
"sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo",
"uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg",
"as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
],
"installation_help": {
"error": "Whisper model not loaded" if not model_loaded else None,
"install_command": "pip install openai-whisper torch torchaudio",
"uninstall_conflicts": "pip uninstall whisper (if you have conflicting whisper package)"
}
}
return jsonify(models_info)
@app.errorhandler(413)
def too_large(e):
    return jsonify({'error': 'File too large. Maximum size is 100MB'}), 413
@app.errorhandler(404)
def not_found(e):
    return jsonify({'error': 'Endpoint not found'}), 404
@app.errorhandler(500)
def internal_error(e):
    return jsonify({'error': 'Internal server error'}), 500
if __name__ == '__main__':
    if not model_loaded:
        print("""
⚠️  WHISPER MODEL LOADING FAILED ⚠️
===================================
The Whisper model could not be loaded. Please check:

1. Install the correct package:
   pip install openai-whisper torch torchaudio

2. If you have conflicts, uninstall the wrong whisper package:
   pip uninstall whisper
   pip install openai-whisper

3. Make sure you have sufficient disk space for the model.

The server will start, but transcription will not work until the model is loaded.
""")
    else:
        print(f"""
Whisper Transcription API Server
================================
Model: {MODEL_SIZE}
Status: Ready

Endpoints:
- GET  /           : Health check
- POST /transcribe : Transcribe audio file
- GET  /models     : Available models info

Supported formats: {', '.join(ALLOWED_EXTENSIONS)}
Max file size: 100MB
""")
    app.run(debug=True, host='0.0.0.0', port=7860)
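# Minimal client sketch (assumptions: server reachable at http://localhost:7860,
# "sample.wav" is a placeholder file name, and the requests library is installed):
#
#   import requests
#
#   with open("sample.wav", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/transcribe",
#           files={"audio_file": f},
#           data={"language": "en", "task": "transcribe"},
#       )
#   resp.raise_for_status()
#   print(resp.json()["text"])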