Spaces:
Runtime error
Runtime error
File size: 4,452 Bytes
8842208 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
from fastapi import FastAPI, File, UploadFile, HTTPException
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import torch
import numpy as np
import tempfile
import os
from functools import lru_cache
# ASGI application object; the route decorators in this module register on it.
app = FastAPI(title="Speech Emotion Recognition API")

# Module-level slots for the loaded model artifacts. They start empty and are
# rebound by load_model(); /health inspects `model` to report load status.
model = feature_extractor = id2label = None
@lru_cache(maxsize=1)
def load_model():
    """Load the emotion classifier and its feature extractor exactly once.

    ``lru_cache(maxsize=1)`` makes every call after the first return the same
    tuple without reloading anything. The loaded objects are also published
    to the module-level ``model``, ``feature_extractor`` and ``id2label``
    names so other handlers can inspect them.

    Returns:
        A ``(model, feature_extractor, id2label)`` tuple, where ``id2label``
        is taken from ``model.config.id2label``.
    """
    global model, feature_extractor, id2label

    model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"

    # Force CPU usage for the free tier. This is the single source of truth
    # for placement: it is passed to ``device_map`` below.
    device = "cpu"
    torch.set_num_threads(2)  # keep intra-op threading small on the free CPU

    model = AutoModelForAudioClassification.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # full precision for CPU inference
        device_map=device,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_id,
        do_normalize=True,
    )
    id2label = model.config.id2label

    return model, feature_extractor, id2label
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    """Decode an audio file and convert it into fixed-length model inputs.

    The file is loaded at the extractor's sampling rate, reading at most
    ``max_duration`` seconds. The waveform is then forced to exactly
    ``sampling_rate * max_duration`` samples (cut if longer, zero-padded at
    the end if shorter) before being handed to the feature extractor.

    Args:
        audio_path: Path of the audio file to decode.
        feature_extractor: Object exposing ``sampling_rate`` and callable on
            a waveform.
        max_duration: Clip length in seconds.

    Returns:
        Whatever the feature extractor returns for ``return_tensors="pt"``.
    """
    target_sr = feature_extractor.sampling_rate

    # Cap the decode itself so long uploads are not fully read into memory.
    waveform, _ = librosa.load(audio_path, sr=target_sr, duration=max_duration)

    n_samples = int(target_sr * max_duration)
    shortfall = n_samples - len(waveform)
    if shortfall < 0:
        waveform = waveform[:n_samples]
    else:
        # A shortfall of zero pads nothing, leaving the clip unchanged.
        waveform = np.pad(waveform, (0, shortfall))

    return feature_extractor(
        waveform,
        sampling_rate=target_sr,
        max_length=n_samples,
        truncation=True,
        return_tensors="pt",
    )
@app.on_event("startup")
async def startup_event():
    """Warm the model cache when the application starts."""
    # The result is discarded on purpose: the call only primes the cached
    # loader so later load_model() calls return immediately.
    load_model()
@app.post("/predict-emotion")
async def predict_emotion(file: UploadFile = File(...)):
    """Predict the emotion expressed in an uploaded audio file.

    Args:
        file: Uploaded audio. Its filename must end in ``.wav``, ``.mp3``,
            ``.m4a`` or ``.flac`` (case-insensitive).

    Returns:
        A dict with ``predicted_emotion`` (the top label), ``confidence``
        (softmax probability of that label, rounded to 4 places) and
        ``all_emotions`` (label -> rounded probability for every class).

    Raises:
        HTTPException: 400 when the filename is missing or has an unsupported
            extension; 500 when anything fails while processing the audio.
    """
    allowed_suffixes = ('.wav', '.mp3', '.m4a', '.flac')

    # Validate before entering the broad try/except below; otherwise the 400
    # would be caught by ``except Exception`` and re-raised as a 500.
    # ``filename`` may be None for some clients, so fall back to "".
    filename = file.filename or ""
    if not filename.lower().endswith(allowed_suffixes):
        raise HTTPException(status_code=400, detail="Unsupported audio format")

    # Keep the upload's real extension on the temp file rather than always
    # labelling it ".wav", so the name matches the content being decoded.
    suffix = os.path.splitext(filename)[1].lower()

    try:
        tmp_file_path = None
        try:
            # delete=False because the file is reopened by path after this
            # ``with`` closes it; cleanup happens in the ``finally`` below.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(await file.read())

            # Load cached model
            model, feature_extractor, id2label = load_model()

            # Preprocess and predict
            inputs = preprocess_audio(tmp_file_path, feature_extractor)
            with torch.no_grad():
                logits = model(**inputs).logits

            predicted_id = torch.argmax(logits, dim=-1).item()
            # Row 0 is the single uploaded clip.
            probabilities = torch.softmax(logits, dim=-1)[0]

            return {
                "predicted_emotion": id2label[predicted_id],
                "confidence": round(probabilities[predicted_id].item(), 4),
                "all_emotions": {
                    label: round(probabilities[class_id].item(), 4)
                    for class_id, label in id2label.items()
                },
            }
        finally:
            # Clean up the temporary file even if the read/write or the
            # inference failed part-way through.
            if tmp_file_path is not None:
                os.unlink(tmp_file_path)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}") from e
@app.get("/health")
async def health_check():
    """Report service status and whether the module-level model is set."""
    is_loaded = model is not None
    return {"status": "healthy", "model_loaded": is_loaded}
@app.get("/")
async def root():
    """Describe the service: its name, model, emotion labels and routes."""
    emotion_labels = [
        "Angry",
        "Disgust",
        "Fearful",
        "Happy",
        "Neutral",
        "Sad",
        "Surprised",
    ]
    routes = {
        "predict": "/predict-emotion",
        "health": "/health",
    }
    return {
        "message": "Speech Emotion Recognition API",
        "model": "Whisper Large V3",
        "emotions": emotion_labels,
        "endpoints": routes,
    }
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 8000. The import is local so that merely importing this module
    # does not pull in uvicorn.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
|