import gradio as gr
import torch
from transformers import AutoFeatureExtractor, AutoModel
import numpy as np
from sklearn.linear_model import LogisticRegression

# Load HeAR model and feature extractor.
# Both are created once at import time and reused as module-level globals by
# extract_embedding() for every request.
# NOTE(review): confirm that the MODEL_ID repository ships a
# transformers-compatible config and feature-extractor config; the Auto*
# loaders below raise at import time otherwise.
MODEL_ID = "google/hear"
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)

# Dummy classifier (replace with your trained classifier).
# For demonstration only: the "fitted" attributes are filled in by hand with
# random weights, so predictions are meaningless and change on every restart.
# In real use, train a classifier on HeAR embeddings using your labeled dataset.
clf = LogisticRegression()
clf.classes_ = np.array(["Normal", "Abnormal"])
# extract_embedding() averages last_hidden_state over dim 1, so each embedding
# has as many features as the model's hidden size. Read that width from the
# loaded model instead of hard-coding it, so the weight matrix cannot go out of
# sync with the model; fall back to the previous value (768) when the config
# does not expose hidden_size.
_EMBEDDING_DIM = int(
    getattr(getattr(model, "config", None), "hidden_size", None) or 768
)
clf.coef_ = np.random.randn(1, _EMBEDDING_DIM)
clf.intercept_ = np.random.randn(1)

def _prepare_waveform(y, sr, target_sr, clip_samples):
    """Return *y* as a mono float32 waveform of exactly *clip_samples* samples.

    Args:
        y: Raw samples, shaped (samples,) or (samples, channels).
        sr: Sampling rate of *y* in Hz.
        target_sr: Sampling rate the model expects, in Hz.
        clip_samples: Required output length, counted at *target_sr*.
    """
    y = np.asarray(y)
    if np.issubdtype(y.dtype, np.integer):
        # Integer PCM is rescaled to [-1, 1]; librosa's resampler only accepts
        # floating-point audio.
        info = np.iinfo(y.dtype)
        y = y.astype(np.float32) / max(abs(info.min), info.max)
    else:
        y = y.astype(np.float32)

    if y.ndim > 1:
        # Downmix (samples, channels) to mono *before* resampling and padding:
        # both operate on the last axis, which would otherwise be the channels.
        y = y.mean(axis=1)

    if sr != target_sr and y.size:
        import librosa
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    if len(y) > clip_samples:
        return y[:clip_samples]
    # Zero-pad short (or empty) recordings at the end up to the clip length.
    return np.pad(y, (0, clip_samples - len(y)))


def extract_embedding(audio):
    """Compute a HeAR embedding for one Gradio audio input.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by a Gradio audio
            component with ``type="numpy"``, or ``None`` when nothing was
            provided.

    Returns:
        ``None`` if *audio* is ``None``; otherwise a NumPy array holding the
        mean of the model's ``last_hidden_state`` over dim 1.
    """
    if audio is None:
        return None
    sr, y = audio
    # HeAR expects 2-second clips at 16kHz; pad/truncate as needed
    target_sr = 16000
    y = _prepare_waveform(y, sr, target_sr, target_sr * 2)
    inputs = feature_extractor(y, sampling_rate=target_sr, return_tensors="pt")
    with torch.no_grad():
        emb = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return emb

def predict(audio):
    """Classify an audio input and return a Markdown-formatted result.

    Returns a prompt asking for a file when no embedding could be extracted;
    otherwise the predicted class label and the highest class probability
    reported by the module-level classifier.
    """
    embedding = extract_embedding(audio)
    if embedding is not None:
        # Label and probabilities both come from the dummy classifier.
        label = clf.predict(embedding)[0]
        class_probabilities = clf.predict_proba(embedding)[0]
        confidence = max(class_probabilities)
        return f"Prediction: **{label}**\n\nConfidence: {confidence:.2%}"
    return "Please upload a heart or lung sound file."

# Markdown text passed to gr.Interface as its `description`.
# NOTE(review): the two trailing spaces at the end of some lines look like
# intentional Markdown hard line breaks — keep them when editing the text.
description = """
# Heart & Lung Sound Classifier (Demo)
Upload a heart or lung sound (WAV, MP3, etc.).  
This demo uses the [HeAR model](https://huggingface.co/google/hear) for health acoustic embeddings and a simple classifier for normal/abnormal prediction.  
**Note:** For best results, use 2-second clips. For real diagnosis, a classifier trained on labeled heart/lung sound data should be used.
"""

# Single-function UI: one audio input (file upload or microphone) routed to
# predict(), whose Markdown string is rendered as the only output.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="numpy",
        label="Upload Heart/Lung Sound",
    ),
    outputs=gr.Markdown(),
    title="Heart & Lung Sound Classifier",
    description=description,
    allow_flagging="never",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()