import tempfile

import gradio as gr
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Load both sentiment models once at startup.
simple_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
llm_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
llm_model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment"
)
llm_pipe = pipeline("sentiment-analysis", model=llm_model, tokenizer=llm_tokenizer)


def transcribe_audio(audio_path):
    """Convert the uploaded audio to WAV and transcribe it with Google's free API."""
    recognizer = sr.Recognizer()
    sound = AudioSegment.from_file(audio_path)  # needs ffmpeg for non-WAV formats
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_tmp:
        sound.export(wav_tmp.name, format="wav")
        wav_path = wav_tmp.name
    with sr.AudioFile(wav_path) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except (sr.UnknownValueError, sr.RequestError):
        # Unintelligible audio or API failure: signal "no text" to the caller.
        return ""


def classify_sentiment(text, model_choice):
    if not text.strip():
        return "🤔 Could not understand audio.", None
    result = (
        simple_model(text)[0]
        if model_choice == "Simple Model"
        else llm_pipe(text)[0]
    )
    label = result["label"]
    # The DistilBERT model emits POSITIVE/NEGATIVE, while the Cardiff RoBERTa
    # model emits LABEL_0/LABEL_1/LABEL_2 (negative/neutral/positive), so map both.
    mood = {
        "POSITIVE": "😊 Happy / Positive",
        "NEGATIVE": "😞 Sad / Negative",
        "NEUTRAL": "😐 Neutral",
        "LABEL_0": "😞 Sad / Negative",
        "LABEL_1": "😐 Neutral",
        "LABEL_2": "😊 Happy / Positive",
    }.get(label.upper(), label)
    # Speak the predicted mood back to the user via gTTS (requires internet).
    tts = gTTS(text=mood)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tts.save(tmp.name)
    return mood, tmp.name


def predict(audio_file, model_choice):
    if audio_file is None:
        return "No input", None
    text = transcribe_audio(audio_file)
    return classify_sentiment(text, model_choice)


with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Mood from Voice (via Upload)")
    model_choice = gr.Dropdown(
        ["Simple Model", "LLM Model"], value="Simple Model", label="Choose Model"
    )
    # Gradio 4.x takes `sources=[...]`; older 3.x releases used `source="upload"`.
    audio_input = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Upload or Record Your Voice",
    )
    output_text = gr.Textbox(label="Predicted Mood")
    output_audio = gr.Audio(label="Spoken Mood")
    audio_input.change(
        predict,
        inputs=[audio_input, model_choice],
        outputs=[output_text, output_audio],
    )

demo.launch()
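
# Quick smoke test of the classification path without the UI — a minimal sketch,
# assuming you comment out demo.launch() above (or import classify_sentiment in a
# separate session). The example text and expected output are illustrative; gTTS
# and the Google speech API both need internet access.
#
#   mood, speech_path = classify_sentiment("I love this app!", "Simple Model")
#   print(mood)         # expected: "😊 Happy / Positive"
#   print(speech_path)  # path to an MP3 that announces the mood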