MaroofTechSorcerer committed on
Commit 58b0884 · verified · 1 Parent(s): f9389b4

Update app.py

Files changed (1)
  1. app.py +458 -219
app.py CHANGED
@@ -1,18 +1,19 @@
  import os
  import streamlit as st
  import tempfile
  import torch
  import transformers
- from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
  import plotly.express as px
  import logging
  import warnings
  import whisper
  from pydub import AudioSegment
  import time
- import numpy as np
- import librosa
- import subprocess

  # Suppress warnings for a clean console
  logging.getLogger("torch").setLevel(logging.CRITICAL)
@@ -25,123 +26,100 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")

  # Set Streamlit app layout
- st.set_page_config(layout="wide", page_title="Advanced Voice Emotion Analyzer")

  # Interface design
- st.title("🎙️ Advanced Voice Emotion Analyzer")
- st.write("Analyze all 27 emotions from uploaded audio with enhanced detection to avoid neutral defaults.")
-
- # Audio Preprocessing
- def make_audio_scarier(audio_path, output_path):
-     try:
-         # Step 1: Adjust pitch (slower rate for scarier effect)
-         cmd1 = f"ffmpeg -i {audio_path} -af 'asetrate=44100*0.8,aresample=44100' temp1.wav"
-         subprocess.run(cmd1, shell=True, check=True, stderr=subprocess.PIPE, text=True)
-
-         # Step 2: Apply reverb with adjusted parameters
-         cmd2 = f"ffmpeg -i temp1.wav -af 'reverb=0.4:0.7:0.5:0.5:0.5:0.02' temp2.wav"
-         subprocess.run(cmd2, shell=True, check=True, stderr=subprocess.PIPE, text=True)
-
-         # Step 3: Adjust tempo
-         cmd3 = f"ffmpeg -i temp2.wav -af 'atempo=1.2' {output_path}"
-         subprocess.run(cmd3, shell=True, check=True, stderr=subprocess.PIPE, text=True)
-
-         # Clean up temporary files
-         for temp_file in ["temp1.wav", "temp2.wav"]:
-             if os.path.exists(temp_file):
-                 os.remove(temp_file)
-     except subprocess.CalledProcessError as e:
-         st.error(f"Audio processing failed: {str(e)} - Command: {e.cmd}, Output: {e.stderr}")
-         raise
-     except Exception as e:
-         st.error(f"Audio processing failed: {str(e)}")
-         raise
-
- # Audio Feature Extraction
- def extract_audio_features(audio_path):
-     try:
-         y, sr = librosa.load(audio_path, sr=16000)
-         pitch_mean = np.mean(librosa.piptrack(y=y, sr=sr)[0][librosa.piptrack(y=y, sr=sr)[0] > 0]) if np.any(librosa.piptrack(y=y, sr=sr)[0] > 0) else 0
-         energy_mean = np.mean(librosa.feature.rms(y=y))
-         zcr_mean = np.mean(librosa.feature.zero_crossing_rate(y))
-         return {"pitch_mean": pitch_mean, "energy_mean": energy_mean, "zcr_mean": zcr_mean}
-     except Exception as e:
-         st.error(f"Audio feature extraction failed: {str(e)}")
-         return {}
-
- # Audio Emotion Classification with Wav2Vec2
- @st.cache_resource
- def get_audio_emotion_classifier():
-     processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-er")
-     model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
-     model = model.to(device)
-     return processor, model
-
- def perform_audio_emotion_detection(audio_path):
-     try:
-         processor, model = get_audio_emotion_classifier()
-         waveform, sample_rate = librosa.load(audio_path, sr=16000)
-         inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
-         inputs = {k: v.to(device) for k, v in inputs.items()}
-         with torch.no_grad():
-             logits = model(**inputs).logits
-         scores = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]
-         audio_emotions = ["neutral", "happy", "sad", "angry", "fearful", "surprise", "disgust"]
-         emotion_dict = {emotion: float(scores[i]) for i, emotion in enumerate(audio_emotions)}
-         top_emotion = audio_emotions[np.argmax(scores)]
-         # Enhanced boosting based on audio features
-         features = extract_audio_features(audio_path)
-         if features.get("pitch_mean", 0) < 200 and features.get("energy_mean", 0) > 0.1 and features.get("zcr_mean", 0) > 0.1:
-             emotion_dict["fearful"] = min(1.0, emotion_dict.get("fearful", 0) + 0.4)
-             top_emotion = "fearful" if emotion_dict["fearful"] > emotion_dict[top_emotion] else top_emotion
-         elif features.get("energy_mean", 0) > 0.25:
-             emotion_dict["angry"] = min(1.0, emotion_dict.get("angry", 0) + 0.35)
-             top_emotion = "angry" if emotion_dict["angry"] > emotion_dict[top_emotion] else top_emotion
-         elif features.get("pitch_mean", 0) > 500 and features.get("energy_mean", 0) < 0.05:
-             emotion_dict["sad"] = min(1.0, emotion_dict.get("sad", 0) + 0.3)
-             top_emotion = "sad" if emotion_dict["sad"] > emotion_dict[top_emotion] else top_emotion
-         elif features.get("energy_mean", 0) > 0.15 and features.get("pitch_mean", 0) > 300:
-             emotion_dict["happy"] = min(1.0, emotion_dict.get("happy", 0) + 0.3)
-             top_emotion = "happy" if emotion_dict["happy"] > emotion_dict[top_emotion] else top_emotion
-         elif features.get("zcr_mean", 0) > 0.15 and features.get("energy_mean", 0) > 0.1:
-             emotion_dict["surprise"] = min(1.0, emotion_dict.get("surprise", 0) + 0.25)
-             top_emotion = "surprise" if emotion_dict["surprise"] > emotion_dict[top_emotion] else top_emotion
-         # Fallback to avoid neutral if score is low
-         if emotion_dict["neutral"] > 0.5 and max([v for k, v in emotion_dict.items() if k != "neutral"]) > 0.3:
-             emotion_dict["neutral"] = max(0.0, emotion_dict["neutral"] - 0.2)
-             top_emotion = max(emotion_dict, key=emotion_dict.get)
-         return emotion_dict, top_emotion
-     except Exception as e:
-         st.error(f"Audio emotion detection failed: {str(e)}")
-         return {}, "unknown"

- # Text Emotion Classification with RoBERTa
  @st.cache_resource
- def get_text_emotion_classifier():
      tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", use_fast=True)
      model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")
      model = model.to(device)
      return pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None, device=-1 if device.type == "cpu" else 0)

- def perform_text_emotion_detection(text):
      try:
-         classifier = get_text_emotion_classifier()
-         results = classifier(text)[0]
-         emotions = ["admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
-                     "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
-                     "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
-                     "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"]
-         emotions_dict = {result['label']: result['score'] for result in results if result['label'] in emotions}
-         top_emotion = max(emotions_dict, key=emotions_dict.get)
-         if emotions_dict.get("neutral", 0) > 0.5 and max([v for k, v in emotions_dict.items() if k != "neutral"]) > 0.4:
-             emotions_dict["neutral"] = max(0.0, emotions_dict["neutral"] - 0.15)
-             top_emotion = max(emotions_dict, key=emotions_dict.get)
-         return emotions_dict, top_emotion
      except Exception as e:
-         st.error(f"Text emotion detection failed: {str(e)}")
-         return {}, "unknown"

- # Sarcasm Detection
  @st.cache_resource
  def get_sarcasm_classifier():
      tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-irony", use_fast=True)
@@ -151,8 +129,11 @@ def get_sarcasm_classifier():

  def perform_sarcasm_detection(text):
      try:
-         classifier = get_sarcasm_classifier()
-         result = classifier(text)[0]
          is_sarcastic = result['label'] == "LABEL_1"
          sarcasm_score = result['score'] if is_sarcastic else 1 - result['score']
          return is_sarcastic, sarcasm_score
@@ -160,157 +141,415 @@ def perform_sarcasm_detection(text):
          st.error(f"Sarcasm detection failed: {str(e)}")
          return False, 0.0

- # Validate Audio
  def validate_audio(audio_path):
      try:
          sound = AudioSegment.from_file(audio_path)
-         if sound.dBFS < -50 or len(sound) < 1000:
-             st.warning("Audio volume too low or too short. Please use a louder, longer audio.")
              return False
          return True
-     except Exception:
          st.error("Invalid or corrupted audio file.")
          return False

  # Speech Recognition with Whisper
  @st.cache_resource
  def load_whisper_model():
-     return whisper.load_model("large-v3")

- def transcribe_audio(audio_path):
      try:
          sound = AudioSegment.from_file(audio_path)
          temp_wav_path = os.path.join(tempfile.gettempdir(), "temp_converted.wav")
-         sound = sound.set_frame_rate(16000).set_channels(1)
          sound.export(temp_wav_path, format="wav")
          model = load_whisper_model()
          result = model.transcribe(temp_wav_path, language="en")
-         os.remove(temp_wav_path)
-         return result["text"].strip()
      except Exception as e:
          st.error(f"Transcription failed: {str(e)}")
-         return ""

- # Process Audio Files
- def process_audio_file(audio_data):
-     temp_dir = tempfile.gettempdir()
-     temp_file_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")
-     with open(temp_file_path, "wb") as f:
-         f.write(audio_data.getvalue())
-     if not validate_audio(temp_file_path):
          return None
-     return temp_file_path
-
- # Display Results
- def display_analysis_results(audio_path):
-     st.header("Audio Analysis")
-     st.audio(audio_path)
-
-     # Preprocess audio
-     processed_audio_path = os.path.join(tempfile.gettempdir(), f"processed_{int(time.time())}.wav")
-     make_audio_scarier(audio_path, processed_audio_path)
-
-     # Audio emotion detection
-     audio_emotions, audio_top_emotion = perform_audio_emotion_detection(processed_audio_path)
-     st.subheader("Audio-Based Emotion")
-     st.write(f"**Dominant Emotion:** {audio_top_emotion} (Score: {audio_emotions.get(audio_top_emotion, 0):.3f})")
-     st.write("Audio Emotions:", audio_emotions)  # Debug output

-     # Transcription and text emotion detection
-     transcribed_text = transcribe_audio(processed_audio_path)
-     st.subheader("Transcribed Text")
-     st.text_area("Text", transcribed_text, height=100, disabled=True)
-     if transcribed_text:
-         text_emotions, text_top_emotion = perform_text_emotion_detection(transcribed_text)
-         st.write(f"**Text-Based Dominant Emotion:** {text_top_emotion} (Score: {text_emotions.get(text_top_emotion, 0):.3f})")
-         st.write("Text Emotions:", text_emotions)  # Debug output

-     # Combine emotions (prioritize audio, map to 27 emotions)
-     emotion_map = {
-         "neutral": "neutral", "happy": "joy", "sad": "sadness", "angry": "anger",
-         "fearful": "fear", "surprise": "surprise", "disgust": "disgust"
      }
-     combined_emotions = {emotion: 0 for emotion in ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
-                                                     "confusion", "curiosity", "desire", "disappointment", "disapproval",
-                                                     "disgust", "embarrassment", "excitement", "fear", "gratitude",
-                                                     "grief", "joy", "love", "nervousness", "optimism", "pride",
-                                                     "realization", "relief", "remorse", "sadness", "surprise", "neutral"]}
-     for audio_emotion, score in audio_emotions.items():
-         mapped_emotion = emotion_map.get(audio_emotion, "neutral")
-         combined_emotions[mapped_emotion] = max(combined_emotions[mapped_emotion], score * 0.7)
-     if transcribed_text:
-         for text_emotion, score in text_emotions.items():
-             combined_emotions[text_emotion] = combined_emotions.get(text_emotion, 0) + score * 0.3

-     # Avoid neutral if other emotions are competitive
-     top_emotion = max(combined_emotions, key=combined_emotions.get)
-     if combined_emotions["neutral"] > 0.5 and max([v for k, v in combined_emotions.items() if k != "neutral"]) > 0.4:
-         combined_emotions["neutral"] = max(0.0, combined_emotions["neutral"] - 0.25)
-         top_emotion = max(combined_emotions, key=combined_emotions.get)

-     sentiment = "POSITIVE" if top_emotion in ["admiration", "amusement", "approval", "caring", "desire", "excitement",
-                                               "gratitude", "joy", "love", "optimism", "pride", "relief"] else "NEGATIVE" if top_emotion in ["anger", "annoyance", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"] else "NEUTRAL"

-     # Sarcasm detection
-     is_sarcastic, sarcasm_score = perform_sarcasm_detection(transcribed_text) if transcribed_text else (False, 0.0)

-     # Display results
      col1, col2 = st.columns([1, 2])
      with col1:
          st.subheader("Sentiment")
          sentiment_icon = "👍" if sentiment == "POSITIVE" else "👎" if sentiment == "NEGATIVE" else "😐"
-         st.markdown(f"**{sentiment_icon} {sentiment.capitalize()}** (Based on {top_emotion})")
          st.subheader("Sarcasm")
          sarcasm_icon = "😏" if is_sarcastic else "😐"
-         st.markdown(f"**{sarcasm_icon} {'Detected' if is_sarcastic else 'Not Detected'}** (Score: {sarcasm_score:.3f})")

      with col2:
-         st.subheader("Emotion Distribution")
-         sorted_emotions = sorted(combined_emotions.items(), key=lambda x: x[1], reverse=True)[:10]
-         emotions, scores = zip(*sorted_emotions)
-         fig = px.bar(x=list(emotions), y=list(scores), labels={'x': 'Emotion', 'y': 'Score'},
-                      title="Top Emotion Scores", color=list(emotions),
-                      color_discrete_sequence=px.colors.qualitative.Bold)
-         fig.update_layout(yaxis_range=[0, 1], showlegend=False, title_font_size=14)
-         st.plotly_chart(fig, use_container_width=True)

-     with st.expander("Details"):
-         st.write(f"**Audio Features:** {extract_audio_features(processed_audio_path)}")
          st.write("""
-         **How it works:**
-         - Audio Emotion: Wav2Vec2 detects 7 emotions with feature-based boosts.
-         - Transcription: Whisper converts audio to text.
-         - Text Emotion: RoBERTa refines 27 emotions from text.
-         - Sarcasm: Analyzes text for irony.
-         **Accuracy depends on:** Audio quality, clarity, and noise.
          """)

-     # Clean up
-     for path in [audio_path, processed_audio_path]:
-         if os.path.exists(path):
-             os.remove(path)

  # Main App Logic
  def main():
-     st.header("Upload Audio File")
-     audio_file = st.file_uploader("Upload audio (wav, mp3, ogg)", type=["wav", "mp3", "ogg"])
-     if audio_file:
-         temp_audio_path = process_audio_file(audio_file)
-         if temp_audio_path:
-             if st.button("Analyze Audio"):
-                 with st.spinner("Analyzing..."):
-                     display_analysis_results(temp_audio_path)

-     st.sidebar.header("About")
-     st.sidebar.write("""
-     **Models Used:**
-     - Audio: superb/wav2vec2-base-superb-er (7 emotions)
-     - Text: SamLowe/roberta-base-go_emotions (27 emotions)
-     - Sarcasm: cardiffnlp/twitter-roberta-base-irony
-     - Speech: OpenAI Whisper (large-v3)
-     **Note:** Recording is not supported on Hugging Face Spaces; use uploaded files.
-     """)

- if __name__ == "__main__":
-     main()
-
+
  import os
  import streamlit as st
  import tempfile
  import torch
  import transformers
+ from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
  import plotly.express as px
  import logging
  import warnings
  import whisper
  from pydub import AudioSegment
  import time
+ import base64
+ import io
+ import streamlit.components.v1 as components

  # Suppress warnings for a clean console
  logging.getLogger("torch").setLevel(logging.CRITICAL)

  print(f"Using device: {device}")

  # Set Streamlit app layout
+ st.set_page_config(layout="wide", page_title="Voice Based Sentiment Analysis")

  # Interface design
+ st.title("🎙 Voice Based Sentiment Analysis")
+ st.write("Detect emotions, sentiment, and sarcasm from your voice with state-of-the-art accuracy using OpenAI Whisper.")

+ # Emotion Detection Function
  @st.cache_resource
+ def get_emotion_classifier():
      tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", use_fast=True)
      model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")
      model = model.to(device)
      return pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None, device=-1 if device.type == "cpu" else 0)

+ def perform_emotion_detection(text):
      try:
+         if not text or len(text.strip()) < 3:
+             return {}, "neutral", {}, "NEUTRAL"
+
+         emotion_classifier = get_emotion_classifier()
+         emotion_results = emotion_classifier(text)[0]
+
+         emotion_map = {
+             "admiration": "🤩", "amusement": "😄", "anger": "😡", "annoyance": "😒",
+             "approval": "👍", "caring": "🤗", "confusion": "😕", "curiosity": "🧐",
+             "desire": "😍", "disappointment": "😞", "disapproval": "👎", "disgust": "🤢",
+             "embarrassment": "😳", "excitement": "🤩", "fear": "😨", "gratitude": "🙏",
+             "grief": "😢", "joy": "😊", "love": "❤", "nervousness": "😰",
+             "optimism": "🌈", "pride": "😌", "realization": "💡", "relief": "😌",
+             "remorse": "😔", "sadness": "😭", "surprise": "😲", "neutral": "😐"
+         }
+
+         positive_emotions = ["admiration", "amusement", "approval", "caring", "desire",
+                              "excitement", "gratitude", "joy", "love", "optimism", "pride", "relief"]
+         negative_emotions = ["anger", "annoyance", "disappointment", "disapproval", "disgust",
+                              "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"]
+         neutral_emotions = ["confusion", "curiosity", "realization", "surprise", "neutral"]
+
+         # Fix 1: Create a clean emotions dictionary from results
+         emotions_dict = {}
+         for result in emotion_results:
+             emotions_dict[result['label']] = result['score']
+
+         # Fix 2: Filter out very low scores (below threshold)
+         filtered_emotions = {k: v for k, v in emotions_dict.items() if v > 0.05}
+
+         # If filtered dictionary is empty, fall back to original
+         if not filtered_emotions:
+             filtered_emotions = emotions_dict
+
+         # Fix 3: Make sure we properly find the top emotion
+         top_emotion = max(filtered_emotions, key=filtered_emotions.get)
+         top_score = filtered_emotions[top_emotion]
+
+         # Fix 4: More robust sentiment assignment
+         if top_emotion in positive_emotions:
+             sentiment = "POSITIVE"
+         elif top_emotion in negative_emotions:
+             sentiment = "NEGATIVE"
+         else:
+             # If the top emotion is neutral but there are strong competing emotions, use them
+             competing_emotions = sorted(filtered_emotions.items(), key=lambda x: x[1], reverse=True)[:3]
+
+             # Check if there's a close second non-neutral emotion
+             if len(competing_emotions) > 1:
+                 if (competing_emotions[0][0] in neutral_emotions and
+                         competing_emotions[1][0] not in neutral_emotions and
+                         competing_emotions[1][1] > 0.7 * competing_emotions[0][1]):
+                     # Use the second strongest emotion instead
+                     top_emotion = competing_emotions[1][0]
+                     if top_emotion in positive_emotions:
+                         sentiment = "POSITIVE"
+                     elif top_emotion in negative_emotions:
+                         sentiment = "NEGATIVE"
+                     else:
+                         sentiment = "NEUTRAL"
+                 else:
+                     sentiment = "NEUTRAL"
+             else:
+                 sentiment = "NEUTRAL"
+
+         # Log for debugging
+         print(f"Text: {text[:50]}...")
+         print(f"Top 3 emotions: {sorted(filtered_emotions.items(), key=lambda x: x[1], reverse=True)[:3]}")
+         print(f"Selected top emotion: {top_emotion} ({filtered_emotions.get(top_emotion, 0):.3f})")
+         print(f"Sentiment determined: {sentiment}")
+
+         return emotions_dict, top_emotion, emotion_map, sentiment
      except Exception as e:
+         st.error(f"Emotion detection failed: {str(e)}")
+         print(f"Exception in emotion detection: {str(e)}")
+         return {}, "neutral", {}, "NEUTRAL"
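A note on the pipeline shape the new function relies on (a minimal standalone sketch, not part of the commit; the example sentence and scores are illustrative only): with top_k=None, as configured in get_emotion_classifier, the Hugging Face text-classification pipeline returns one entry per input, and that entry is a list of {'label', 'score'} dicts covering all 28 go_emotions labels (27 emotions plus neutral). That is why perform_emotion_detection indexes [0] before building emotions_dict.

    from transformers import pipeline

    # Same checkpoint the app loads; numbers below are only examples.
    clf = pipeline("text-classification",
                   model="SamLowe/roberta-base-go_emotions",
                   top_k=None)                 # keep scores for every label
    results = clf("I'm so glad you came!")     # one result per input text
    scores = results[0]                        # list of {'label': ..., 'score': ...}
    print(sorted(scores, key=lambda d: d["score"], reverse=True)[:3])
    # e.g. [{'label': 'joy', 'score': 0.95}, {'label': 'excitement', 'score': 0.02}, ...]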
 
+ # Sarcasm Detection Function
  @st.cache_resource
  def get_sarcasm_classifier():
      tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-irony", use_fast=True)

  def perform_sarcasm_detection(text):
      try:
+         if not text or len(text.strip()) < 3:
+             return False, 0.0
+
+         sarcasm_classifier = get_sarcasm_classifier()
+         result = sarcasm_classifier(text)[0]
          is_sarcastic = result['label'] == "LABEL_1"
          sarcasm_score = result['score'] if is_sarcastic else 1 - result['score']
          return is_sarcastic, sarcasm_score

          st.error(f"Sarcasm detection failed: {str(e)}")
          return False, 0.0
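A brief aside on the label check above (an illustrative sketch, not part of the commit): the code assumes the irony checkpoint exposes generic class names, LABEL_0 for non-ironic and LABEL_1 for ironic text (the TweetEval mapping), which is what result['label'] == "LABEL_1" relies on. The score shown in the comment is only indicative.

    from transformers import pipeline

    irony = pipeline("text-classification",
                     model="cardiffnlp/twitter-roberta-base-irony")
    print(irony("Oh great, another Monday.")[0])
    # e.g. {'label': 'LABEL_1', 'score': 0.88}  -- LABEL_1 == ironic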
 
+ # Validate audio quality
  def validate_audio(audio_path):
      try:
          sound = AudioSegment.from_file(audio_path)
+         if sound.dBFS < -50:
+             st.warning("Audio volume is too low. Please record or upload a louder audio.")
+             return False
+         if len(sound) < 1000:  # Less than 1 second
+             st.warning("Audio is too short. Please record a longer audio.")
              return False
          return True
+     except:
          st.error("Invalid or corrupted audio file.")
          return False

  # Speech Recognition with Whisper
  @st.cache_resource
  def load_whisper_model():
+     # Use 'large-v3' for maximum accuracy
+     model = whisper.load_model("large-v3")
+     return model

+ def transcribe_audio(audio_path, show_alternative=False):
      try:
+         st.write(f"Processing audio file: {audio_path}")
          sound = AudioSegment.from_file(audio_path)
+         st.write(f"Audio duration: {len(sound)/1000:.2f}s, Sample rate: {sound.frame_rate}, Channels: {sound.channels}")
+
+         # Convert to WAV format (16kHz, mono) for Whisper
          temp_wav_path = os.path.join(tempfile.gettempdir(), "temp_converted.wav")
+         sound = sound.set_frame_rate(16000)
+         sound = sound.set_channels(1)
          sound.export(temp_wav_path, format="wav")
+
+         # Load Whisper model
          model = load_whisper_model()
+
+         # Transcribe audio
          result = model.transcribe(temp_wav_path, language="en")
+         main_text = result["text"].strip()
+
+         # Clean up
+         if os.path.exists(temp_wav_path):
+             os.remove(temp_wav_path)
+
+         # Whisper doesn't provide alternatives, so return empty list
+         if show_alternative:
+             return main_text, []
+         return main_text
      except Exception as e:
          st.error(f"Transcription failed: {str(e)}")
+         return ("", []) if show_alternative else ""

+ # Function to handle uploaded audio files
+ def process_uploaded_audio(audio_file):
+     if not audio_file:
+         return None
+
+     try:
+         temp_dir = tempfile.gettempdir()
+         temp_file_path = os.path.join(temp_dir, f"uploaded_audio_{int(time.time())}.wav")
+
+         with open(temp_file_path, "wb") as f:
+             f.write(audio_file.getvalue())
+
+         if not validate_audio(temp_file_path):
+             return None
+
+         return temp_file_path
+     except Exception as e:
+         st.error(f"Error processing uploaded audio: {str(e)}")
          return None

+ # Show model information
+ def show_model_info():
+     st.sidebar.header("🧠 About the Models")
+
+     model_tabs = st.sidebar.tabs(["Emotion", "Sarcasm", "Speech"])
+
+     with model_tabs[0]:
+         st.markdown("""
+         *Emotion Model*: SamLowe/roberta-base-go_emotions
+         - Fine-tuned on GoEmotions dataset (58k Reddit comments, 27 emotions)
+         - Architecture: RoBERTa base
+         - Micro-F1: 0.46
+         [🔍 Model Hub](https://huggingface.co/SamLowe/roberta-base-go_emotions)
+         """)
+
+     with model_tabs[1]:
+         st.markdown("""
+         *Sarcasm Model*: cardiffnlp/twitter-roberta-base-irony
+         - Trained on SemEval-2018 Task 3 (Twitter irony dataset)
+         - Architecture: RoBERTa base
+         - F1-score: 0.705
+         [🔍 Model Hub](https://huggingface.co/cardiffnlp/twitter-roberta-base-irony)
+         """)
+
+     with model_tabs[2]:
+         st.markdown("""
+         *Speech Recognition*: OpenAI Whisper (large-v3)
+         - State-of-the-art model for speech-to-text
+         - Accuracy: ~5-10% WER on clean English audio
+         - Robust to noise, accents, and varied conditions
+         - Runs locally, no internet required
+         *Tips*: Use good mic, reduce noise, speak clearly
+         [🔍 Model Details](https://github.com/openai/whisper)
+         """)

+ # Custom audio recorder using HTML/JS
+ def custom_audio_recorder():
+     audio_recorder_html = """
+     <script>
+     var audioRecorder = {
+         audioBlobs: [],
+         mediaRecorder: null,
+         streamBeingCaptured: null,
+         start: function() {
+             if (!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)) {
+                 return Promise.reject(new Error('mediaDevices API or getUserMedia method is not supported in this browser.'));
+             }
+             else {
+                 return navigator.mediaDevices.getUserMedia({ audio: true })
+                     .then(stream => {
+                         audioRecorder.streamBeingCaptured = stream;
+                         audioRecorder.mediaRecorder = new MediaRecorder(stream);
+                         audioRecorder.audioBlobs = [];
+
+                         audioRecorder.mediaRecorder.addEventListener("dataavailable", event => {
+                             audioRecorder.audioBlobs.push(event.data);
+                         });
+
+                         audioRecorder.mediaRecorder.start();
+                     });
+             }
+         },
+         stop: function() {
+             return new Promise(resolve => {
+                 let mimeType = audioRecorder.mediaRecorder.mimeType;
+
+                 audioRecorder.mediaRecorder.addEventListener("stop", () => {
+                     let audioBlob = new Blob(audioRecorder.audioBlobs, { type: mimeType });
+                     resolve(audioBlob);
+                 });
+
+                 audioRecorder.mediaRecorder.stop();
+
+                 audioRecorder.stopStream();
+                 audioRecorder.resetRecordingProperties();
+             });
+         },
+         stopStream: function() {
+             audioRecorder.streamBeingCaptured.getTracks()
+                 .forEach(track => track.stop());
+         },
+         resetRecordingProperties: function() {
+             audioRecorder.mediaRecorder = null;
+             audioRecorder.streamBeingCaptured = null;
+         }
+     }
+     var isRecording = false;
+     var recordButton = document.getElementById('record-button');
+     var audioElement = document.getElementById('audio-playback');
+     var audioData = document.getElementById('audio-data');
+
+     function toggleRecording() {
+         if (!isRecording) {
+             audioRecorder.start()
+                 .then(() => {
+                     isRecording = true;
+                     recordButton.textContent = 'Stop Recording';
+                     recordButton.classList.add('recording');
+                 })
+                 .catch(error => {
+                     alert('Error starting recording: ' + error.message);
+                 });
+         } else {
+             audioRecorder.stop()
+                 .then(audioBlob => {
+                     const audioUrl = URL.createObjectURL(audioBlob);
+                     audioElement.src = audioUrl;
+
+                     const reader = new FileReader();
+                     reader.readAsDataURL(audioBlob);
+                     reader.onloadend = function() {
+                         const base64data = reader.result;
+                         audioData.value = base64data;
+                         const streamlitMessage = {type: "streamlit:setComponentValue", value: base64data};
+                         window.parent.postMessage(streamlitMessage, "*");
+                     }
+
+                     isRecording = false;
+                     recordButton.textContent = 'Start Recording';
+                     recordButton.classList.remove('recording');
+                 });
+         }
+     }
+     document.addEventListener('DOMContentLoaded', function() {
+         recordButton = document.getElementById('record-button');
+         audioElement = document.getElementById('audio-playback');
+         audioData = document.getElementById('audio-data');
+
+         recordButton.addEventListener('click', toggleRecording);
+     });
+     </script>
+     <div class="audio-recorder-container">
+         <button id="record-button" class="record-button">Start Recording</button>
+         <audio id="audio-playback" controls style="display:block; margin-top:10px;"></audio>
+         <input type="hidden" id="audio-data" name="audio-data">
+     </div>
+     <style>
+     .audio-recorder-container {
+         display: flex;
+         flex-direction: column;
+         align-items: center;
+         padding: 20px;
      }
+     .record-button {
+         background-color: #f63366;
+         color: white;
+         border: none;
+         padding: 10px 20px;
+         border-radius: 5px;
+         cursor: pointer;
+         font-size: 16px;
+     }
+     .record-button.recording {
+         background-color: #ff0000;
+         animation: pulse 1.5s infinite;
+     }
+     @keyframes pulse {
+         0% { opacity: 1; }
+         50% { opacity: 0.7; }
+         100% { opacity: 1; }
+     }
+     </style>
+     """
+
+     return components.html(audio_recorder_html, height=150)

+ # Function to display analysis results
+ def display_analysis_results(transcribed_text):
+     # Fix 5: Add debugging to track what's happening
+     st.session_state.debug_info = st.session_state.get('debug_info', [])
+     st.session_state.debug_info.append(f"Processing text: {transcribed_text[:50]}...")
+
+     emotions_dict, top_emotion, emotion_map, sentiment = perform_emotion_detection(transcribed_text)
+     is_sarcastic, sarcasm_score = perform_sarcasm_detection(transcribed_text)
+
+     # Add results to debug info
+     st.session_state.debug_info.append(f"Top emotion: {top_emotion}, Sentiment: {sentiment}")
+     st.session_state.debug_info.append(f"Sarcasm: {is_sarcastic}, Score: {sarcasm_score:.3f}")

+     st.header("Transcribed Text")
+     st.text_area("Text", transcribed_text, height=150, disabled=True, help="The audio converted to text.")

+     confidence_score = min(0.95, max(0.70, len(transcribed_text.split()) / 50))
+     st.caption(f"Transcription confidence: {confidence_score:.2f}")

+     st.header("Analysis Results")
      col1, col2 = st.columns([1, 2])
+
      with col1:
          st.subheader("Sentiment")
          sentiment_icon = "👍" if sentiment == "POSITIVE" else "👎" if sentiment == "NEGATIVE" else "😐"
+         st.markdown(f"**{sentiment_icon} {sentiment.capitalize()}** (Based on {top_emotion})")
+         st.info("Sentiment reflects the dominant emotion's tone.")
+
          st.subheader("Sarcasm")
          sarcasm_icon = "😏" if is_sarcastic else "😐"
+         sarcasm_text = "Detected" if is_sarcastic else "Not Detected"
+         st.markdown(f"**{sarcasm_icon} {sarcasm_text}** (Score: {sarcasm_score:.3f})")
+         st.info("Score indicates sarcasm confidence (0 to 1).")

      with col2:
+         st.subheader("Emotions")
+         if emotions_dict:
+             st.markdown(f"*Dominant:* {emotion_map.get(top_emotion, '❓')} {top_emotion.capitalize()} (Score: {emotions_dict[top_emotion]:.3f})")
+             sorted_emotions = sorted(emotions_dict.items(), key=lambda x: x[1], reverse=True)
+             top_emotions = sorted_emotions[:8]
+             emotions = [e[0] for e in top_emotions]
+             scores = [e[1] for e in top_emotions]
+             fig = px.bar(x=emotions, y=scores, labels={'x': 'Emotion', 'y': 'Score'},
+                          title="Top Emotions Distribution", color=emotions,
+                          color_discrete_sequence=px.colors.qualitative.Bold)
+             fig.update_layout(yaxis_range=[0, 1], showlegend=False, title_font_size=14)
+             st.plotly_chart(fig, use_container_width=True)
+         else:
+             st.write("No emotions detected.")
+
+     # Fix 6: Add debug expander for troubleshooting
+     with st.expander("Debug Information", expanded=False):
+         st.write("Debugging information for troubleshooting:")
+         for i, debug_line in enumerate(st.session_state.debug_info[-10:]):
+             st.text(f"{i+1}. {debug_line}")
+         if emotions_dict:
+             st.write("Raw emotion scores:")
+             for emotion, score in sorted(emotions_dict.items(), key=lambda x: x[1], reverse=True):
+                 if score > 0.01:  # Only show non-negligible scores
+                     st.text(f"{emotion}: {score:.4f}")

+     with st.expander("Analysis Details", expanded=False):
          st.write("""
+         *How this works:*
+         1. *Speech Recognition*: Audio transcribed using OpenAI Whisper (large-v3)
+         2. *Emotion Analysis*: RoBERTa model trained on GoEmotions (27 emotions)
+         3. *Sentiment Analysis*: Derived from dominant emotion
+         4. *Sarcasm Detection*: RoBERTa model for irony detection
+         *Accuracy depends on*:
+         - Audio quality
+         - Speech clarity
+         - Background noise
+         - Speech patterns
          """)

+ # Process base64 audio data
+ def process_base64_audio(base64_data):
+     try:
+         base64_binary = base64_data.split(',')[1]
+         binary_data = base64.b64decode(base64_binary)
+
+         temp_dir = tempfile.gettempdir()
+         temp_file_path = os.path.join(temp_dir, f"recording_{int(time.time())}.wav")
+
+         with open(temp_file_path, "wb") as f:
+             f.write(binary_data)
+
+         if not validate_audio(temp_file_path):
+             return None
+
+         return temp_file_path
+     except Exception as e:
+         st.error(f"Error processing audio data: {str(e)}")
+         return None
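For context on the recorder hand-off (an illustrative sketch, not part of the commit): the JavaScript component above posts the recording back as a base64 data URL such as "data:audio/webm;base64,<payload>" (the exact MIME type depends on the browser's MediaRecorder), and process_base64_audio splits off the header and decodes the payload before writing it to a temp file.

    import base64

    data_url = "data:audio/webm;base64,T2dnUwACAAAAAAAAAAA="   # placeholder payload
    header, payload = data_url.split(",", 1)
    audio_bytes = base64.b64decode(payload)
    print(header, "->", len(audio_bytes), "bytes")

The file is saved with a .wav extension, but pydub hands it to ffmpeg, which generally detects the real container from the content rather than the filename when validate_audio and transcribe_audio read it back.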
 
  # Main App Logic
  def main():
+     # Fix 7: Initialize session state for debugging
+     if 'debug_info' not in st.session_state:
+         st.session_state.debug_info = []
+
+     tab1, tab2 = st.tabs(["📁 Upload Audio", "🎙 Record Audio"])
+
+     with tab1:
+         st.header("Upload an Audio File")
+         audio_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "ogg"],
+                                       help="Upload an audio file for analysis")
+
+         if audio_file:
+             st.audio(audio_file.getvalue())
+             st.caption("🎧 Uploaded Audio Playback")
+
+             upload_button = st.button("Analyze Upload", key="analyze_upload")
+
+             if upload_button:
+                 with st.spinner('Analyzing audio with advanced precision...'):
+                     temp_audio_path = process_uploaded_audio(audio_file)
+                     if temp_audio_path:
+                         main_text, alternatives = transcribe_audio(temp_audio_path, show_alternative=True)
+
+                         if main_text:
+                             if alternatives:
+                                 with st.expander("Alternative transcriptions detected", expanded=False):
+                                     for i, alt in enumerate(alternatives[:3], 1):
+                                         st.write(f"{i}. {alt}")
+
+                             display_analysis_results(main_text)
+                         else:
+                             st.error("Could not transcribe the audio. Please try again with clearer audio.")
+
+                         if os.path.exists(temp_audio_path):
+                             os.remove(temp_audio_path)
+
+     with tab2:
+         st.header("Record Your Voice")
+         st.write("Use the recorder below to analyze your speech in real-time.")
+
+         st.subheader("Browser-Based Recorder")
+         st.write("Click the button below to start/stop recording.")
+
+         audio_data = custom_audio_recorder()
+
+         if audio_data:
+             analyze_rec_button = st.button("Analyze Recording", key="analyze_rec")
+
+             if analyze_rec_button:
+                 with st.spinner("Processing your recording..."):
+                     temp_audio_path = process_base64_audio(audio_data)
+
+                     if temp_audio_path:
+                         transcribed_text = transcribe_audio(temp_audio_path)
+
+                         if transcribed_text:
+                             display_analysis_results(transcribed_text)
+                         else:
+                             st.error("Could not transcribe the audio. Please try speaking more clearly.")
+
+                         if os.path.exists(temp_audio_path):
+                             os.remove(temp_audio_path)
+
+         st.subheader("Manual Text Input")
+         st.write("If recording doesn't work, you can type your text here:")
+
+         manual_text = st.text_area("Enter text to analyze:", placeholder="Type what you want to analyze...")
+         analyze_text_button = st.button("Analyze Text", key="analyze_manual")
+
+         if analyze_text_button and manual_text:
+             display_analysis_results(manual_text)

+     show_model_info()

+ if __name__ == "__main__":
+     main()