import gradio as gr
import json
import os
import logging
import re
import numpy as np
import pandas as pd
from datetime import datetime
import time
import tempfile
from typing import Dict, List, Tuple, Optional
import requests

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Claude API key (optional - enables LLM-based analysis; referenced by call_claude_api below)
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Try to import video processing libraries
try:
    import moviepy.editor as mp
    MOVIEPY_AVAILABLE = True
    logger.info("MoviePy available for video processing")
except ImportError as e:
    logger.warning(f"MoviePy not available: {e}")
    MOVIEPY_AVAILABLE = False

# Try to import speaker diarization
try:
    from pyannote.audio import Pipeline
    from pyannote.audio.pipelines.utils.hook import ProgressHook
    DIARIZATION_AVAILABLE = True
    logger.info("Pyannote.audio available for speaker diarization")
except ImportError as e:
    logger.warning(f"Pyannote.audio not available: {e}")
    DIARIZATION_AVAILABLE = False

# Try to import SpeechBrain and HuggingFace components
try:
    from speechbrain.pretrained import EncoderDecoderASR, VAD, EncoderClassifier
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import torch
    SPEECHBRAIN_AVAILABLE = True
    HUGGINGFACE_AVAILABLE = True
    logger.info("SpeechBrain and HuggingFace models available")
except ImportError as e:
    logger.warning(f"SpeechBrain/HuggingFace not available: {e}")
    SPEECHBRAIN_AVAILABLE = False
    HUGGINGFACE_AVAILABLE = False
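# Dependency note (assumed package names; this script does not pin versions):
# the optional stacks above correspond to `moviepy`, `pyannote.audio`,
# `speechbrain`, `transformers`, and `torch`; the core requirements are
# `gradio`, `requests`, `numpy`, and `pandas`.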
# Initialize models if available
asr_model = None
vad_model = None
sentiment_model = None
emotion_model = None
diarization_pipeline = None

if SPEECHBRAIN_AVAILABLE and HUGGINGFACE_AVAILABLE:
    try:
        # Speech-to-text model
        asr_model = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-crdnn-rnnlm-librispeech",
            savedir="pretrained_models/asr-crdnn-rnnlm-librispeech"
        )
        # Voice Activity Detection
        vad_model = VAD.from_hparams(
            source="speechbrain/vad-crdnn-libriparty",
            savedir="pretrained_models/vad-crdnn-libriparty"
        )
        # Sentiment analysis
        sentiment_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            top_k=None
        )
        # Emotion analysis
        emotion_model = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None
        )
        logger.info("All models loaded successfully")
    except Exception as e:
        logger.error(f"Error loading models: {e}")
        SPEECHBRAIN_AVAILABLE = False
        HUGGINGFACE_AVAILABLE = False

# Initialize diarization pipeline
if DIARIZATION_AVAILABLE:
    try:
        # Note: You'll need to get a HuggingFace token and accept the model terms
        # at https://huggingface.co/pyannote/speaker-diarization
        HF_TOKEN = os.getenv("HF_TOKEN", "")
        if HF_TOKEN:
            diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization@2.1",
                use_auth_token=HF_TOKEN
            )
            logger.info("Speaker diarization pipeline loaded")
        else:
            logger.warning("HF_TOKEN not set - speaker diarization will be disabled")
    except Exception as e:
        logger.error(f"Error loading diarization pipeline: {e}")
def extract_audio_from_video(video_path):
    """Extract audio from a video file (MP4, etc.)."""
    if not MOVIEPY_AVAILABLE:
        return None, "MoviePy not available for video processing"
    try:
        # Create temporary file for audio
        temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_audio_path = temp_audio.name
        temp_audio.close()

        # Load video and extract audio
        video = mp.VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            video.close()
            return None, "No audio track found in video file"

        # Export audio to temporary WAV file
        audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

        # Close clips to free memory
        audio.close()
        video.close()

        logger.info(f"Audio extracted from video: {temp_audio_path}")
        return temp_audio_path, "Audio extracted successfully"
    except Exception as e:
        logger.error(f"Error extracting audio from video: {e}")
        return None, f"Error extracting audio: {str(e)}"
def perform_speaker_diarization(audio_path):
    """Perform speaker diarization on an audio file."""
    if not DIARIZATION_AVAILABLE or not diarization_pipeline:
        return None, "Speaker diarization not available"
    try:
        # Perform diarization
        with ProgressHook() as hook:
            diarization = diarization_pipeline(audio_path, hook=hook)

        # Extract speaker segments
        speaker_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speaker_segments.append({
                'start': turn.start,
                'end': turn.end,
                'speaker': speaker,
                'duration': turn.end - turn.start
            })

        logger.info(f"Diarization completed: {len(speaker_segments)} segments found")
        return speaker_segments, "Diarization completed successfully"
    except Exception as e:
        logger.error(f"Error in diarization: {e}")
        return None, f"Diarization error: {str(e)}"
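# Illustrative shape of a speaker_segments entry (values are made up):
# {'start': 0.0, 'end': 3.2, 'speaker': 'SPEAKER_00', 'duration': 3.2}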
def process_audio_file(file_path):
    """Process an audio file, extracting audio from video if needed."""
    if not file_path:
        return None, "No file provided"

    file_extension = os.path.splitext(file_path)[1].lower()

    # If it's a video file, extract the audio track first
    if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv']:
        logger.info(f"Processing video file: {file_path}")
        audio_path, status = extract_audio_from_video(file_path)
        if audio_path:
            return audio_path, f"Video processed: {status}"
        else:
            return None, status
    # If it's already an audio file, use it directly
    elif file_extension in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']:
        logger.info(f"Processing audio file: {file_path}")
        return file_path, "Audio file ready for transcription"
    else:
        return None, f"Unsupported file format: {file_extension}"
def transcribe_audio_with_metadata(audio_file, enable_diarization=True):
    """Transcribe audio with timestamps, sentiment, and metadata."""
    if not audio_file:
        return None, "No audio file provided"
    if not SPEECHBRAIN_AVAILABLE:
        return None, "SpeechBrain not available - transcription cannot be performed"
    try:
        # Process the file (extract audio if it's a video)
        processed_audio_path, process_status = process_audio_file(audio_file)
        if not processed_audio_path:
            return None, process_status

        # Perform speaker diarization if enabled
        speaker_segments = None
        diarization_status = ""
        if enable_diarization:
            speaker_segments, diarization_status = perform_speaker_diarization(processed_audio_path)

        # Get transcription (timestamps are estimated below, not returned by the ASR model)
        transcript = asr_model.transcribe_file(processed_audio_path)

        # Clean up temporary audio file if it was created from video
        if processed_audio_path != audio_file and os.path.exists(processed_audio_path):
            try:
                os.unlink(processed_audio_path)
                logger.info("Temporary audio file cleaned up")
            except Exception as e:
                logger.warning(f"Could not clean up temporary file: {e}")

        # Split into sentences for analysis
        sentences = re.split(r'[.!?]+', transcript)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Analyze each sentence
        rich_transcript = []
        current_time = 0
        for i, sentence in enumerate(sentences):
            # Estimate timestamp (rough approximation)
            timestamp = current_time + (i * 2)  # Assume ~2 seconds per sentence

            # Determine speaker for this timestamp
            speaker = "UNKNOWN"
            if speaker_segments:
                for segment in speaker_segments:
                    if segment['start'] <= timestamp <= segment['end']:
                        speaker = segment['speaker']
                        break

            # Sentiment analysis
            sentiment_result = sentiment_model(sentence)[0] if sentiment_model else None
            sentiment = max(sentiment_result, key=lambda x: x['score']) if sentiment_result else {'label': 'neutral', 'score': 0.5}

            # Emotion analysis
            emotion_result = emotion_model(sentence)[0] if emotion_model else None
            emotion = max(emotion_result, key=lambda x: x['score']) if emotion_result else {'label': 'neutral', 'score': 0.5}

            # Word count and complexity metrics
            words = sentence.split()
            word_count = len(words)
            avg_word_length = np.mean([len(word) for word in words]) if words else 0

            # Estimate speech rate in words per minute (assumes ~2 seconds per sentence)
            speech_rate = word_count * 30

            rich_transcript.append({
                'timestamp': timestamp,
                'speaker': speaker,
                'sentence': sentence,
                'word_count': word_count,
                'avg_word_length': round(avg_word_length, 2),
                'speech_rate_wpm': round(speech_rate, 1),
                'sentiment': sentiment['label'],
                'sentiment_score': round(sentiment['score'], 3),
                'emotion': emotion['label'],
                'emotion_score': round(emotion['score'], 3)
            })
            current_time = timestamp

        status_msg = f"Transcription completed successfully. {process_status}"
        if diarization_status:
            status_msg += f" {diarization_status}"
        return rich_transcript, status_msg
    except Exception as e:
        logger.error(f"Error in transcription: {e}")
        return None, f"Transcription error: {str(e)}"
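# Illustrative shape of a rich_transcript entry (values are made up):
# {'timestamp': 4, 'speaker': 'SPEAKER_00', 'sentence': 'I went to the park',
#  'word_count': 5, 'avg_word_length': 2.8, 'speech_rate_wpm': 150.0,
#  'sentiment': 'positive', 'sentiment_score': 0.91,
#  'emotion': 'joy', 'emotion_score': 0.78}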
def format_rich_transcript(rich_transcript):
    """Format a rich transcript for display."""
    if not rich_transcript:
        return "No transcript data available"
    formatted_lines = []
    for entry in rich_transcript:
        timestamp_str = f"{int(entry['timestamp'] // 60):02d}:{int(entry['timestamp'] % 60):02d}"
        line = f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}"
        line += f" [Words: {entry['word_count']}, Rate: {entry['speech_rate_wpm']}wpm]"
        line += f" [Sentiment: {entry['sentiment']} ({entry['sentiment_score']})]"
        line += f" [Emotion: {entry['emotion']} ({entry['emotion_score']})]"
        formatted_lines.append(line)
    return '\n'.join(formatted_lines)
def calculate_slp_metrics(rich_transcript):
    """Calculate comprehensive SLP metrics."""
    if not rich_transcript:
        return {}

    # Basic metrics
    total_sentences = len(rich_transcript)
    total_words = sum(entry['word_count'] for entry in rich_transcript)
    total_duration = rich_transcript[-1]['timestamp'] if rich_transcript else 0

    # Speaker analysis
    speakers = {}
    for entry in rich_transcript:
        speaker = entry['speaker']
        if speaker not in speakers:
            speakers[speaker] = {
                'sentences': 0,
                'words': 0,
                'sentiments': [],
                'emotions': []
            }
        speakers[speaker]['sentences'] += 1
        speakers[speaker]['words'] += entry['word_count']
        speakers[speaker]['sentiments'].append(entry['sentiment'])
        speakers[speaker]['emotions'].append(entry['emotion'])

    # Word-level analysis
    all_words = []
    for entry in rich_transcript:
        words = entry['sentence'].lower().split()
        all_words.extend(words)

    # Word frequency distribution
    word_freq = {}
    for word in all_words:
        word_clean = re.sub(r'[^\w\s]', '', word)
        if word_clean:
            word_freq[word_clean] = word_freq.get(word_clean, 0) + 1

    # Vocabulary diversity (Type-Token Ratio)
    unique_words = len(set(all_words))
    ttr = unique_words / total_words if total_words > 0 else 0

    # Speech rate analysis
    speech_rates = [entry['speech_rate_wpm'] for entry in rich_transcript]
    avg_speech_rate = np.mean(speech_rates) if speech_rates else 0

    # Sentiment and emotion distributions
    sentiment_counts = {}
    emotion_counts = {}
    for entry in rich_transcript:
        sentiment_counts[entry['sentiment']] = sentiment_counts.get(entry['sentiment'], 0) + 1
        emotion_counts[entry['emotion']] = emotion_counts.get(entry['emotion'], 0) + 1

    # Sentence complexity
    sentence_lengths = [entry['word_count'] for entry in rich_transcript]
    avg_sentence_length = np.mean(sentence_lengths) if sentence_lengths else 0

    # Pause analysis (gaps between sentences)
    pauses = []
    for i in range(1, len(rich_transcript)):
        pause = rich_transcript[i]['timestamp'] - rich_transcript[i - 1]['timestamp']
        pauses.append(pause)
    avg_pause_duration = np.mean(pauses) if pauses else 0

    return {
        'total_sentences': total_sentences,
        'total_words': total_words,
        'total_duration_seconds': total_duration,
        'unique_words': unique_words,
        'type_token_ratio': round(ttr, 3),
        'avg_sentence_length': round(avg_sentence_length, 1),
        'avg_speech_rate_wpm': round(avg_speech_rate, 1),
        'avg_pause_duration': round(avg_pause_duration, 1),
        'sentiment_distribution': sentiment_counts,
        'emotion_distribution': emotion_counts,
        'word_frequency': dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]),
        'speech_rate_variability': round(np.std(speech_rates), 1) if speech_rates else 0,
        'speakers': speakers,
        'speaker_count': len(speakers)
    }
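# Note: the type-token ratio is length-sensitive (longer samples tend toward
# lower ratios), so compare TTR values only across samples of similar size.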
def generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes=""):
    """Generate a comprehensive analysis prompt using rich transcript data."""
    # Format rich transcript with timestamps and metadata
    transcript_lines = []
    for entry in rich_transcript:
        timestamp_str = f"{int(entry['timestamp'] // 60):02d}:{int(entry['timestamp'] % 60):02d}"
        transcript_lines.append(f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}")
    transcript_text = '\n'.join(transcript_lines)

    # Format metrics for analysis
    metrics_text = f"""
TRANSCRIPT METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}

SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}

SPEAKER ANALYSIS:"""
    for speaker, data in metrics['speakers'].items():
        metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"
    metrics_text += f"\n\nMOST FREQUENT WORDS: {list(metrics['word_frequency'].keys())[:10]}"

    notes_section = f"\nSLP CLINICAL NOTES:\n{slp_notes}" if slp_notes else ""

    prompt = f"""
You are a speech-language pathologist conducting a comprehensive analysis of a speech transcript with rich temporal and affective metadata.

PATIENT: {age}-year-old {gender}
{metrics_text}

TRANSCRIPT WITH TIMESTAMPS AND METADATA:
{transcript_text}{notes_section}

Please provide a comprehensive analysis including:

1. TEMPORAL SPEECH PATTERNS:
- Analyze speech rate changes over time using timestamps
- Identify patterns in pause duration and frequency
- Assess temporal consistency in speech production
- Note any significant changes in speech patterns throughout the session

2. AFFECTIVE AND EMOTIONAL ANALYSIS:
- Analyze sentiment patterns throughout the transcript using timestamp data
- Identify emotional shifts and their potential causes
- Assess emotional regulation and expression
- Note any correlations between emotional state and speech characteristics

3. SPEAKER-SPECIFIC ANALYSIS (if multiple speakers):
- Compare speech patterns between speakers
- Analyze turn-taking patterns and timing
- Assess interaction dynamics
- Note speaker-specific emotional and sentiment patterns

4. SPEECH FLUENCY AND RATE ANALYSIS:
- Analyze speech rate variability using the provided metrics
- Identify periods of fluent vs. dysfluent speech
- Assess the impact of emotional state on speech rate
- Note any temporal patterns in speech rate changes

5. LANGUAGE COMPLEXITY ASSESSMENT:
- Analyze vocabulary diversity using Type-Token Ratio
- Assess sentence complexity and variety
- Identify patterns in word frequency and usage
- Note any temporal changes in language complexity

6. COMPLEX SENTENCE ANALYSIS:
- Count and analyze use of coordinating conjunctions (and, but, or, so, yet, for, nor)
- Count and analyze use of subordinating conjunctions (because, although, while, since, if, when, where, that, which, who, whom, whose)
- Identify compound, complex, and compound-complex sentences
- Assess sentence variety and complexity level for age

7. FIGURATIVE LANGUAGE ANALYSIS:
- Identify and count similes (comparisons using "like" or "as")
- Identify and count metaphors (direct comparisons without "like" or "as")
- Identify and count idioms (common expressions with non-literal meanings)
- Assess figurative language comprehension and use for age

8. CLINICAL IMPLICATIONS:
- Specific intervention targets based on temporal patterns
- Recommendations for emotional regulation if needed
- Suggestions for improving speech rate consistency
- Strategies for enhancing language complexity
- Age-appropriate development recommendations

9. COMPREHENSIVE SUMMARY:
- Overall communication profile with temporal considerations
- Assessment of emotional and affective communication
- Developmental appropriateness considering age
- Prognosis and treatment priorities

Use the temporal data, sentiment scores, and emotional labels to provide insights that would not be possible with a simple transcript. Reference specific timestamps and emotional states when making observations.
"""
    return prompt
def analyze_rich_transcript_with_llm(rich_transcript, age, gender, slp_notes=""):
    """Analyze a rich transcript using an LLM with comprehensive metadata."""
    if not rich_transcript:
        return "No transcript data available for analysis."

    # Calculate SLP metrics
    metrics = calculate_slp_metrics(rich_transcript)

    # Generate comprehensive analysis prompt
    prompt = generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes)

    # Get analysis from the Claude API, or fall back to the built-in demo analysis
    if ANTHROPIC_API_KEY:
        result = call_claude_api(prompt)
    else:
        result = generate_demo_analysis(rich_transcript, metrics)
    return result
def call_claude_api(prompt):
    """Call the Claude API directly."""
    if not ANTHROPIC_API_KEY:
        return "❌ Claude API key not configured. Please set ANTHROPIC_API_KEY environment variable."
    try:
        headers = {
            "Content-Type": "application/json",
            "x-api-key": ANTHROPIC_API_KEY,
            "anthropic-version": "2023-06-01"
        }
        data = {
            "model": "claude-3-5-sonnet-20241022",
            "max_tokens": 4096,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        response = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers=headers,
            json=data,
            timeout=60
        )
        if response.status_code == 200:
            response_json = response.json()
            return response_json['content'][0]['text']
        else:
            logger.error(f"Claude API error: {response.status_code} - {response.text}")
            return f"❌ Claude API Error: {response.status_code}"
    except Exception as e:
        logger.error(f"Error calling Claude API: {str(e)}")
        return f"❌ Error: {str(e)}"
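# For reference, a successful Messages API response is roughly shaped like
# (abbreviated): {"content": [{"type": "text", "text": "..."}], "model": "...", "usage": {...}},
# which is why the text is read from response_json['content'][0]['text'] above.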
def generate_demo_analysis(rich_transcript, metrics):
    """Generate a demo analysis when the API is not available."""
    return f"""## Comprehensive SLP Analysis with Temporal and Affective Data

### TEMPORAL SPEECH PATTERNS
**Speech Rate Analysis**: {metrics['avg_speech_rate_wpm']} words per minute (variability: {metrics['speech_rate_variability']} wpm)
- Speech rate appears {'within normal limits' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'below typical range' if metrics['avg_speech_rate_wpm'] < 120 else 'above typical range'}
- Variability suggests {'consistent' if metrics['speech_rate_variability'] < 20 else 'variable'} speech patterns

**Pause Analysis**: Average pause duration of {metrics['avg_pause_duration']:.1f} seconds
- {'Appropriate' if 0.5 <= metrics['avg_pause_duration'] <= 2.0 else 'Short' if metrics['avg_pause_duration'] < 0.5 else 'Long'} pauses between utterances

### AFFECTIVE AND EMOTIONAL ANALYSIS
**Sentiment Distribution**: {metrics['sentiment_distribution']}
**Emotion Distribution**: {metrics['emotion_distribution']}
The emotional patterns suggest {'positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'neutral' if 'neutral' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['neutral'] > 2 else 'mixed'} emotional expression throughout the session.

### LANGUAGE COMPLEXITY
**Vocabulary Diversity**: Type-Token Ratio of {metrics['type_token_ratio']}
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited' if metrics['type_token_ratio'] < 0.3 else 'Moderate'} vocabulary diversity

**Sentence Structure**: Average {metrics['avg_sentence_length']} words per sentence
- Sentence length appears {'age-appropriate' if 5 <= metrics['avg_sentence_length'] <= 12 else 'below age expectations' if metrics['avg_sentence_length'] < 5 else 'above age expectations'}

**Most Frequent Words**: {', '.join(list(metrics['word_frequency'].keys())[:5])}

### SPEAKER ANALYSIS
**Number of Speakers**: {metrics['speaker_count']}
{chr(10).join([f"• {speaker}: {data['sentences']} sentences, {data['words']} words" for speaker, data in metrics['speakers'].items()])}

### CLINICAL IMPLICATIONS
Based on the temporal and affective analysis, this patient shows:
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited'} vocabulary diversity
- {'Appropriate' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'Atypical'} speech rate
- {'Consistent' if metrics['speech_rate_variability'] < 20 else 'Variable'} speech patterns
- {'Positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'Neutral'} emotional expression

### RECOMMENDATIONS
1. Focus on vocabulary expansion if TTR < 0.4
2. Address speech rate if outside normal range
3. Work on sentence complexity if below age expectations
4. Consider emotional regulation strategies based on sentiment patterns
5. Monitor temporal patterns in speech rate and fluency"""
def create_transcription_interface():
    """Create the transcription-focused Gradio interface."""
    with gr.Blocks(title="Advanced Transcription Tool", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎤 Advanced Transcription Tool")
        gr.Markdown("Transcribe audio/video with speaker diarization, timestamps, sentiment analysis, and comprehensive LLM analysis")

        with gr.Tabs():
            # Audio/Video Upload & Transcription Tab
            with gr.Tab("🎤 Audio/Video Transcription"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### File Upload")
                        gr.Markdown("**Supported formats:** MP4, AVI, MOV, MKV, WMV, FLV, WAV, MP3, M4A, FLAC, OGG")
                        file_input = gr.File(
                            label="Upload Audio or Video File",
                            file_types=["audio", "video"]
                        )
                        enable_diarization = gr.Checkbox(
                            label="Enable Speaker Diarization",
                            value=True,
                            info="Identify different speakers in the audio"
                        )
                        transcribe_btn = gr.Button(
                            "🎤 Transcribe File",
                            variant="primary",
                            size="lg"
                        )
                        transcription_status = gr.Markdown("")
                    with gr.Column(scale=2):
                        gr.Markdown("### Rich Transcript with Metadata")
                        rich_transcript_display = gr.Textbox(
                            label="Transcription with Speakers, Timestamps, Sentiment & Emotion",
                            lines=15,
                            max_lines=20
                        )

            # Analysis Tab
            with gr.Tab("📊 LLM Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Patient Information")
                        with gr.Row():
                            age = gr.Number(label="Age", value=8, minimum=1, maximum=120)
                            gender = gr.Radio(["male", "female", "other"], label="Gender", value="male")
                        slp_notes = gr.Textbox(
                            label="SLP Clinical Notes (Optional)",
                            placeholder="Enter additional clinical observations...",
                            lines=3
                        )
                        analyze_btn = gr.Button(
                            "🔍 Analyze with LLM",
                            variant="primary",
                            size="lg"
                        )
                    with gr.Column(scale=2):
                        gr.Markdown("### Comprehensive LLM Analysis")
                        analysis_output = gr.Textbox(
                            label="LLM Analysis Report",
                            lines=25,
                            max_lines=30
                        )

            # Metrics Tab
            with gr.Tab("📈 Speech Metrics"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Quantitative Speech Metrics")
                        metrics_display = gr.Textbox(
                            label="SLP Metrics",
                            lines=15,
                            max_lines=20
                        )
                    with gr.Column():
                        gr.Markdown("### Word Frequency Analysis")
                        word_freq_display = gr.Dataframe(
                            headers=["Word", "Frequency"],
                            label="Most Frequent Words",
                            interactive=False
                        )

            # Raw Data Tab
            with gr.Tab("📊 Raw Data"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### JSON Data")
                        json_display = gr.Textbox(
                            label="Raw JSON Data",
                            lines=20,
                            max_lines=25
                        )
        # Event handlers
        def on_transcribe(file, diarization_enabled):
            """Handle file transcription."""
            if not file:
                return "", "", [], "Please upload a file first."

            rich_transcript, status = transcribe_audio_with_metadata(file.name, diarization_enabled)
            if rich_transcript:
                formatted = format_rich_transcript(rich_transcript)
                metrics = calculate_slp_metrics(rich_transcript)

                # Format metrics for display
                metrics_text = f"""SPEECH METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}

SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}

SPEAKER ANALYSIS:"""
                for speaker, data in metrics['speakers'].items():
                    metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"

                # Create word frequency dataframe
                word_freq_data = [[word, freq] for word, freq in list(metrics['word_frequency'].items())[:20]]

                # JSON data (prepared here but not currently wired to the Raw Data tab's json_display)
                json_data = json.dumps(rich_transcript, indent=2)

                return formatted, metrics_text, word_freq_data, status
            else:
                return "", "", [], status

        def on_analyze(rich_transcript_text, age_val, gender_val, notes):
            """Handle LLM analysis."""
            if not rich_transcript_text or rich_transcript_text == "No transcript data available":
                return "Please transcribe audio first."

            # Convert formatted text back into a rich transcript structure
            lines = rich_transcript_text.split('\n')
            rich_transcript = []
            for i, line in enumerate(lines):
                if line.strip():
                    # Extract data from the formatted line
                    timestamp_match = re.search(r'\[(\d{2}:\d{2})\]', line)
                    speaker_match = re.search(r'\*(\w+):', line)
                    sentence_match = re.search(r'\*\w+:\s*(.+?)(?=\s*\[|$)', line)
                    if timestamp_match and speaker_match and sentence_match:
                        timestamp_str = timestamp_match.group(1)
                        minutes, seconds = map(int, timestamp_str.split(':'))
                        timestamp = minutes * 60 + seconds
                        speaker = speaker_match.group(1)
                        sentence = sentence_match.group(1).strip()
                        # Sentiment, emotion, and rate are not recoverable from the display text,
                        # so neutral placeholder values are used here
                        rich_transcript.append({
                            'timestamp': timestamp,
                            'speaker': speaker,
                            'sentence': sentence,
                            'word_count': len(sentence.split()),
                            'avg_word_length': np.mean([len(word) for word in sentence.split()]) if sentence.split() else 0,
                            'speech_rate_wpm': 120.0,
                            'sentiment': 'neutral',
                            'sentiment_score': 0.5,
                            'emotion': 'neutral',
                            'emotion_score': 0.5
                        })
            return analyze_rich_transcript_with_llm(rich_transcript, age_val, gender_val, notes)

        # Connect event handlers
        transcribe_btn.click(
            on_transcribe,
            inputs=[file_input, enable_diarization],
            outputs=[rich_transcript_display, metrics_display, word_freq_display, transcription_status]
        )
        analyze_btn.click(
            on_analyze,
            inputs=[rich_transcript_display, age, gender, slp_notes],
            outputs=[analysis_output]
        )

    return app
if __name__ == "__main__":
    print("🚀 Starting Advanced Transcription Tool...")
    if not MOVIEPY_AVAILABLE:
        print("⚠️ MoviePy not available - video processing will be limited")
        print("   Install with: pip install moviepy")
    else:
        print("✅ MoviePy available for video processing")
    if not DIARIZATION_AVAILABLE:
        print("⚠️ Pyannote.audio not available - speaker diarization will be disabled")
        print("   Install with: pip install pyannote.audio")
    else:
        print("✅ Pyannote.audio available for speaker diarization")
    if not os.getenv("HF_TOKEN"):
        print("⚠️ HF_TOKEN not set - set it to enable speaker diarization")
        print("   Get token from: https://huggingface.co/settings/tokens")
        print("   Accept model terms at: https://huggingface.co/pyannote/speaker-diarization")
    if not SPEECHBRAIN_AVAILABLE:
        print("⚠️ SpeechBrain not available - audio transcription will be disabled")
        print("   Install with: pip install speechbrain transformers torch")
    else:
        print("✅ SpeechBrain and HuggingFace models loaded")

    app = create_transcription_interface()
    app.launch(show_api=False)
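# Local run sketch (the entry-point filename below is an assumption; adjust to the actual file name):
#   export HF_TOKEN=...            # enables pyannote speaker diarization
#   export ANTHROPIC_API_KEY=...   # enables Claude-based LLM analysis
#   python app.py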