# SLPAnalysis / transcription_demo.py
import gradio as gr
import json
import os
import logging
import re
import numpy as np
import pandas as pd
from datetime import datetime
import time
import tempfile
from typing import Dict, List, Tuple, Optional
import requests
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Anthropic API key for the LLM analysis functions below (optional; see call_claude_api)
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Try to import video processing libraries
try:
import moviepy.editor as mp
MOVIEPY_AVAILABLE = True
logger.info("MoviePy available for video processing")
except ImportError as e:
logger.warning(f"MoviePy not available: {e}")
MOVIEPY_AVAILABLE = False
# Try to import speaker diarization
try:
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook
DIARIZATION_AVAILABLE = True
logger.info("Pyannote.audio available for speaker diarization")
except ImportError as e:
logger.warning(f"Pyannote.audio not available: {e}")
DIARIZATION_AVAILABLE = False
# Try to import SpeechBrain and HuggingFace components
try:
from speechbrain.pretrained import EncoderDecoderASR, VAD, EncoderClassifier
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
SPEECHBRAIN_AVAILABLE = True
HUGGINGFACE_AVAILABLE = True
logger.info("SpeechBrain and HuggingFace models available")
except ImportError as e:
logger.warning(f"SpeechBrain/HuggingFace not available: {e}")
SPEECHBRAIN_AVAILABLE = False
HUGGINGFACE_AVAILABLE = False
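# Optional dependencies are imported inside try/except so the app can still launch with reduced functionality when a library is missing.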
# Initialize models if available
asr_model = None
vad_model = None
sentiment_model = None
emotion_model = None
diarization_pipeline = None
if SPEECHBRAIN_AVAILABLE and HUGGINGFACE_AVAILABLE:
try:
# Speech-to-text model
asr_model = EncoderDecoderASR.from_hparams(
source="speechbrain/asr-crdnn-rnnlm-librispeech",
savedir="pretrained_models/asr-crdnn-rnnlm-librispeech"
)
# Voice Activity Detection
vad_model = VAD.from_hparams(
source="speechbrain/vad-crdnn-libriparty",
savedir="pretrained_models/vad-crdnn-libriparty"
)
# Sentiment analysis
sentiment_model = pipeline(
"sentiment-analysis",
model="cardiffnlp/twitter-roberta-base-sentiment-latest",
top_k=None
)
# Emotion analysis
emotion_model = pipeline(
"text-classification",
model="j-hartmann/emotion-english-distilroberta-base",
top_k=None
)
logger.info("All models loaded successfully")
except Exception as e:
logger.error(f"Error loading models: {e}")
SPEECHBRAIN_AVAILABLE = False
HUGGINGFACE_AVAILABLE = False
# Initialize diarization pipeline
if DIARIZATION_AVAILABLE:
try:
# Note: You'll need to get a HuggingFace token and accept the model terms
# at https://huggingface.co/pyannote/speaker-diarization
HF_TOKEN = os.getenv("HF_TOKEN", "")
if HF_TOKEN:
diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization@2.1",
use_auth_token=HF_TOKEN
)
logger.info("Speaker diarization pipeline loaded")
else:
logger.warning("HF_TOKEN not set - speaker diarization will be disabled")
except Exception as e:
logger.error(f"Error loading diarization pipeline: {e}")
def extract_audio_from_video(video_path):
"""Extract audio from video file (MP4, etc.)"""
if not MOVIEPY_AVAILABLE:
return None, "MoviePy not available for video processing"
try:
# Create temporary file for audio
temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
temp_audio_path = temp_audio.name
temp_audio.close()
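# delete=False plus an explicit close() keeps the temp path writable by MoviePy on all platforms; the file is removed manually after transcription.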
# Load video and extract audio
video = mp.VideoFileClip(video_path)
audio = video.audio
if audio is None:
return None, "No audio track found in video file"
# Export audio to temporary WAV file
audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
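# verbose=False and logger=None silence MoviePy's console progress output (MoviePy 1.x API; newer releases may not accept the verbose argument).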
# Close video to free memory
video.close()
audio.close()
logger.info(f"Audio extracted from video: {temp_audio_path}")
return temp_audio_path, "Audio extracted successfully"
except Exception as e:
logger.error(f"Error extracting audio from video: {e}")
return None, f"Error extracting audio: {str(e)}"
def perform_speaker_diarization(audio_path):
"""Perform speaker diarization on audio file"""
if not DIARIZATION_AVAILABLE or not diarization_pipeline:
return None, "Speaker diarization not available"
try:
# Perform diarization
with ProgressHook() as hook:
diarization = diarization_pipeline(audio_path, hook=hook)
# Extract speaker segments
speaker_segments = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
speaker_segments.append({
'start': turn.start,
'end': turn.end,
'speaker': speaker,
'duration': turn.end - turn.start
})
logger.info(f"Diarization completed: {len(speaker_segments)} segments found")
return speaker_segments, "Diarization completed successfully"
except Exception as e:
logger.error(f"Error in diarization: {e}")
return None, f"Diarization error: {str(e)}"
def process_audio_file(file_path):
"""Process audio file, extracting from video if needed"""
if not file_path:
return None, "No file provided"
file_extension = os.path.splitext(file_path)[1].lower()
# If it's a video file, extract audio first
if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv']:
logger.info(f"Processing video file: {file_path}")
audio_path, status = extract_audio_from_video(file_path)
if audio_path:
return audio_path, f"Video processed: {status}"
else:
return None, status
# If it's already an audio file, use it directly
elif file_extension in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']:
logger.info(f"Processing audio file: {file_path}")
return file_path, "Audio file ready for transcription"
else:
return None, f"Unsupported file format: {file_extension}"
def transcribe_audio_with_metadata(audio_file, enable_diarization=True):
"""Transcribe audio with timestamps, sentiment, and metadata"""
if not audio_file:
return None, "No audio file provided"
if not SPEECHBRAIN_AVAILABLE:
return None, "SpeechBrain not available - using demo transcription"
try:
# Process the file (extract audio if it's a video)
processed_audio_path, process_status = process_audio_file(audio_file)
if not processed_audio_path:
return None, process_status
# Perform speaker diarization if enabled
speaker_segments = None
diarization_status = ""
if enable_diarization:
speaker_segments, diarization_status = perform_speaker_diarization(processed_audio_path)
# Get the full transcription (plain text; per-sentence timestamps are estimated below)
transcript = asr_model.transcribe_file(processed_audio_path)
# Clean up temporary audio file if it was created from video
if processed_audio_path != audio_file and os.path.exists(processed_audio_path):
try:
os.unlink(processed_audio_path)
logger.info("Temporary audio file cleaned up")
except Exception as e:
logger.warning(f"Could not clean up temporary file: {e}")
# Split into sentences for analysis
sentences = re.split(r'[.!?]+', transcript)
sentences = [s.strip() for s in sentences if s.strip()]
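# Note: the LibriSpeech ASR model typically returns unpunctuated, uppercased text, so this split may yield one long "sentence".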
# Analyze each sentence
rich_transcript = []
current_time = 0
for i, sentence in enumerate(sentences):
# Estimate timestamp (rough approximation: ~2 seconds per sentence)
timestamp = current_time
# Determine speaker for this timestamp
speaker = "UNKNOWN"
if speaker_segments:
for segment in speaker_segments:
if segment['start'] <= timestamp <= segment['end']:
speaker = segment['speaker']
break
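# Speaker assignment is approximate: the estimated sentence timestamp is matched against diarization segments rather than true word-level timings.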
# Sentiment analysis
sentiment_result = sentiment_model(sentence)[0] if sentiment_model else None
sentiment = max(sentiment_result, key=lambda x: x['score']) if sentiment_result else {'label': 'neutral', 'score': 0.5}
# Emotion analysis
emotion_result = emotion_model(sentence)[0] if emotion_model else None
emotion = max(emotion_result, key=lambda x: x['score']) if emotion_result else {'label': 'neutral', 'score': 0.5}
# Word count and complexity metrics
words = sentence.split()
word_count = len(words)
avg_word_length = np.mean([len(word) for word in words]) if words else 0
# Estimate speech rate in words per minute (word_count over the assumed ~2 seconds per sentence)
speech_rate = word_count * 30  # words / 2 s * 60 s/min
rich_transcript.append({
'timestamp': timestamp,
'speaker': speaker,
'sentence': sentence,
'word_count': word_count,
'avg_word_length': round(avg_word_length, 2),
'speech_rate_wpm': round(speech_rate, 1),
'sentiment': sentiment['label'],
'sentiment_score': round(sentiment['score'], 3),
'emotion': emotion['label'],
'emotion_score': round(emotion['score'], 3)
})
current_time += 2  # advance the estimated clock by ~2 seconds per sentence
status_msg = f"Transcription completed successfully. {process_status}"
if diarization_status:
status_msg += f" {diarization_status}"
return rich_transcript, status_msg
except Exception as e:
logger.error(f"Error in transcription: {e}")
return None, f"Transcription error: {str(e)}"
def format_rich_transcript(rich_transcript):
"""Format rich transcript for display"""
if not rich_transcript:
return "No transcript data available"
formatted_lines = []
for entry in rich_transcript:
timestamp_str = f"{int(entry['timestamp']//60):02d}:{int(entry['timestamp']%60):02d}"
line = f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}"
line += f" [Words: {entry['word_count']}, Rate: {entry['speech_rate_wpm']}wpm]"
line += f" [Sentiment: {entry['sentiment']} ({entry['sentiment_score']})]"
line += f" [Emotion: {entry['emotion']} ({entry['emotion_score']})]"
formatted_lines.append(line)
return '\n'.join(formatted_lines)
def calculate_slp_metrics(rich_transcript):
"""Calculate comprehensive SLP metrics"""
if not rich_transcript:
return {}
# Basic metrics
total_sentences = len(rich_transcript)
total_words = sum(entry['word_count'] for entry in rich_transcript)
total_duration = rich_transcript[-1]['timestamp'] if rich_transcript else 0
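# Note: this duration is the last estimated sentence timestamp, not the measured audio length.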
# Speaker analysis
speakers = {}
for entry in rich_transcript:
speaker = entry['speaker']
if speaker not in speakers:
speakers[speaker] = {
'sentences': 0,
'words': 0,
'sentiments': [],
'emotions': []
}
speakers[speaker]['sentences'] += 1
speakers[speaker]['words'] += entry['word_count']
speakers[speaker]['sentiments'].append(entry['sentiment'])
speakers[speaker]['emotions'].append(entry['emotion'])
# Word-level analysis
all_words = []
for entry in rich_transcript:
words = entry['sentence'].lower().split()
all_words.extend(words)
# Word frequency distribution
word_freq = {}
for word in all_words:
word_clean = re.sub(r'[^\w\s]', '', word)
if word_clean:
word_freq[word_clean] = word_freq.get(word_clean, 0) + 1
# Vocabulary diversity (Type-Token Ratio)
unique_words = len(set(all_words))
ttr = unique_words / total_words if total_words > 0 else 0
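# Note: TTR is sensitive to sample length, so values are most comparable across transcripts of similar size.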
# Speech rate analysis
speech_rates = [entry['speech_rate_wpm'] for entry in rich_transcript]
avg_speech_rate = np.mean(speech_rates) if speech_rates else 0
# Sentiment analysis
sentiment_counts = {}
emotion_counts = {}
for entry in rich_transcript:
sentiment_counts[entry['sentiment']] = sentiment_counts.get(entry['sentiment'], 0) + 1
emotion_counts[entry['emotion']] = emotion_counts.get(entry['emotion'], 0) + 1
# Sentence complexity
sentence_lengths = [entry['word_count'] for entry in rich_transcript]
avg_sentence_length = np.mean(sentence_lengths) if sentence_lengths else 0
# Pause analysis (gaps between sentences)
pauses = []
for i in range(1, len(rich_transcript)):
pause = rich_transcript[i]['timestamp'] - rich_transcript[i-1]['timestamp']
pauses.append(pause)
avg_pause_duration = np.mean(pauses) if pauses else 0
return {
'total_sentences': total_sentences,
'total_words': total_words,
'total_duration_seconds': total_duration,
'unique_words': unique_words,
'type_token_ratio': round(ttr, 3),
'avg_sentence_length': round(avg_sentence_length, 1),
'avg_speech_rate_wpm': round(avg_speech_rate, 1),
'avg_pause_duration': round(avg_pause_duration, 1),
'sentiment_distribution': sentiment_counts,
'emotion_distribution': emotion_counts,
'word_frequency': dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]),
'speech_rate_variability': round(np.std(speech_rates), 1) if speech_rates else 0,
'speakers': speakers,
'speaker_count': len(speakers)
}
def generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes=""):
"""Generate comprehensive analysis prompt using rich transcript data"""
# Format rich transcript with timestamps and metadata
transcript_lines = []
for entry in rich_transcript:
timestamp_str = f"{int(entry['timestamp']//60):02d}:{int(entry['timestamp']%60):02d}"
transcript_lines.append(f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}")
transcript_text = '\n'.join(transcript_lines)
# Format metrics for analysis
metrics_text = f"""
TRANSCRIPT METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}
SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}
SPEAKER ANALYSIS:"""
for speaker, data in metrics['speakers'].items():
metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"
metrics_text += f"\n\nMOST FREQUENT WORDS: {list(metrics['word_frequency'].keys())[:10]}"
notes_section = f"\nSLP CLINICAL NOTES:\n{slp_notes}" if slp_notes else ""
prompt = f"""
You are a speech-language pathologist conducting a comprehensive analysis of a speech transcript with rich temporal and affective metadata.
PATIENT: {age}-year-old {gender}
{metrics_text}
TRANSCRIPT WITH TIMESTAMPS AND METADATA:
{transcript_text}{notes_section}
Please provide a comprehensive analysis including:
1. TEMPORAL SPEECH PATTERNS:
- Analyze speech rate changes over time using timestamps
- Identify patterns in pause duration and frequency
- Assess temporal consistency in speech production
- Note any significant changes in speech patterns throughout the session
2. AFFECTIVE AND EMOTIONAL ANALYSIS:
- Analyze sentiment patterns throughout the transcript using timestamp data
- Identify emotional shifts and their potential causes
- Assess emotional regulation and expression
- Note any correlations between emotional state and speech characteristics
3. SPEAKER-SPECIFIC ANALYSIS (if multiple speakers):
- Compare speech patterns between speakers
- Analyze turn-taking patterns and timing
- Assess interaction dynamics
- Note speaker-specific emotional and sentiment patterns
4. SPEECH FLUENCY AND RATE ANALYSIS:
- Analyze speech rate variability using the provided metrics
- Identify periods of fluent vs. dysfluent speech
- Assess the impact of emotional state on speech rate
- Note any temporal patterns in speech rate changes
5. LANGUAGE COMPLEXITY ASSESSMENT:
- Analyze vocabulary diversity using Type-Token Ratio
- Assess sentence complexity and variety
- Identify patterns in word frequency and usage
- Note any temporal changes in language complexity
6. COMPLEX SENTENCE ANALYSIS:
- Count and analyze use of coordinating conjunctions (and, but, or, so, yet, for, nor)
- Count and analyze use of subordinating conjunctions (because, although, while, since, if, when, where, that, which, who, whom, whose)
- Identify compound, complex, and compound-complex sentences
- Assess sentence variety and complexity level for age
7. FIGURATIVE LANGUAGE ANALYSIS:
- Identify and count similes (comparisons using "like" or "as")
- Identify and count metaphors (direct comparisons without "like" or "as")
- Identify and count idioms (common expressions with non-literal meanings)
- Assess figurative language comprehension and use for age
8. CLINICAL IMPLICATIONS:
- Specific intervention targets based on temporal patterns
- Recommendations for emotional regulation if needed
- Suggestions for improving speech rate consistency
- Strategies for enhancing language complexity
- Age-appropriate development recommendations
9. COMPREHENSIVE SUMMARY:
- Overall communication profile with temporal considerations
- Assessment of emotional and affective communication
- Developmental appropriateness considering age
- Prognosis and treatment priorities
Use the temporal data, sentiment scores, and emotional labels to provide insights that would not be possible with a simple transcript. Reference specific timestamps and emotional states when making observations.
"""
return prompt
def analyze_rich_transcript_with_llm(rich_transcript, age, gender, slp_notes=""):
"""Analyze rich transcript using LLM with comprehensive metadata"""
if not rich_transcript:
return "No transcript data available for analysis."
# Calculate SLP metrics
metrics = calculate_slp_metrics(rich_transcript)
# Generate comprehensive analysis prompt
prompt = generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes)
# Get analysis from Claude API
if ANTHROPIC_API_KEY:
result = call_claude_api(prompt)
else:
result = generate_demo_analysis(rich_transcript, metrics)
return result
def call_claude_api(prompt):
"""Call Claude API directly"""
if not ANTHROPIC_API_KEY:
return "❌ Claude API key not configured. Please set ANTHROPIC_API_KEY environment variable."
try:
headers = {
"Content-Type": "application/json",
"x-api-key": ANTHROPIC_API_KEY,
"anthropic-version": "2023-06-01"
}
data = {
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 4096,
"messages": [
{
"role": "user",
"content": prompt
}
]
}
response = requests.post(
"https://api.anthropic.com/v1/messages",
headers=headers,
json=data,
timeout=60
)
if response.status_code == 200:
response_json = response.json()
return response_json['content'][0]['text']
else:
logger.error(f"Claude API error: {response.status_code} - {response.text}")
return f"❌ Claude API Error: {response.status_code}"
except Exception as e:
logger.error(f"Error calling Claude API: {str(e)}")
return f"❌ Error: {str(e)}"
def generate_demo_analysis(rich_transcript, metrics):
"""Generate demo analysis when API is not available"""
return f"""## Comprehensive SLP Analysis with Temporal and Affective Data
### TEMPORAL SPEECH PATTERNS
**Speech Rate Analysis**: {metrics['avg_speech_rate_wpm']} words per minute (variability: {metrics['speech_rate_variability']} wpm)
- Speech rate appears {'within normal limits' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'below typical range' if metrics['avg_speech_rate_wpm'] < 120 else 'above typical range'}
- Variability suggests {'consistent' if metrics['speech_rate_variability'] < 20 else 'variable'} speech patterns
**Pause Analysis**: Average pause duration of {metrics['avg_pause_duration']:.1f} seconds
- {'Appropriate' if 0.5 <= metrics['avg_pause_duration'] <= 2.0 else 'Short' if metrics['avg_pause_duration'] < 0.5 else 'Long'} pauses between utterances
### AFFECTIVE AND EMOTIONAL ANALYSIS
**Sentiment Distribution**: {metrics['sentiment_distribution']}
**Emotion Distribution**: {metrics['emotion_distribution']}
The emotional patterns suggest {'positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'neutral' if 'neutral' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['neutral'] > 2 else 'mixed'} emotional expression throughout the session.
### LANGUAGE COMPLEXITY
**Vocabulary Diversity**: Type-Token Ratio of {metrics['type_token_ratio']}
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited' if metrics['type_token_ratio'] < 0.3 else 'Moderate'} vocabulary diversity
**Sentence Structure**: Average {metrics['avg_sentence_length']} words per sentence
- Sentence length appears {'age-appropriate' if 5 <= metrics['avg_sentence_length'] <= 12 else 'below age expectations' if metrics['avg_sentence_length'] < 5 else 'above age expectations'}
**Most Frequent Words**: {', '.join(list(metrics['word_frequency'].keys())[:5])}
### SPEAKER ANALYSIS
**Number of Speakers**: {metrics['speaker_count']}
{chr(10).join([f"• {speaker}: {data['sentences']} sentences, {data['words']} words" for speaker, data in metrics['speakers'].items()])}
### CLINICAL IMPLICATIONS
Based on the temporal and affective analysis, this patient shows:
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited'} vocabulary diversity
- {'Appropriate' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'Atypical'} speech rate
- {'Consistent' if metrics['speech_rate_variability'] < 20 else 'Variable'} speech patterns
- {'Positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'Neutral'} emotional expression
### RECOMMENDATIONS
1. Focus on vocabulary expansion if TTR < 0.4
2. Address speech rate if outside normal range
3. Work on sentence complexity if below age expectations
4. Consider emotional regulation strategies based on sentiment patterns
5. Monitor temporal patterns in speech rate and fluency"""
def create_transcription_interface():
"""Create the transcription-focused Gradio interface"""
with gr.Blocks(title="Advanced Transcription Tool", theme=gr.themes.Soft()) as app:
gr.Markdown("# 🎤 Advanced Transcription Tool")
gr.Markdown("Transcribe audio/video with speaker diarization, timestamps, sentiment analysis, and comprehensive LLM analysis")
with gr.Tabs():
# Audio/Video Upload & Transcription Tab
with gr.Tab("🎤 Audio/Video Transcription"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### File Upload")
gr.Markdown("**Supported formats:** MP4, AVI, MOV, MKV, WMV, FLV, WAV, MP3, M4A, FLAC, OGG")
file_input = gr.File(
label="Upload Audio or Video File",
file_types=["audio", "video"]
)
enable_diarization = gr.Checkbox(
label="Enable Speaker Diarization",
value=True,
info="Identify different speakers in the audio"
)
transcribe_btn = gr.Button(
"🎤 Transcribe File",
variant="primary",
size="lg"
)
transcription_status = gr.Markdown("")
with gr.Column(scale=2):
gr.Markdown("### Rich Transcript with Metadata")
rich_transcript_display = gr.Textbox(
label="Transcription with Speakers, Timestamps, Sentiment & Emotion",
lines=15,
max_lines=20
)
# Analysis Tab
with gr.Tab("📊 LLM Analysis"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Patient Information")
with gr.Row():
age = gr.Number(label="Age", value=8, minimum=1, maximum=120)
gender = gr.Radio(["male", "female", "other"], label="Gender", value="male")
slp_notes = gr.Textbox(
label="SLP Clinical Notes (Optional)",
placeholder="Enter additional clinical observations...",
lines=3
)
analyze_btn = gr.Button(
"🔍 Analyze with LLM",
variant="primary",
size="lg"
)
with gr.Column(scale=2):
gr.Markdown("### Comprehensive LLM Analysis")
analysis_output = gr.Textbox(
label="LLM Analysis Report",
lines=25,
max_lines=30
)
# Metrics Tab
with gr.Tab("📈 Speech Metrics"):
with gr.Row():
with gr.Column():
gr.Markdown("### Quantitative Speech Metrics")
metrics_display = gr.Textbox(
label="SLP Metrics",
lines=15,
max_lines=20
)
with gr.Column():
gr.Markdown("### Word Frequency Analysis")
word_freq_display = gr.Dataframe(
headers=["Word", "Frequency"],
label="Most Frequent Words",
interactive=False
)
# Raw Data Tab
with gr.Tab("📊 Raw Data"):
with gr.Row():
with gr.Column():
gr.Markdown("### JSON Data")
json_display = gr.Textbox(
label="Raw JSON Data",
lines=20,
max_lines=25
)
# Event handlers
def on_transcribe(file, diarization_enabled):
"""Handle file transcription"""
if not file:
return "", "", "", "Please upload a file first."
rich_transcript, status = transcribe_audio_with_metadata(file.name, diarization_enabled)
if rich_transcript:
formatted = format_rich_transcript(rich_transcript)
metrics = calculate_slp_metrics(rich_transcript)
# Format metrics for display
metrics_text = f"""SPEECH METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}
SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}
SPEAKER ANALYSIS:"""
for speaker, data in metrics['speakers'].items():
metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"
# Create word frequency dataframe
word_freq_data = [[word, freq] for word, freq in list(metrics['word_frequency'].items())[:20]]
# JSON data for the Raw Data tab
json_data = json.dumps(rich_transcript, indent=2)
return formatted, metrics_text, word_freq_data, json_data, status
else:
return "", "", [], status
def on_analyze(rich_transcript_text, age_val, gender_val, notes):
"""Handle LLM analysis"""
if not rich_transcript_text or rich_transcript_text == "No transcript data available":
return "Please transcribe audio first."
# Convert formatted text back to rich transcript structure
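# Note: only timestamp, speaker, and sentence are recovered from the display text; the sentiment, emotion, and speech-rate fields below are neutral placeholders.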
lines = rich_transcript_text.split('\n')
rich_transcript = []
for i, line in enumerate(lines):
if line.strip():
# Extract data from the formatted line
timestamp_match = re.search(r'\[(\d{2}:\d{2})\]', line)
speaker_match = re.search(r'\*(\w+):', line)
sentence_match = re.search(r'\*\w+:\s*(.+?)(?=\s*\[|$)', line)
if timestamp_match and speaker_match and sentence_match:
timestamp_str = timestamp_match.group(1)
minutes, seconds = map(int, timestamp_str.split(':'))
timestamp = minutes * 60 + seconds
speaker = speaker_match.group(1)
sentence = sentence_match.group(1).strip()
rich_transcript.append({
'timestamp': timestamp,
'speaker': speaker,
'sentence': sentence,
'word_count': len(sentence.split()),
'avg_word_length': np.mean([len(word) for word in sentence.split()]) if sentence.split() else 0,
'speech_rate_wpm': 120.0,
'sentiment': 'neutral',
'sentiment_score': 0.5,
'emotion': 'neutral',
'emotion_score': 0.5
})
return analyze_rich_transcript_with_llm(rich_transcript, age_val, gender_val, notes)
# Connect event handlers
transcribe_btn.click(
on_transcribe,
inputs=[file_input, enable_diarization],
outputs=[rich_transcript_display, metrics_display, word_freq_display, json_display, transcription_status]
)
analyze_btn.click(
on_analyze,
inputs=[rich_transcript_display, age, gender, slp_notes],
outputs=[analysis_output]
)
return app
if __name__ == "__main__":
print("🚀 Starting Advanced Transcription Tool...")
if not MOVIEPY_AVAILABLE:
print("⚠️ MoviePy not available - video processing will be limited")
print(" Install with: pip install moviepy")
else:
print("✅ MoviePy available for video processing")
if not DIARIZATION_AVAILABLE:
print("⚠️ Pyannote.audio not available - speaker diarization will be disabled")
print(" Install with: pip install pyannote.audio")
else:
print("✅ Pyannote.audio available for speaker diarization")
if not os.getenv("HF_TOKEN"):
print("⚠️ HF_TOKEN not set - set it to enable speaker diarization")
print(" Get token from: https://huggingface.co/settings/tokens")
print(" Accept model terms at: https://huggingface.co/pyannote/speaker-diarization")
if not SPEECHBRAIN_AVAILABLE:
print("⚠️ SpeechBrain not available - audio transcription will use demo mode")
print(" Install with: pip install speechbrain transformers torch")
else:
print("✅ SpeechBrain and HuggingFace models loaded")
app = create_transcription_interface()
app.launch(show_api=False)