import gradio as gr
import json
import os
import logging
import re
import numpy as np
import pandas as pd
from datetime import datetime
import time
import tempfile
from typing import Dict, List, Tuple, Optional
import requests

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Claude API key (optional - enables LLM-based analysis; referenced by call_claude_api below)
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Try to import video processing libraries
try:
    import moviepy.editor as mp
    MOVIEPY_AVAILABLE = True
    logger.info("MoviePy available for video processing")
except ImportError as e:
    logger.warning(f"MoviePy not available: {e}")
    MOVIEPY_AVAILABLE = False

# Try to import speaker diarization
try:
    from pyannote.audio import Pipeline
    from pyannote.audio.pipelines.utils.hook import ProgressHook
    DIARIZATION_AVAILABLE = True
    logger.info("Pyannote.audio available for speaker diarization")
except ImportError as e:
    logger.warning(f"Pyannote.audio not available: {e}")
    DIARIZATION_AVAILABLE = False

# Try to import SpeechBrain and HuggingFace components
try:
    from speechbrain.pretrained import EncoderDecoderASR, VAD, EncoderClassifier
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import torch
    SPEECHBRAIN_AVAILABLE = True
    HUGGINGFACE_AVAILABLE = True
    logger.info("SpeechBrain and HuggingFace models available")
except ImportError as e:
    logger.warning(f"SpeechBrain/HuggingFace not available: {e}")
    SPEECHBRAIN_AVAILABLE = False
    HUGGINGFACE_AVAILABLE = False
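# Dependency note (assumed package names; this script does not pin versions):
# the optional stacks above correspond to `moviepy`, `pyannote.audio`,
# `speechbrain`, `transformers`, and `torch`; the core requirements are
# `gradio`, `requests`, `numpy`, and `pandas`.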
# Initialize models if available
asr_model = None
vad_model = None
sentiment_model = None
emotion_model = None
diarization_pipeline = None

if SPEECHBRAIN_AVAILABLE and HUGGINGFACE_AVAILABLE:
    try:
        # Speech-to-text model
        asr_model = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-crdnn-rnnlm-librispeech",
            savedir="pretrained_models/asr-crdnn-rnnlm-librispeech"
        )
        # Voice Activity Detection
        vad_model = VAD.from_hparams(
            source="speechbrain/vad-crdnn-libriparty",
            savedir="pretrained_models/vad-crdnn-libriparty"
        )
        # Sentiment analysis
        sentiment_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            top_k=None
        )
        # Emotion analysis
        emotion_model = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None
        )
        logger.info("All models loaded successfully")
    except Exception as e:
        logger.error(f"Error loading models: {e}")
        SPEECHBRAIN_AVAILABLE = False
        HUGGINGFACE_AVAILABLE = False

# Initialize diarization pipeline
if DIARIZATION_AVAILABLE:
    try:
        # Note: You'll need to get a HuggingFace token and accept the model terms
        # at https://huggingface.co/pyannote/speaker-diarization
        HF_TOKEN = os.getenv("HF_TOKEN", "")
        if HF_TOKEN:
            diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization@2.1",
                use_auth_token=HF_TOKEN
            )
            logger.info("Speaker diarization pipeline loaded")
        else:
            logger.warning("HF_TOKEN not set - speaker diarization will be disabled")
    except Exception as e:
        logger.error(f"Error loading diarization pipeline: {e}")
def extract_audio_from_video(video_path):
    """Extract audio from a video file (MP4, etc.)."""
    if not MOVIEPY_AVAILABLE:
        return None, "MoviePy not available for video processing"
    try:
        # Create temporary file for audio
        temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_audio_path = temp_audio.name
        temp_audio.close()

        # Load video and extract audio
        video = mp.VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            video.close()
            return None, "No audio track found in video file"

        # Export audio to temporary WAV file
        audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

        # Close clips to free memory
        audio.close()
        video.close()

        logger.info(f"Audio extracted from video: {temp_audio_path}")
        return temp_audio_path, "Audio extracted successfully"
    except Exception as e:
        logger.error(f"Error extracting audio from video: {e}")
        return None, f"Error extracting audio: {str(e)}"
def perform_speaker_diarization(audio_path):
    """Perform speaker diarization on an audio file."""
    if not DIARIZATION_AVAILABLE or not diarization_pipeline:
        return None, "Speaker diarization not available"
    try:
        # Perform diarization
        with ProgressHook() as hook:
            diarization = diarization_pipeline(audio_path, hook=hook)

        # Extract speaker segments
        speaker_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speaker_segments.append({
                'start': turn.start,
                'end': turn.end,
                'speaker': speaker,
                'duration': turn.end - turn.start
            })

        logger.info(f"Diarization completed: {len(speaker_segments)} segments found")
        return speaker_segments, "Diarization completed successfully"
    except Exception as e:
        logger.error(f"Error in diarization: {e}")
        return None, f"Diarization error: {str(e)}"
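# Illustrative shape of a speaker_segments entry (values are made up):
# {'start': 0.0, 'end': 3.2, 'speaker': 'SPEAKER_00', 'duration': 3.2}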
def process_audio_file(file_path):
    """Process an audio file, extracting audio from video if needed."""
    if not file_path:
        return None, "No file provided"

    file_extension = os.path.splitext(file_path)[1].lower()

    # If it's a video file, extract the audio track first
    if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv']:
        logger.info(f"Processing video file: {file_path}")
        audio_path, status = extract_audio_from_video(file_path)
        if audio_path:
            return audio_path, f"Video processed: {status}"
        else:
            return None, status
    # If it's already an audio file, use it directly
    elif file_extension in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']:
        logger.info(f"Processing audio file: {file_path}")
        return file_path, "Audio file ready for transcription"
    else:
        return None, f"Unsupported file format: {file_extension}"
def transcribe_audio_with_metadata(audio_file, enable_diarization=True):
    """Transcribe audio with timestamps, sentiment, and metadata."""
    if not audio_file:
        return None, "No audio file provided"
    if not SPEECHBRAIN_AVAILABLE:
        return None, "SpeechBrain not available - transcription cannot be performed"
    try:
        # Process the file (extract audio if it's a video)
        processed_audio_path, process_status = process_audio_file(audio_file)
        if not processed_audio_path:
            return None, process_status

        # Perform speaker diarization if enabled
        speaker_segments = None
        diarization_status = ""
        if enable_diarization:
            speaker_segments, diarization_status = perform_speaker_diarization(processed_audio_path)

        # Get transcription (timestamps are estimated below, not returned by the ASR model)
        transcript = asr_model.transcribe_file(processed_audio_path)

        # Clean up temporary audio file if it was created from video
        if processed_audio_path != audio_file and os.path.exists(processed_audio_path):
            try:
                os.unlink(processed_audio_path)
                logger.info("Temporary audio file cleaned up")
            except Exception as e:
                logger.warning(f"Could not clean up temporary file: {e}")

        # Split into sentences for analysis
        sentences = re.split(r'[.!?]+', transcript)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Analyze each sentence
        rich_transcript = []
        current_time = 0
        for i, sentence in enumerate(sentences):
            # Estimate timestamp (rough approximation)
            timestamp = current_time + (i * 2)  # Assume ~2 seconds per sentence

            # Determine speaker for this timestamp
            speaker = "UNKNOWN"
            if speaker_segments:
                for segment in speaker_segments:
                    if segment['start'] <= timestamp <= segment['end']:
                        speaker = segment['speaker']
                        break

            # Sentiment analysis
            sentiment_result = sentiment_model(sentence)[0] if sentiment_model else None
            sentiment = max(sentiment_result, key=lambda x: x['score']) if sentiment_result else {'label': 'neutral', 'score': 0.5}

            # Emotion analysis
            emotion_result = emotion_model(sentence)[0] if emotion_model else None
            emotion = max(emotion_result, key=lambda x: x['score']) if emotion_result else {'label': 'neutral', 'score': 0.5}

            # Word count and complexity metrics
            words = sentence.split()
            word_count = len(words)
            avg_word_length = np.mean([len(word) for word in words]) if words else 0

            # Estimate speech rate in words per minute (assumes ~2 seconds per sentence)
            speech_rate = word_count * 30

            rich_transcript.append({
                'timestamp': timestamp,
                'speaker': speaker,
                'sentence': sentence,
                'word_count': word_count,
                'avg_word_length': round(avg_word_length, 2),
                'speech_rate_wpm': round(speech_rate, 1),
                'sentiment': sentiment['label'],
                'sentiment_score': round(sentiment['score'], 3),
                'emotion': emotion['label'],
                'emotion_score': round(emotion['score'], 3)
            })
            current_time = timestamp

        status_msg = f"Transcription completed successfully. {process_status}"
        if diarization_status:
            status_msg += f" {diarization_status}"
        return rich_transcript, status_msg
    except Exception as e:
        logger.error(f"Error in transcription: {e}")
        return None, f"Transcription error: {str(e)}"
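# Illustrative shape of a rich_transcript entry (values are made up):
# {'timestamp': 4, 'speaker': 'SPEAKER_00', 'sentence': 'I went to the park',
#  'word_count': 5, 'avg_word_length': 2.8, 'speech_rate_wpm': 150.0,
#  'sentiment': 'positive', 'sentiment_score': 0.91,
#  'emotion': 'joy', 'emotion_score': 0.78}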
def format_rich_transcript(rich_transcript):
    """Format a rich transcript for display."""
    if not rich_transcript:
        return "No transcript data available"
    formatted_lines = []
    for entry in rich_transcript:
        timestamp_str = f"{int(entry['timestamp'] // 60):02d}:{int(entry['timestamp'] % 60):02d}"
        line = f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}"
        line += f" [Words: {entry['word_count']}, Rate: {entry['speech_rate_wpm']}wpm]"
        line += f" [Sentiment: {entry['sentiment']} ({entry['sentiment_score']})]"
        line += f" [Emotion: {entry['emotion']} ({entry['emotion_score']})]"
        formatted_lines.append(line)
    return '\n'.join(formatted_lines)
def calculate_slp_metrics(rich_transcript):
    """Calculate comprehensive SLP metrics."""
    if not rich_transcript:
        return {}

    # Basic metrics
    total_sentences = len(rich_transcript)
    total_words = sum(entry['word_count'] for entry in rich_transcript)
    total_duration = rich_transcript[-1]['timestamp'] if rich_transcript else 0

    # Speaker analysis
    speakers = {}
    for entry in rich_transcript:
        speaker = entry['speaker']
        if speaker not in speakers:
            speakers[speaker] = {
                'sentences': 0,
                'words': 0,
                'sentiments': [],
                'emotions': []
            }
        speakers[speaker]['sentences'] += 1
        speakers[speaker]['words'] += entry['word_count']
        speakers[speaker]['sentiments'].append(entry['sentiment'])
        speakers[speaker]['emotions'].append(entry['emotion'])

    # Word-level analysis
    all_words = []
    for entry in rich_transcript:
        words = entry['sentence'].lower().split()
        all_words.extend(words)

    # Word frequency distribution
    word_freq = {}
    for word in all_words:
        word_clean = re.sub(r'[^\w\s]', '', word)
        if word_clean:
            word_freq[word_clean] = word_freq.get(word_clean, 0) + 1

    # Vocabulary diversity (Type-Token Ratio)
    unique_words = len(set(all_words))
    ttr = unique_words / total_words if total_words > 0 else 0

    # Speech rate analysis
    speech_rates = [entry['speech_rate_wpm'] for entry in rich_transcript]
    avg_speech_rate = np.mean(speech_rates) if speech_rates else 0

    # Sentiment and emotion distributions
    sentiment_counts = {}
    emotion_counts = {}
    for entry in rich_transcript:
        sentiment_counts[entry['sentiment']] = sentiment_counts.get(entry['sentiment'], 0) + 1
        emotion_counts[entry['emotion']] = emotion_counts.get(entry['emotion'], 0) + 1

    # Sentence complexity
    sentence_lengths = [entry['word_count'] for entry in rich_transcript]
    avg_sentence_length = np.mean(sentence_lengths) if sentence_lengths else 0

    # Pause analysis (gaps between sentences)
    pauses = []
    for i in range(1, len(rich_transcript)):
        pause = rich_transcript[i]['timestamp'] - rich_transcript[i - 1]['timestamp']
        pauses.append(pause)
    avg_pause_duration = np.mean(pauses) if pauses else 0

    return {
        'total_sentences': total_sentences,
        'total_words': total_words,
        'total_duration_seconds': total_duration,
        'unique_words': unique_words,
        'type_token_ratio': round(ttr, 3),
        'avg_sentence_length': round(avg_sentence_length, 1),
        'avg_speech_rate_wpm': round(avg_speech_rate, 1),
        'avg_pause_duration': round(avg_pause_duration, 1),
        'sentiment_distribution': sentiment_counts,
        'emotion_distribution': emotion_counts,
        'word_frequency': dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]),
        'speech_rate_variability': round(np.std(speech_rates), 1) if speech_rates else 0,
        'speakers': speakers,
        'speaker_count': len(speakers)
    }
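# Note: the type-token ratio is length-sensitive (longer samples tend toward
# lower ratios), so compare TTR values only across samples of similar size.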
def generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes=""):
    """Generate a comprehensive analysis prompt using rich transcript data."""
    # Format rich transcript with timestamps and metadata
    transcript_lines = []
    for entry in rich_transcript:
        timestamp_str = f"{int(entry['timestamp'] // 60):02d}:{int(entry['timestamp'] % 60):02d}"
        transcript_lines.append(f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}")
    transcript_text = '\n'.join(transcript_lines)

    # Format metrics for analysis
    metrics_text = f"""
TRANSCRIPT METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}

SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}

SPEAKER ANALYSIS:"""
    for speaker, data in metrics['speakers'].items():
        metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"
    metrics_text += f"\n\nMOST FREQUENT WORDS: {list(metrics['word_frequency'].keys())[:10]}"

    notes_section = f"\nSLP CLINICAL NOTES:\n{slp_notes}" if slp_notes else ""

    prompt = f"""
You are a speech-language pathologist conducting a comprehensive analysis of a speech transcript with rich temporal and affective metadata.

PATIENT: {age}-year-old {gender}
{metrics_text}

TRANSCRIPT WITH TIMESTAMPS AND METADATA:
{transcript_text}{notes_section}

Please provide a comprehensive analysis including:

1. TEMPORAL SPEECH PATTERNS:
- Analyze speech rate changes over time using timestamps
- Identify patterns in pause duration and frequency
- Assess temporal consistency in speech production
- Note any significant changes in speech patterns throughout the session

2. AFFECTIVE AND EMOTIONAL ANALYSIS:
- Analyze sentiment patterns throughout the transcript using timestamp data
- Identify emotional shifts and their potential causes
- Assess emotional regulation and expression
- Note any correlations between emotional state and speech characteristics

3. SPEAKER-SPECIFIC ANALYSIS (if multiple speakers):
- Compare speech patterns between speakers
- Analyze turn-taking patterns and timing
- Assess interaction dynamics
- Note speaker-specific emotional and sentiment patterns

4. SPEECH FLUENCY AND RATE ANALYSIS:
- Analyze speech rate variability using the provided metrics
- Identify periods of fluent vs. dysfluent speech
- Assess the impact of emotional state on speech rate
- Note any temporal patterns in speech rate changes

5. LANGUAGE COMPLEXITY ASSESSMENT:
- Analyze vocabulary diversity using Type-Token Ratio
- Assess sentence complexity and variety
- Identify patterns in word frequency and usage
- Note any temporal changes in language complexity

6. COMPLEX SENTENCE ANALYSIS:
- Count and analyze use of coordinating conjunctions (and, but, or, so, yet, for, nor)
- Count and analyze use of subordinating conjunctions (because, although, while, since, if, when, where, that, which, who, whom, whose)
- Identify compound, complex, and compound-complex sentences
- Assess sentence variety and complexity level for age

7. FIGURATIVE LANGUAGE ANALYSIS:
- Identify and count similes (comparisons using "like" or "as")
- Identify and count metaphors (direct comparisons without "like" or "as")
- Identify and count idioms (common expressions with non-literal meanings)
- Assess figurative language comprehension and use for age

8. CLINICAL IMPLICATIONS:
- Specific intervention targets based on temporal patterns
- Recommendations for emotional regulation if needed
- Suggestions for improving speech rate consistency
- Strategies for enhancing language complexity
- Age-appropriate development recommendations

9. COMPREHENSIVE SUMMARY:
- Overall communication profile with temporal considerations
- Assessment of emotional and affective communication
- Developmental appropriateness considering age
- Prognosis and treatment priorities

Use the temporal data, sentiment scores, and emotional labels to provide insights that would not be possible with a simple transcript. Reference specific timestamps and emotional states when making observations.
"""
    return prompt
def analyze_rich_transcript_with_llm(rich_transcript, age, gender, slp_notes=""):
    """Analyze a rich transcript using an LLM with comprehensive metadata."""
    if not rich_transcript:
        return "No transcript data available for analysis."

    # Calculate SLP metrics
    metrics = calculate_slp_metrics(rich_transcript)

    # Generate comprehensive analysis prompt
    prompt = generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes)

    # Get analysis from the Claude API, or fall back to the built-in demo analysis
    if ANTHROPIC_API_KEY:
        result = call_claude_api(prompt)
    else:
        result = generate_demo_analysis(rich_transcript, metrics)
    return result
def call_claude_api(prompt):
    """Call the Claude API directly."""
    if not ANTHROPIC_API_KEY:
        return "❌ Claude API key not configured. Please set ANTHROPIC_API_KEY environment variable."
    try:
        headers = {
            "Content-Type": "application/json",
            "x-api-key": ANTHROPIC_API_KEY,
            "anthropic-version": "2023-06-01"
        }
        data = {
            "model": "claude-3-5-sonnet-20241022",
            "max_tokens": 4096,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        response = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers=headers,
            json=data,
            timeout=60
        )
        if response.status_code == 200:
            response_json = response.json()
            return response_json['content'][0]['text']
        else:
            logger.error(f"Claude API error: {response.status_code} - {response.text}")
            return f"❌ Claude API Error: {response.status_code}"
    except Exception as e:
        logger.error(f"Error calling Claude API: {str(e)}")
        return f"❌ Error: {str(e)}"
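# For reference, a successful Messages API response is roughly shaped like
# (abbreviated): {"content": [{"type": "text", "text": "..."}], "model": "...", "usage": {...}},
# which is why the text is read from response_json['content'][0]['text'] above.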
def generate_demo_analysis(rich_transcript, metrics):
    """Generate a demo analysis when the API is not available."""
    return f"""## Comprehensive SLP Analysis with Temporal and Affective Data

### TEMPORAL SPEECH PATTERNS
**Speech Rate Analysis**: {metrics['avg_speech_rate_wpm']} words per minute (variability: {metrics['speech_rate_variability']} wpm)
- Speech rate appears {'within normal limits' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'below typical range' if metrics['avg_speech_rate_wpm'] < 120 else 'above typical range'}
- Variability suggests {'consistent' if metrics['speech_rate_variability'] < 20 else 'variable'} speech patterns

**Pause Analysis**: Average pause duration of {metrics['avg_pause_duration']:.1f} seconds
- {'Appropriate' if 0.5 <= metrics['avg_pause_duration'] <= 2.0 else 'Short' if metrics['avg_pause_duration'] < 0.5 else 'Long'} pauses between utterances

### AFFECTIVE AND EMOTIONAL ANALYSIS
**Sentiment Distribution**: {metrics['sentiment_distribution']}
**Emotion Distribution**: {metrics['emotion_distribution']}
The emotional patterns suggest {'positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'neutral' if 'neutral' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['neutral'] > 2 else 'mixed'} emotional expression throughout the session.

### LANGUAGE COMPLEXITY
**Vocabulary Diversity**: Type-Token Ratio of {metrics['type_token_ratio']}
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited' if metrics['type_token_ratio'] < 0.3 else 'Moderate'} vocabulary diversity

**Sentence Structure**: Average {metrics['avg_sentence_length']} words per sentence
- Sentence length appears {'age-appropriate' if 5 <= metrics['avg_sentence_length'] <= 12 else 'below age expectations' if metrics['avg_sentence_length'] < 5 else 'above age expectations'}

**Most Frequent Words**: {', '.join(list(metrics['word_frequency'].keys())[:5])}

### SPEAKER ANALYSIS
**Number of Speakers**: {metrics['speaker_count']}
{chr(10).join([f"• {speaker}: {data['sentences']} sentences, {data['words']} words" for speaker, data in metrics['speakers'].items()])}

### CLINICAL IMPLICATIONS
Based on the temporal and affective analysis, this patient shows:
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited'} vocabulary diversity
- {'Appropriate' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'Atypical'} speech rate
- {'Consistent' if metrics['speech_rate_variability'] < 20 else 'Variable'} speech patterns
- {'Positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'Neutral'} emotional expression

### RECOMMENDATIONS
1. Focus on vocabulary expansion if TTR < 0.4
2. Address speech rate if outside normal range
3. Work on sentence complexity if below age expectations
4. Consider emotional regulation strategies based on sentiment patterns
5. Monitor temporal patterns in speech rate and fluency"""
def create_transcription_interface():
    """Create the transcription-focused Gradio interface."""
    with gr.Blocks(title="Advanced Transcription Tool", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎤 Advanced Transcription Tool")
        gr.Markdown("Transcribe audio/video with speaker diarization, timestamps, sentiment analysis, and comprehensive LLM analysis")

        with gr.Tabs():
            # Audio/Video Upload & Transcription Tab
            with gr.Tab("🎤 Audio/Video Transcription"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### File Upload")
                        gr.Markdown("**Supported formats:** MP4, AVI, MOV, MKV, WMV, FLV, WAV, MP3, M4A, FLAC, OGG")
                        file_input = gr.File(
                            label="Upload Audio or Video File",
                            file_types=["audio", "video"]
                        )
                        enable_diarization = gr.Checkbox(
                            label="Enable Speaker Diarization",
                            value=True,
                            info="Identify different speakers in the audio"
                        )
                        transcribe_btn = gr.Button(
                            "🎤 Transcribe File",
                            variant="primary",
                            size="lg"
                        )
                        transcription_status = gr.Markdown("")
                    with gr.Column(scale=2):
                        gr.Markdown("### Rich Transcript with Metadata")
                        rich_transcript_display = gr.Textbox(
                            label="Transcription with Speakers, Timestamps, Sentiment & Emotion",
                            lines=15,
                            max_lines=20
                        )

            # Analysis Tab
            with gr.Tab("📊 LLM Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Patient Information")
                        with gr.Row():
                            age = gr.Number(label="Age", value=8, minimum=1, maximum=120)
                            gender = gr.Radio(["male", "female", "other"], label="Gender", value="male")
                        slp_notes = gr.Textbox(
                            label="SLP Clinical Notes (Optional)",
                            placeholder="Enter additional clinical observations...",
                            lines=3
                        )
                        analyze_btn = gr.Button(
                            "🔍 Analyze with LLM",
                            variant="primary",
                            size="lg"
                        )
                    with gr.Column(scale=2):
                        gr.Markdown("### Comprehensive LLM Analysis")
                        analysis_output = gr.Textbox(
                            label="LLM Analysis Report",
                            lines=25,
                            max_lines=30
                        )

            # Metrics Tab
            with gr.Tab("📈 Speech Metrics"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Quantitative Speech Metrics")
                        metrics_display = gr.Textbox(
                            label="SLP Metrics",
                            lines=15,
                            max_lines=20
                        )
                    with gr.Column():
                        gr.Markdown("### Word Frequency Analysis")
                        word_freq_display = gr.Dataframe(
                            headers=["Word", "Frequency"],
                            label="Most Frequent Words",
                            interactive=False
                        )

            # Raw Data Tab
            with gr.Tab("📊 Raw Data"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### JSON Data")
                        json_display = gr.Textbox(
                            label="Raw JSON Data",
                            lines=20,
                            max_lines=25
                        )
        # Event handlers
        def on_transcribe(file, diarization_enabled):
            """Handle file transcription."""
            if not file:
                return "", "", [], "Please upload a file first."

            rich_transcript, status = transcribe_audio_with_metadata(file.name, diarization_enabled)
            if rich_transcript:
                formatted = format_rich_transcript(rich_transcript)
                metrics = calculate_slp_metrics(rich_transcript)

                # Format metrics for display
                metrics_text = f"""SPEECH METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}

SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}
EMOTION DISTRIBUTION: {metrics['emotion_distribution']}

SPEAKER ANALYSIS:"""
                for speaker, data in metrics['speakers'].items():
                    metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"

                # Create word frequency dataframe
                word_freq_data = [[word, freq] for word, freq in list(metrics['word_frequency'].items())[:20]]

                # JSON data (prepared here but not currently wired to the Raw Data tab's json_display)
                json_data = json.dumps(rich_transcript, indent=2)

                return formatted, metrics_text, word_freq_data, status
            else:
                return "", "", [], status

        def on_analyze(rich_transcript_text, age_val, gender_val, notes):
            """Handle LLM analysis."""
            if not rich_transcript_text or rich_transcript_text == "No transcript data available":
                return "Please transcribe audio first."

            # Convert formatted text back into a rich transcript structure
            lines = rich_transcript_text.split('\n')
            rich_transcript = []
            for i, line in enumerate(lines):
                if line.strip():
                    # Extract data from the formatted line
                    timestamp_match = re.search(r'\[(\d{2}:\d{2})\]', line)
                    speaker_match = re.search(r'\*(\w+):', line)
                    sentence_match = re.search(r'\*\w+:\s*(.+?)(?=\s*\[|$)', line)
                    if timestamp_match and speaker_match and sentence_match:
                        timestamp_str = timestamp_match.group(1)
                        minutes, seconds = map(int, timestamp_str.split(':'))
                        timestamp = minutes * 60 + seconds
                        speaker = speaker_match.group(1)
                        sentence = sentence_match.group(1).strip()
                        # Sentiment, emotion, and rate are not recoverable from the display text,
                        # so neutral placeholder values are used here
                        rich_transcript.append({
                            'timestamp': timestamp,
                            'speaker': speaker,
                            'sentence': sentence,
                            'word_count': len(sentence.split()),
                            'avg_word_length': np.mean([len(word) for word in sentence.split()]) if sentence.split() else 0,
                            'speech_rate_wpm': 120.0,
                            'sentiment': 'neutral',
                            'sentiment_score': 0.5,
                            'emotion': 'neutral',
                            'emotion_score': 0.5
                        })
            return analyze_rich_transcript_with_llm(rich_transcript, age_val, gender_val, notes)

        # Connect event handlers
        transcribe_btn.click(
            on_transcribe,
            inputs=[file_input, enable_diarization],
            outputs=[rich_transcript_display, metrics_display, word_freq_display, transcription_status]
        )
        analyze_btn.click(
            on_analyze,
            inputs=[rich_transcript_display, age, gender, slp_notes],
            outputs=[analysis_output]
        )

    return app
if __name__ == "__main__":
    print("🚀 Starting Advanced Transcription Tool...")
    if not MOVIEPY_AVAILABLE:
        print("⚠️ MoviePy not available - video processing will be limited")
        print("   Install with: pip install moviepy")
    else:
        print("✅ MoviePy available for video processing")
    if not DIARIZATION_AVAILABLE:
        print("⚠️ Pyannote.audio not available - speaker diarization will be disabled")
        print("   Install with: pip install pyannote.audio")
    else:
        print("✅ Pyannote.audio available for speaker diarization")
    if not os.getenv("HF_TOKEN"):
        print("⚠️ HF_TOKEN not set - set it to enable speaker diarization")
        print("   Get token from: https://huggingface.co/settings/tokens")
        print("   Accept model terms at: https://huggingface.co/pyannote/speaker-diarization")
    if not SPEECHBRAIN_AVAILABLE:
        print("⚠️ SpeechBrain not available - audio transcription will be disabled")
        print("   Install with: pip install speechbrain transformers torch")
    else:
        print("✅ SpeechBrain and HuggingFace models loaded")

    app = create_transcription_interface()
    app.launch(show_api=False)
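# Local run sketch (the entry-point filename below is an assumption; adjust to the actual file name):
#   export HF_TOKEN=...            # enables pyannote speaker diarization
#   export ANTHROPIC_API_KEY=...   # enables Claude-based LLM analysis
#   python app.py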