import gradio as gr
import json
import os
import logging
import re
import numpy as np
import pandas as pd
from datetime import datetime
import time
import tempfile
from typing import Dict, List, Tuple, Optional
import requests

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Try to import video processing libraries
try:
    import moviepy.editor as mp
    MOVIEPY_AVAILABLE = True
    logger.info("MoviePy available for video processing")
except ImportError as e:
    logger.warning(f"MoviePy not available: {e}")
    MOVIEPY_AVAILABLE = False

# Try to import speaker diarization
try:
    from pyannote.audio import Pipeline
    from pyannote.audio.pipelines.utils.hook import ProgressHook
    DIARIZATION_AVAILABLE = True
    logger.info("Pyannote.audio available for speaker diarization")
except ImportError as e:
    logger.warning(f"Pyannote.audio not available: {e}")
    DIARIZATION_AVAILABLE = False

# Try to import SpeechBrain and HuggingFace components
try:
    from speechbrain.pretrained import EncoderDecoderASR, VAD, EncoderClassifier
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import torch
    SPEECHBRAIN_AVAILABLE = True
    HUGGINGFACE_AVAILABLE = True
    logger.info("SpeechBrain and HuggingFace models available")
except ImportError as e:
    logger.warning(f"SpeechBrain/HuggingFace not available: {e}")
    SPEECHBRAIN_AVAILABLE = False
    HUGGINGFACE_AVAILABLE = False

# Claude API key (used by call_claude_api; LLM analysis falls back to a demo report if unset)
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")

# Initialize models if available
asr_model = None
vad_model = None
sentiment_model = None
emotion_model = None
diarization_pipeline = None

if SPEECHBRAIN_AVAILABLE and HUGGINGFACE_AVAILABLE:
    try:
        # Speech-to-text model
        asr_model = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-crdnn-rnnlm-librispeech",
            savedir="pretrained_models/asr-crdnn-rnnlm-librispeech"
        )
        # Voice Activity Detection
        vad_model = VAD.from_hparams(
            source="speechbrain/vad-crdnn-libriparty",
            savedir="pretrained_models/vad-crdnn-libriparty"
        )
        # Sentiment analysis
        sentiment_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            top_k=None
        )
        # Emotion analysis
        emotion_model = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None
        )
        logger.info("All models loaded successfully")
    except Exception as e:
        logger.error(f"Error loading models: {e}")
        SPEECHBRAIN_AVAILABLE = False
        HUGGINGFACE_AVAILABLE = False

# Initialize diarization pipeline
if DIARIZATION_AVAILABLE:
    try:
        # Note: You'll need to get a HuggingFace token and accept the model terms
        # at https://huggingface.co/pyannote/speaker-diarization
        HF_TOKEN = os.getenv("HF_TOKEN", "")
        if HF_TOKEN:
            diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization@2.1",
                use_auth_token=HF_TOKEN
            )
            logger.info("Speaker diarization pipeline loaded")
        else:
            logger.warning("HF_TOKEN not set - speaker diarization will be disabled")
    except Exception as e:
        logger.error(f"Error loading diarization pipeline: {e}")


def extract_audio_from_video(video_path):
    """Extract audio from a video file (MP4, etc.)."""
    if not MOVIEPY_AVAILABLE:
        return None, "MoviePy not available for video processing"
    try:
        # Create temporary file for audio
        temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_audio_path = temp_audio.name
        temp_audio.close()

        # Load video and extract audio
        video = mp.VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            video.close()  # release the clip before bailing out
            return None, "No audio track found in video file"

        # Export audio to temporary WAV file
        audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
        # Close video to free memory
        video.close()
        audio.close()

        logger.info(f"Audio extracted from video: {temp_audio_path}")
        return temp_audio_path, "Audio extracted successfully"
    except Exception as e:
        logger.error(f"Error extracting audio from video: {e}")
        return None, f"Error extracting audio: {str(e)}"


def perform_speaker_diarization(audio_path):
    """Perform speaker diarization on an audio file."""
    if not DIARIZATION_AVAILABLE or not diarization_pipeline:
        return None, "Speaker diarization not available"
    try:
        # Perform diarization
        with ProgressHook() as hook:
            diarization = diarization_pipeline(audio_path, hook=hook)

        # Extract speaker segments
        speaker_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speaker_segments.append({
                'start': turn.start,
                'end': turn.end,
                'speaker': speaker,
                'duration': turn.end - turn.start
            })

        logger.info(f"Diarization completed: {len(speaker_segments)} segments found")
        return speaker_segments, "Diarization completed successfully"
    except Exception as e:
        logger.error(f"Error in diarization: {e}")
        return None, f"Diarization error: {str(e)}"


def process_audio_file(file_path):
    """Process an audio file, extracting audio from video if needed."""
    if not file_path:
        return None, "No file provided"

    file_extension = os.path.splitext(file_path)[1].lower()

    # If it's a video file, extract audio first
    if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv']:
        logger.info(f"Processing video file: {file_path}")
        audio_path, status = extract_audio_from_video(file_path)
        if audio_path:
            return audio_path, f"Video processed: {status}"
        else:
            return None, status
    # If it's already an audio file, use it directly
    elif file_extension in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']:
        logger.info(f"Processing audio file: {file_path}")
        return file_path, "Audio file ready for transcription"
    else:
        return None, f"Unsupported file format: {file_extension}"


def transcribe_audio_with_metadata(audio_file, enable_diarization=True):
    """Transcribe audio with timestamps, sentiment, and metadata."""
    if not audio_file:
        return None, "No audio file provided"
    if not SPEECHBRAIN_AVAILABLE:
        return None, "SpeechBrain not available - using demo transcription"
    try:
        # Process the file (extract audio if it's a video)
        processed_audio_path, process_status = process_audio_file(audio_file)
        if not processed_audio_path:
            return None, process_status

        # Perform speaker diarization if enabled
        speaker_segments = None
        diarization_status = ""
        if enable_diarization:
            speaker_segments, diarization_status = perform_speaker_diarization(processed_audio_path)

        # Get transcription with timestamps
        transcript = asr_model.transcribe_file(processed_audio_path)

        # Clean up temporary audio file if it was created from video
        if processed_audio_path != audio_file and os.path.exists(processed_audio_path):
            try:
                os.unlink(processed_audio_path)
                logger.info("Temporary audio file cleaned up")
            except Exception as e:
                logger.warning(f"Could not clean up temporary file: {e}")

        # Split into sentences for analysis
        sentences = re.split(r'[.!?]+', transcript)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Analyze each sentence
        rich_transcript = []
        current_time = 0
        for i, sentence in enumerate(sentences):
            # Estimate timestamp (rough approximation, ~2 seconds per sentence)
            timestamp = i * 2

            # Determine speaker for this timestamp
            speaker = "UNKNOWN"
            if speaker_segments:
                for segment in speaker_segments:
                    if segment['start'] <= timestamp <= segment['end']:
                        speaker = segment['speaker']
                        break

            # Sentiment analysis
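            # With top_k=None the HuggingFace pipelines score every label; the [0] below is
            # assumed to pick out the per-label score list for this single input before the
            # top-scoring label is selected.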
            sentiment_result = sentiment_model(sentence)[0] if sentiment_model else None
            sentiment = max(sentiment_result, key=lambda x: x['score']) if sentiment_result else {'label': 'neutral', 'score': 0.5}

            # Emotion analysis
            emotion_result = emotion_model(sentence)[0] if emotion_model else None
            emotion = max(emotion_result, key=lambda x: x['score']) if emotion_result else {'label': 'neutral', 'score': 0.5}

            # Word count and complexity metrics
            words = sentence.split()
            word_count = len(words)
            avg_word_length = np.mean([len(word) for word in words]) if words else 0

            # Calculate speech rate (words per minute, assuming ~2 seconds per sentence)
            speech_rate = word_count * (60 / 2)

            rich_transcript.append({
                'timestamp': timestamp,
                'speaker': speaker,
                'sentence': sentence,
                'word_count': word_count,
                'avg_word_length': round(avg_word_length, 2),
                'speech_rate_wpm': round(speech_rate, 1),
                'sentiment': sentiment['label'],
                'sentiment_score': round(sentiment['score'], 3),
                'emotion': emotion['label'],
                'emotion_score': round(emotion['score'], 3)
            })

            current_time = timestamp

        status_msg = f"Transcription completed successfully. {process_status}"
        if diarization_status:
            status_msg += f" {diarization_status}"
        return rich_transcript, status_msg
    except Exception as e:
        logger.error(f"Error in transcription: {e}")
        return None, f"Transcription error: {str(e)}"


def format_rich_transcript(rich_transcript):
    """Format a rich transcript for display."""
    if not rich_transcript:
        return "No transcript data available"

    formatted_lines = []
    for entry in rich_transcript:
        timestamp_str = f"{int(entry['timestamp']//60):02d}:{int(entry['timestamp']%60):02d}"
        line = f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}"
        line += f" [Words: {entry['word_count']}, Rate: {entry['speech_rate_wpm']}wpm]"
        line += f" [Sentiment: {entry['sentiment']} ({entry['sentiment_score']})]"
        line += f" [Emotion: {entry['emotion']} ({entry['emotion_score']})]"
        formatted_lines.append(line)
    return '\n'.join(formatted_lines)


def calculate_slp_metrics(rich_transcript):
    """Calculate comprehensive SLP metrics."""
    if not rich_transcript:
        return {}

    # Basic metrics
    total_sentences = len(rich_transcript)
    total_words = sum(entry['word_count'] for entry in rich_transcript)
    total_duration = rich_transcript[-1]['timestamp'] if rich_transcript else 0

    # Speaker analysis
    speakers = {}
    for entry in rich_transcript:
        speaker = entry['speaker']
        if speaker not in speakers:
            speakers[speaker] = {
                'sentences': 0,
                'words': 0,
                'sentiments': [],
                'emotions': []
            }
        speakers[speaker]['sentences'] += 1
        speakers[speaker]['words'] += entry['word_count']
        speakers[speaker]['sentiments'].append(entry['sentiment'])
        speakers[speaker]['emotions'].append(entry['emotion'])

    # Word-level analysis
    all_words = []
    for entry in rich_transcript:
        words = entry['sentence'].lower().split()
        all_words.extend(words)

    # Word frequency distribution
    word_freq = {}
    for word in all_words:
        word_clean = re.sub(r'[^\w\s]', '', word)
        if word_clean:
            word_freq[word_clean] = word_freq.get(word_clean, 0) + 1

    # Vocabulary diversity (Type-Token Ratio)
    unique_words = len(set(all_words))
    ttr = unique_words / total_words if total_words > 0 else 0

    # Speech rate analysis
    speech_rates = [entry['speech_rate_wpm'] for entry in rich_transcript]
    avg_speech_rate = np.mean(speech_rates) if speech_rates else 0

    # Sentiment analysis
    sentiment_counts = {}
    emotion_counts = {}
    for entry in rich_transcript:
        sentiment_counts[entry['sentiment']] = sentiment_counts.get(entry['sentiment'], 0) + 1
        emotion_counts[entry['emotion']] = emotion_counts.get(entry['emotion'], 0) + 1
    # Sentence complexity
    sentence_lengths = [entry['word_count'] for entry in rich_transcript]
    avg_sentence_length = np.mean(sentence_lengths) if sentence_lengths else 0

    # Pause analysis (gaps between sentences)
    pauses = []
    for i in range(1, len(rich_transcript)):
        pause = rich_transcript[i]['timestamp'] - rich_transcript[i-1]['timestamp']
        pauses.append(pause)
    avg_pause_duration = np.mean(pauses) if pauses else 0

    return {
        'total_sentences': total_sentences,
        'total_words': total_words,
        'total_duration_seconds': total_duration,
        'unique_words': unique_words,
        'type_token_ratio': round(ttr, 3),
        'avg_sentence_length': round(avg_sentence_length, 1),
        'avg_speech_rate_wpm': round(avg_speech_rate, 1),
        'avg_pause_duration': round(avg_pause_duration, 1),
        'sentiment_distribution': sentiment_counts,
        'emotion_distribution': emotion_counts,
        'word_frequency': dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]),
        'speech_rate_variability': round(np.std(speech_rates), 1) if speech_rates else 0,
        'speakers': speakers,
        'speaker_count': len(speakers)
    }


def generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes=""):
    """Generate a comprehensive analysis prompt using rich transcript data."""
    # Format rich transcript with timestamps and metadata
    transcript_lines = []
    for entry in rich_transcript:
        timestamp_str = f"{int(entry['timestamp']//60):02d}:{int(entry['timestamp']%60):02d}"
        transcript_lines.append(f"[{timestamp_str}] *{entry['speaker']}: {entry['sentence']}")
    transcript_text = '\n'.join(transcript_lines)

    # Format metrics for analysis
    metrics_text = f"""
TRANSCRIPT METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}

SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}

EMOTION DISTRIBUTION: {metrics['emotion_distribution']}

SPEAKER ANALYSIS:"""

    for speaker, data in metrics['speakers'].items():
        metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"

    metrics_text += f"\n\nMOST FREQUENT WORDS: {list(metrics['word_frequency'].keys())[:10]}"

    notes_section = f"\nSLP CLINICAL NOTES:\n{slp_notes}" if slp_notes else ""

    prompt = f"""
You are a speech-language pathologist conducting a comprehensive analysis of a speech transcript with rich temporal and affective metadata.

PATIENT: {age}-year-old {gender}
{metrics_text}

TRANSCRIPT WITH TIMESTAMPS AND METADATA:
{transcript_text}{notes_section}

Please provide a comprehensive analysis including:

1. TEMPORAL SPEECH PATTERNS:
- Analyze speech rate changes over time using timestamps
- Identify patterns in pause duration and frequency
- Assess temporal consistency in speech production
- Note any significant changes in speech patterns throughout the session

2. AFFECTIVE AND EMOTIONAL ANALYSIS:
- Analyze sentiment patterns throughout the transcript using timestamp data
- Identify emotional shifts and their potential causes
- Assess emotional regulation and expression
- Note any correlations between emotional state and speech characteristics

3. SPEAKER-SPECIFIC ANALYSIS (if multiple speakers):
- Compare speech patterns between speakers
- Analyze turn-taking patterns and timing
- Assess interaction dynamics
- Note speaker-specific emotional and sentiment patterns

4. SPEECH FLUENCY AND RATE ANALYSIS:
- Analyze speech rate variability using the provided metrics
- Identify periods of fluent vs. dysfluent speech
- Assess the impact of emotional state on speech rate
- Note any temporal patterns in speech rate changes

5. LANGUAGE COMPLEXITY ASSESSMENT:
- Analyze vocabulary diversity using Type-Token Ratio
- Assess sentence complexity and variety
- Identify patterns in word frequency and usage
- Note any temporal changes in language complexity

6. COMPLEX SENTENCE ANALYSIS:
- Count and analyze use of coordinating conjunctions (and, but, or, so, yet, for, nor)
- Count and analyze use of subordinating conjunctions (because, although, while, since, if, when, where, that, which, who, whom, whose)
- Identify compound, complex, and compound-complex sentences
- Assess sentence variety and complexity level for age

7. FIGURATIVE LANGUAGE ANALYSIS:
- Identify and count similes (comparisons using "like" or "as")
- Identify and count metaphors (direct comparisons without "like" or "as")
- Identify and count idioms (common expressions with non-literal meanings)
- Assess figurative language comprehension and use for age

8. CLINICAL IMPLICATIONS:
- Specific intervention targets based on temporal patterns
- Recommendations for emotional regulation if needed
- Suggestions for improving speech rate consistency
- Strategies for enhancing language complexity
- Age-appropriate development recommendations

9. COMPREHENSIVE SUMMARY:
- Overall communication profile with temporal considerations
- Assessment of emotional and affective communication
- Developmental appropriateness considering age
- Prognosis and treatment priorities

Use the temporal data, sentiment scores, and emotional labels to provide insights that would not be possible with a simple transcript. Reference specific timestamps and emotional states when making observations.
"""
    return prompt


def analyze_rich_transcript_with_llm(rich_transcript, age, gender, slp_notes=""):
    """Analyze a rich transcript using an LLM with comprehensive metadata."""
    if not rich_transcript:
        return "No transcript data available for analysis."

    # Calculate SLP metrics
    metrics = calculate_slp_metrics(rich_transcript)

    # Generate comprehensive analysis prompt
    prompt = generate_comprehensive_analysis_prompt(rich_transcript, metrics, age, gender, slp_notes)

    # Get analysis from the Claude API, falling back to the demo report
    if ANTHROPIC_API_KEY:
        result = call_claude_api(prompt)
    else:
        result = generate_demo_analysis(rich_transcript, metrics)
    return result


def call_claude_api(prompt):
    """Call the Claude API directly."""
    if not ANTHROPIC_API_KEY:
        return "❌ Claude API key not configured. Please set ANTHROPIC_API_KEY environment variable."
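    # Note: the pinned Anthropic model name and API version below were current when this
    # code was written and may need updating over time.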
    try:
        headers = {
            "Content-Type": "application/json",
            "x-api-key": ANTHROPIC_API_KEY,
            "anthropic-version": "2023-06-01"
        }
        data = {
            "model": "claude-3-5-sonnet-20241022",
            "max_tokens": 4096,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        response = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers=headers,
            json=data,
            timeout=60
        )
        if response.status_code == 200:
            response_json = response.json()
            return response_json['content'][0]['text']
        else:
            logger.error(f"Claude API error: {response.status_code} - {response.text}")
            return f"❌ Claude API Error: {response.status_code}"
    except Exception as e:
        logger.error(f"Error calling Claude API: {str(e)}")
        return f"❌ Error: {str(e)}"


def generate_demo_analysis(rich_transcript, metrics):
    """Generate a demo analysis when the API is not available."""
    return f"""## Comprehensive SLP Analysis with Temporal and Affective Data

### TEMPORAL SPEECH PATTERNS
**Speech Rate Analysis**: {metrics['avg_speech_rate_wpm']} words per minute (variability: {metrics['speech_rate_variability']} wpm)
- Speech rate appears {'within normal limits' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'below typical range' if metrics['avg_speech_rate_wpm'] < 120 else 'above typical range'}
- Variability suggests {'consistent' if metrics['speech_rate_variability'] < 20 else 'variable'} speech patterns

**Pause Analysis**: Average pause duration of {metrics['avg_pause_duration']:.1f} seconds
- {'Appropriate' if 0.5 <= metrics['avg_pause_duration'] <= 2.0 else 'Short' if metrics['avg_pause_duration'] < 0.5 else 'Long'} pauses between utterances

### AFFECTIVE AND EMOTIONAL ANALYSIS
**Sentiment Distribution**: {metrics['sentiment_distribution']}
**Emotion Distribution**: {metrics['emotion_distribution']}

The emotional patterns suggest {'positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'neutral' if 'neutral' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['neutral'] > 2 else 'mixed'} emotional expression throughout the session.

### LANGUAGE COMPLEXITY
**Vocabulary Diversity**: Type-Token Ratio of {metrics['type_token_ratio']}
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited' if metrics['type_token_ratio'] < 0.3 else 'Moderate'} vocabulary diversity

**Sentence Structure**: Average {metrics['avg_sentence_length']} words per sentence
- Sentence length appears {'age-appropriate' if 5 <= metrics['avg_sentence_length'] <= 12 else 'below age expectations' if metrics['avg_sentence_length'] < 5 else 'above age expectations'}

**Most Frequent Words**: {', '.join(list(metrics['word_frequency'].keys())[:5])}

### SPEAKER ANALYSIS
**Number of Speakers**: {metrics['speaker_count']}
{chr(10).join([f"• {speaker}: {data['sentences']} sentences, {data['words']} words" for speaker, data in metrics['speakers'].items()])}

### CLINICAL IMPLICATIONS
Based on the temporal and affective analysis, this patient shows:
- {'Good' if metrics['type_token_ratio'] > 0.4 else 'Limited'} vocabulary diversity
- {'Appropriate' if 120 <= metrics['avg_speech_rate_wpm'] <= 180 else 'Atypical'} speech rate
- {'Consistent' if metrics['speech_rate_variability'] < 20 else 'Variable'} speech patterns
- {'Positive' if 'positive' in metrics['sentiment_distribution'] and metrics['sentiment_distribution']['positive'] > 2 else 'Neutral'} emotional expression

### RECOMMENDATIONS
1. Focus on vocabulary expansion if TTR < 0.4
2. Address speech rate if outside normal range
3. Work on sentence complexity if below age expectations
4. Consider emotional regulation strategies based on sentiment patterns
5. Monitor temporal patterns in speech rate and fluency"""


def create_transcription_interface():
    """Create the transcription-focused Gradio interface."""
    with gr.Blocks(title="Advanced Transcription Tool", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎤 Advanced Transcription Tool")
        gr.Markdown("Transcribe audio/video with speaker diarization, timestamps, sentiment analysis, and comprehensive LLM analysis")

        with gr.Tabs():
            # Audio/Video Upload & Transcription Tab
            with gr.Tab("🎤 Audio/Video Transcription"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### File Upload")
                        gr.Markdown("**Supported formats:** MP4, AVI, MOV, MKV, WMV, FLV, WAV, MP3, M4A, FLAC, OGG")
                        file_input = gr.File(
                            label="Upload Audio or Video File",
                            file_types=["audio", "video"]
                        )
                        enable_diarization = gr.Checkbox(
                            label="Enable Speaker Diarization",
                            value=True,
                            info="Identify different speakers in the audio"
                        )
                        transcribe_btn = gr.Button(
                            "🎤 Transcribe File",
                            variant="primary",
                            size="lg"
                        )
                        transcription_status = gr.Markdown("")
                    with gr.Column(scale=2):
                        gr.Markdown("### Rich Transcript with Metadata")
                        rich_transcript_display = gr.Textbox(
                            label="Transcription with Speakers, Timestamps, Sentiment & Emotion",
                            lines=15,
                            max_lines=20
                        )

            # Analysis Tab
            with gr.Tab("📊 LLM Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Patient Information")
                        with gr.Row():
                            age = gr.Number(label="Age", value=8, minimum=1, maximum=120)
                            gender = gr.Radio(["male", "female", "other"], label="Gender", value="male")
                        slp_notes = gr.Textbox(
                            label="SLP Clinical Notes (Optional)",
                            placeholder="Enter additional clinical observations...",
                            lines=3
                        )
                        analyze_btn = gr.Button(
                            "🔍 Analyze with LLM",
                            variant="primary",
                            size="lg"
                        )
                    with gr.Column(scale=2):
                        gr.Markdown("### Comprehensive LLM Analysis")
                        analysis_output = gr.Textbox(
                            label="LLM Analysis Report",
                            lines=25,
                            max_lines=30
                        )

            # Metrics Tab
            with gr.Tab("📈 Speech Metrics"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Quantitative Speech Metrics")
                        metrics_display = gr.Textbox(
                            label="SLP Metrics",
                            lines=15,
                            max_lines=20
                        )
                    with gr.Column():
                        gr.Markdown("### Word Frequency Analysis")
                        word_freq_display = gr.Dataframe(
                            headers=["Word", "Frequency"],
                            label="Most Frequent Words",
                            interactive=False
                        )

            # Raw Data Tab
            with gr.Tab("📊 Raw Data"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### JSON Data")
                        json_display = gr.Textbox(
                            label="Raw JSON Data",
                            lines=20,
                            max_lines=25
                        )

        # Event handlers
        def on_transcribe(file, diarization_enabled):
            """Handle file transcription."""
            if not file:
                return "", "", [], "", "Please upload a file first."
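            # Note: this assumes gr.File yields an object whose .name attribute holds the
            # uploaded file's path on disk (gradio's tempfile-style return value).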
            rich_transcript, status = transcribe_audio_with_metadata(file.name, diarization_enabled)
            if rich_transcript:
                formatted = format_rich_transcript(rich_transcript)
                metrics = calculate_slp_metrics(rich_transcript)

                # Format metrics for display
                metrics_text = f"""SPEECH METRICS:
• Total sentences: {metrics['total_sentences']}
• Total words: {metrics['total_words']}
• Duration: {metrics['total_duration_seconds']:.1f} seconds
• Type-Token Ratio: {metrics['type_token_ratio']} (vocabulary diversity)
• Average sentence length: {metrics['avg_sentence_length']} words
• Average speech rate: {metrics['avg_speech_rate_wpm']} words per minute
• Speech rate variability: {metrics['speech_rate_variability']} wpm
• Average pause duration: {metrics['avg_pause_duration']:.1f} seconds
• Number of speakers: {metrics['speaker_count']}

SENTIMENT DISTRIBUTION: {metrics['sentiment_distribution']}

EMOTION DISTRIBUTION: {metrics['emotion_distribution']}

SPEAKER ANALYSIS:"""
                for speaker, data in metrics['speakers'].items():
                    metrics_text += f"\n• {speaker}: {data['sentences']} sentences, {data['words']} words"

                # Create word frequency dataframe
                word_freq_data = [[word, freq] for word, freq in list(metrics['word_frequency'].items())[:20]]

                # JSON data for the Raw Data tab
                json_data = json.dumps(rich_transcript, indent=2)

                return formatted, metrics_text, word_freq_data, json_data, status
            else:
                return "", "", [], "", status

        def on_analyze(rich_transcript_text, age_val, gender_val, notes):
            """Handle LLM analysis."""
            if not rich_transcript_text or rich_transcript_text == "No transcript data available":
                return "Please transcribe audio first."

            # Convert formatted text back to a rich transcript structure
            lines = rich_transcript_text.split('\n')
            rich_transcript = []
            for i, line in enumerate(lines):
                if line.strip():
                    # Extract data from the formatted line
                    timestamp_match = re.search(r'\[(\d{2}:\d{2})\]', line)
                    speaker_match = re.search(r'\*(\w+):', line)
                    sentence_match = re.search(r'\*\w+:\s*(.+?)(?=\s*\[|$)', line)
                    if timestamp_match and speaker_match and sentence_match:
                        timestamp_str = timestamp_match.group(1)
                        minutes, seconds = map(int, timestamp_str.split(':'))
                        timestamp = minutes * 60 + seconds
                        speaker = speaker_match.group(1)
                        sentence = sentence_match.group(1).strip()
                        rich_transcript.append({
                            'timestamp': timestamp,
                            'speaker': speaker,
                            'sentence': sentence,
                            'word_count': len(sentence.split()),
                            'avg_word_length': np.mean([len(word) for word in sentence.split()]) if sentence.split() else 0,
                            'speech_rate_wpm': 120.0,
                            'sentiment': 'neutral',
                            'sentiment_score': 0.5,
                            'emotion': 'neutral',
                            'emotion_score': 0.5
                        })
            return analyze_rich_transcript_with_llm(rich_transcript, age_val, gender_val, notes)

        # Connect event handlers
        transcribe_btn.click(
            on_transcribe,
            inputs=[file_input, enable_diarization],
            outputs=[rich_transcript_display, metrics_display, word_freq_display, json_display, transcription_status]
        )
        analyze_btn.click(
            on_analyze,
            inputs=[rich_transcript_display, age, gender, slp_notes],
            outputs=[analysis_output]
        )
    return app


if __name__ == "__main__":
    print("🚀 Starting Advanced Transcription Tool...")

    if not MOVIEPY_AVAILABLE:
        print("⚠️ MoviePy not available - video processing will be limited")
        print("   Install with: pip install moviepy")
    else:
        print("✅ MoviePy available for video processing")

    if not DIARIZATION_AVAILABLE:
        print("⚠️ Pyannote.audio not available - speaker diarization will be disabled")
        print("   Install with: pip install pyannote.audio")
    else:
        print("✅ Pyannote.audio available for speaker diarization")

    if not os.getenv("HF_TOKEN"):
enable speaker diarization") print(" Get token from: https://huggingface.co/settings/tokens") print(" Accept model terms at: https://huggingface.co/pyannote/speaker-diarization") if not SPEECHBRAIN_AVAILABLE: print("⚠️ SpeechBrain not available - audio transcription will use demo mode") print(" Install with: pip install speechbrain transformers torch") else: print("✅ SpeechBrain and HuggingFace models loaded") app = create_transcription_interface() app.launch(show_api=False)