import os
import warnings
from typing import Any, Dict

import librosa
import numpy as np
import streamlit as st
import torch
from transformers import (
    AutoModelForCTC,
    AutoProcessor,
    SpeechT5ForSpeechToText,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*gradient_checkpointing.*")
warnings.filterwarnings("ignore", message="Using the model-agnostic default `max_length`")
warnings.filterwarnings("ignore", message="You are using the default legacy behaviour")


class HFTranscriber:
    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """
        Initialize the Hugging Face transcriber with a pre-trained model.

        Args:
            model_name (str): Name of the Hugging Face model to use for
                transcription. Supported models:
                - "facebook/wav2vec2-base-960h" (default)
                - "openai/whisper-small"
                - "microsoft/speecht5_asr"
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.processor = None
        self.model = None
        self.vocoder = None
        self.is_speecht5 = "speecht5" in model_name.lower()
        self.is_whisper = "whisper" in model_name.lower()
        self._load_model()

    def _load_model(self):
        """Load the model and processor based on the model type, with authentication."""
        try:
            # Look for a Hugging Face token in the environment first,
            # then fall back to Streamlit secrets if they are available
            hf_token = os.environ.get('HUGGINGFACE_TOKEN') or os.environ.get('HF_TOKEN')
            if not hf_token:
                try:
                    hf_token = st.secrets.get('HUGGINGFACE_TOKEN') or st.secrets.get('HF_TOKEN')
                except Exception:
                    hf_token = None

            if not hf_token:
                st.sidebar.warning(
                    "No Hugging Face token found; using public access (rate limited). "
                    "Please add it to your environment variables as HUGGINGFACE_TOKEN or HF_TOKEN."
                )

            # Configure model loading parameters. `use_auth_token` is deprecated
            # in favour of `token`, so only the latter is passed; models are moved
            # to the target device explicitly after loading.
            load_kwargs = {
                'token': hf_token,
                'local_files_only': False,
            }
            # Remove unset values so from_pretrained falls back to its defaults
            load_kwargs = {k: v for k, v in load_kwargs.items() if v is not None}

            if self.is_speecht5:
                # Load SpeechT5 model and processor
                self.processor = SpeechT5Processor.from_pretrained(
                    self.model_name, **load_kwargs
                )
                self.model = SpeechT5ForSpeechToText.from_pretrained(
                    self.model_name, **load_kwargs
                )
                # HiFi-GAN vocoder (used for speech synthesis, not required for ASR)
                self.vocoder = SpeechT5HifiGan.from_pretrained(
                    "microsoft/speecht5_hifigan", **load_kwargs
                )
                self.model = self.model.to(self.device)
                self.vocoder = self.vocoder.to(self.device)
                self.model.eval()
                self.vocoder.eval()
            elif self.is_whisper:
                # For Whisper, use the ASR pipeline, which bundles feature
                # extraction and decoding
                self.model = pipeline(
                    "automatic-speech-recognition",
                    model=self.model_name,
                    token=hf_token,  # Pass token directly
                    device=0 if self.device == "cuda" else -1
                )
                self.processor = None  # Not needed when using the pipeline
            else:
                # Load wav2vec2 model and processor
                self.processor = AutoProcessor.from_pretrained(
                    self.model_name, **load_kwargs
                )
                self.model = AutoModelForCTC.from_pretrained(
                    self.model_name, **load_kwargs
                )
                self.model = self.model.to(self.device)
                self.model.eval()
        except Exception as e:
            error_msg = str(e)
            if "401" in error_msg or "401" in str(e.__cause__):
                raise Exception(
                    "Authentication failed. Please check your Hugging Face token.\n"
                    "1. Get your token from https://huggingface.co/settings/tokens\n"
                    "2. Add it to your environment variables as HUGGINGFACE_TOKEN"
                ) from e
            elif "404" in error_msg:
                raise Exception(
                    f"Model {self.model_name} not found. Please check the model name."
                ) from e
            else:
                raise Exception(
                    f"Failed to load model {self.model_name}: {error_msg}"
                ) from e

    def transcribe_audio(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """
        Transcribe audio data to text using the loaded Hugging Face model.

        Args:
            audio_array (np.ndarray): Audio data as a numpy array
            sample_rate (int): Sample rate of the audio data

        Returns:
            dict: Dictionary containing 'text' and optionally 'word_timestamps'
        """
        try:
            if self.is_speecht5:
                return self._transcribe_speecht5(audio_array, sample_rate)
            elif self.is_whisper:
                return self._transcribe_whisper(audio_array, sample_rate)
            else:
                return self._transcribe_wav2vec2(audio_array, sample_rate)
        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}") from e

    def _transcribe_speecht5(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Transcribe audio using a SpeechT5 model."""
        inputs = self.processor(
            audio=audio_array,
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).to(self.device)

        # Note: speaker embeddings are only used by the SpeechT5 TTS variant,
        # so none are passed here
        with torch.no_grad():
            outputs = self.model.generate(
                input_values=inputs.input_values,
                return_dict_in_generate=True
            )

        # Decode the predicted token ids into text
        predicted_text = self.processor.batch_decode(
            outputs.sequences, skip_special_tokens=True
        )[0]

        return {
            'text': predicted_text,
            'model': self.model_name
        }

    def _transcribe_whisper(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Transcribe audio using a Whisper model via the ASR pipeline."""
        result = self.model({
            "raw": audio_array,
            "sampling_rate": sample_rate
        })

        return {
            'text': result['text'],
            'model': self.model_name
        }

    def _transcribe_wav2vec2(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Transcribe audio using a wav2vec2 (CTC) model."""
        # wav2vec2 expects 16 kHz input; resample if needed
        if sample_rate != 16000:
            audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        # Process the audio
        inputs = self.processor(
            audio_array,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(inputs.input_values).logits

        # Greedy CTC decoding: take the most likely token at each frame
        predicted_ids = torch.argmax(logits, dim=-1)

        # Decode the token ids to text
        transcription = self.processor.batch_decode(predicted_ids)[0]

        return {
            'text': transcription,
            'model': self.model_name
        }


def transcribe_with_hf(audio_path: str, model_name: str = "openai/whisper-tiny") -> Dict[str, Any]:
    """
    Convenience function to transcribe an audio file using a Hugging Face model.

    Args:
        audio_path (str): Path to the audio file
        model_name (str): Name of the Hugging Face model to use

    Returns:
        dict: Dictionary containing transcription results
    """
    try:
        transcriber = HFTranscriber(model_name=model_name)
        # transcribe_audio expects raw samples and a sample rate, so load the
        # file first (sr=None preserves the file's native sample rate)
        audio_array, sample_rate = librosa.load(audio_path, sr=None)
        return transcriber.transcribe_audio(audio_array, sample_rate)
    except Exception as e:
        return {
            'error': str(e),
            'model': model_name
        }
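

# --- Example usage ---
# A minimal sketch of how this module might be driven from a script. The audio
# path below is a placeholder, and the first run downloads model weights from
# the Hugging Face Hub.
if __name__ == "__main__":
    sample_path = "sample.wav"  # placeholder: substitute a real audio file
    result = transcribe_with_hf(sample_path, model_name="openai/whisper-tiny")
    if 'error' in result:
        print(f"Transcription failed: {result['error']}")
    else:
        print(f"[{result['model']}] {result['text']}")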