Spaces:
Running
Running
File size: 8,855 Bytes
725f466 00fa6cd 725f466 17d36a8 725f466 00fa6cd 725f466 00fa6cd 725f466 00fa6cd e9c60de 00fa6cd e9c60de 00fa6cd 34a1847 00fa6cd 34a1847 8484f48 6e4378a d099202 6e4378a 00fa6cd 34a1847 e9c60de 34a1847 e9c60de 34a1847 e9c60de 34a1847 00fa6cd e9c60de d099202 e9c60de d099202 e9c60de d099202 e9c60de 00fa6cd e9c60de 34a1847 e9c60de 34a1847 e9c60de 34a1847 00fa6cd e9c60de 00fa6cd e9c60de a6e99c9 00fa6cd a6e99c9 00fa6cd a6e99c9 00fa6cd a6e99c9 00fa6cd e9c60de 00fa6cd e9c60de 00fa6cd e9c60de 00fa6cd e9c60de 00fa6cd e9c60de 00fa6cd e9c60de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import os
import torch
import numpy as np
import librosa
import torchaudio
from typing import Dict, Any, Optional, Union, Tuple, List
import warnings
from transformers import (
AutoModelForCTC,
AutoProcessor,
pipeline,
SpeechT5Processor,
SpeechT5ForSpeechToText,
SpeechT5HifiGan
)
# Silence known-noisy warnings emitted while loading/running the models.
# `message` is a regular expression that must match the START of the warning
# text (case-insensitively), so a leading ".*" is needed to match a keyword
# that appears mid-message.
warnings.filterwarnings("ignore", message=".*gradient_checkpointing.*")
warnings.filterwarnings("ignore", message="Using the model-agnostic default `max_length`")
warnings.filterwarnings("ignore", message="You are using the default legacy behaviour")
class HFTranscriber:
    """Speech-to-text transcriber backed by a Hugging Face model.

    The model family is chosen by substring match on the model name:
    names containing "speecht5" use the SpeechT5 classes, names containing
    "whisper" run through an ASR ``pipeline``, and everything else is loaded
    as a CTC model (wav2vec2-style).
    """

    # Sample rate (Hz) the wav2vec2 and SpeechT5 paths feed to the model.
    # NOTE(review): SpeechT5's feature extractor is expected to require
    # 16 kHz as well -- confirm against the model card.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """
        Initialize the Hugging Face transcriber with a pre-trained model.

        The model is loaded immediately, so construction may download weights
        and raises if loading fails.

        Args:
            model_name (str): Name of the Hugging Face model to use for transcription.
                Supported models:
                - "facebook/wav2vec2-base-960h" (default)
                - "openai/whisper-small"
                - "microsoft/speecht5_asr"

        Raises:
            Exception: If the model or processor cannot be loaded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.processor = None
        self.model = None
        self.vocoder = None
        lowered_name = model_name.lower()
        self.is_speecht5 = "speecht5" in lowered_name
        self.is_whisper = "whisper" in lowered_name
        self._load_model()

    @staticmethod
    def _resolve_hf_token() -> Optional[str]:
        """Return the Hugging Face token, or None when none is configured.

        Lookup order: the HUGGINGFACE_TOKEN and HF_TOKEN environment
        variables, then the same keys in Streamlit secrets -- but only when
        Streamlit has already been imported by the running process, so this
        module never depends on Streamlit being installed.
        """
        import sys

        token_names = ("HUGGINGFACE_TOKEN", "HF_TOKEN")
        for name in token_names:
            value = os.environ.get(name)
            if value:
                return value

        secrets = getattr(sys.modules.get("streamlit"), "secrets", None)
        if secrets is not None:
            for name in token_names:
                try:
                    value = secrets.get(name)
                except Exception:
                    # Best effort: reading Streamlit secrets can raise (for
                    # example when no secrets file exists). A missing token
                    # only means anonymous access, so do not fail here.
                    break
                if value:
                    return value
        return None

    @staticmethod
    def _build_load_kwargs(hf_token: Optional[str]) -> Dict[str, Any]:
        """Build the keyword arguments shared by every ``from_pretrained`` call.

        Only ``token`` is passed. Combining it with the deprecated
        ``use_auth_token`` is rejected by transformers, and ``device`` is not
        a ``from_pretrained`` argument (models are moved with ``.to()``).
        """
        return {"token": hf_token} if hf_token else {}

    def _wrap_load_error(self, exc: BaseException) -> Exception:
        """Translate a model-loading failure into a user-facing exception."""
        error_msg = str(exc)
        if "401" in error_msg or "401" in str(exc.__cause__):
            return Exception(
                "Authentication failed. Please check your Hugging Face token.\n"
                "1. Get your token from https://huggingface.co/settings/tokens\n"
                "2. Add it to your environment variables as HUGGINGFACE_TOKEN"
            )
        if "404" in error_msg:
            return Exception(
                f"Model {self.model_name} not found. Please check the model name."
            )
        return Exception(f"Failed to load model {self.model_name}: {error_msg}")

    def _load_model(self):
        """Load the model and processor based on the model type with authentication.

        Raises:
            Exception: With an authentication, not-found, or generic message
                depending on the underlying failure (chained via ``from``).
        """
        hf_token = self._resolve_hf_token()
        if not hf_token:
            # Public models still load anonymously, so this is only a warning.
            warnings.warn(
                "No Hugging Face token found. Using public access (rate limited).Please add it to your environment variables as HUGGINGFACE_TOKEN or HF_TOKEN.",
                stacklevel=2,
            )
        load_kwargs = self._build_load_kwargs(hf_token)

        try:
            if self.is_speecht5:
                # Load SpeechT5 model and processor
                self.processor = SpeechT5Processor.from_pretrained(
                    self.model_name, **load_kwargs
                )
                self.model = SpeechT5ForSpeechToText.from_pretrained(
                    self.model_name, **load_kwargs
                )
                # NOTE(review): no transcription path in this class uses the
                # vocoder; it is kept so `self.vocoder` stays populated.
                self.vocoder = SpeechT5HifiGan.from_pretrained(
                    "microsoft/speecht5_hifigan", **load_kwargs
                )
                self.model = self.model.to(self.device)
                self.vocoder = self.vocoder.to(self.device)
                self.model.eval()
                self.vocoder.eval()
            elif self.is_whisper:
                # For whisper, the pipeline bundles model and processor.
                self.model = pipeline(
                    "automatic-speech-recognition",
                    model=self.model_name,
                    token=hf_token,
                    device=0 if self.device == "cuda" else -1,
                )
                self.processor = None  # Not needed when using pipeline
            else:
                # Load wav2vec2 (CTC) model and processor
                self.processor = AutoProcessor.from_pretrained(
                    self.model_name, **load_kwargs
                )
                self.model = AutoModelForCTC.from_pretrained(
                    self.model_name, **load_kwargs
                )
                self.model = self.model.to(self.device)
                self.model.eval()
        except Exception as e:
            raise self._wrap_load_error(e) from e

    def _prepare_audio(self, audio_array: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]:
        """Return the audio resampled to TARGET_SAMPLE_RATE plus that rate.

        Audio already at the target rate is returned without resampling.
        """
        audio = np.asarray(audio_array)
        if sample_rate != self.TARGET_SAMPLE_RATE:
            audio = librosa.resample(
                audio, orig_sr=sample_rate, target_sr=self.TARGET_SAMPLE_RATE
            )
        return audio, self.TARGET_SAMPLE_RATE

    def transcribe_audio(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """
        Transcribe audio data to text using the loaded Hugging Face model.

        Args:
            audio_array (np.ndarray): Audio data as a numpy array
            sample_rate (int): Sample rate of the audio data

        Returns:
            dict: Dictionary containing 'text' (the transcription) and
                'model' (the model name). Empty audio yields an empty 'text'
                without running the model.

        Raises:
            Exception: "Transcription failed: ..." wrapping any error raised
                by the underlying model path.
        """
        if np.size(audio_array) == 0:
            # Nothing to transcribe; the models cannot process empty input.
            return {'text': '', 'model': self.model_name}
        try:
            if self.is_speecht5:
                return self._transcribe_speecht5(audio_array, sample_rate)
            if self.is_whisper:
                return self._transcribe_whisper(audio_array, sample_rate)
            return self._transcribe_wav2vec2(audio_array, sample_rate)
        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}") from e

    def _transcribe_speecht5(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Transcribe audio using SpeechT5 model."""
        # Resample first, consistent with the wav2vec2 path.
        audio, rate = self._prepare_audio(audio_array, sample_rate)
        inputs = self.processor(
            audio=audio,
            sampling_rate=rate,
            return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            # No speaker embeddings here: this is the speech-to-text model,
            # which only consumes the audio features.
            outputs = self.model.generate(
                input_values=inputs.input_values,
                return_dict_in_generate=True
            )
        # Decode the predicted text
        predicted_text = self.processor.batch_decode(
            outputs.sequences, skip_special_tokens=True
        )[0]
        return {
            'text': predicted_text,
            'model': self.model_name
        }

    def _transcribe_whisper(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Transcribe audio using Whisper model."""
        # The pipeline receives the raw samples together with their rate.
        result = self.model({
            "raw": audio_array,
            "sampling_rate": sample_rate
        })
        return {
            'text': result['text'],
            'model': self.model_name
        }

    def _transcribe_wav2vec2(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Transcribe audio using wav2vec2 model."""
        # Resample if needed
        audio, rate = self._prepare_audio(audio_array, sample_rate)
        # Process the audio
        inputs = self.processor(
            audio,
            sampling_rate=rate,
            return_tensors="pt",
            padding=True
        ).to(self.device)
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits
        # Greedy CTC decoding: most likely token per frame, then let the
        # processor turn the ids into text.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)[0]
        return {
            'text': transcription,
            'model': self.model_name
        }
def transcribe_with_hf(audio_path: str, model_name: str = "openai/whisper-tiny") -> Dict[str, Any]:
    """
    Convenience function to transcribe an audio file using a Hugging Face model.

    The file is decoded with librosa to mono audio at 16 kHz and then passed,
    together with its sample rate, to ``HFTranscriber.transcribe_audio``.

    Args:
        audio_path (str): Path to the audio file
        model_name (str): Name of the Hugging Face model to use

    Returns:
        dict: Transcription results on success. On any failure (unreadable
            file, model load error, transcription error) a dictionary with
            'error' (the message) and 'model' is returned instead of raising.
    """
    try:
        # transcribe_audio needs decoded samples plus their rate, not a path.
        audio_array, sample_rate = librosa.load(audio_path, sr=16000, mono=True)
        transcriber = HFTranscriber(model_name=model_name)
        return transcriber.transcribe_audio(audio_array, sample_rate)
    except Exception as e:
        # Deliberate boundary: callers get an error dict, never an exception.
        return {
            'error': str(e),
            'model': model_name
        }