Spaces:

alakxender
/

asr-dhivehi-demo

Running on Zero

App Files Files Community

alakxender committed 18 days ago

Commit

febf67e

1 Parent(s): 5ef23ad

t

Browse files

Files changed (1) hide show

app.py +87 -47

app.py CHANGED Viewed

@@ -5,46 +5,28 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import tempfile
 import os
-# Model configuration, this model contains synthetic data
-MODEL_ID = "alakxender/whisper-small-dv-full"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 CHUNK_LENGTH_S = 10
 STRIDE_LENGTH_S = [3,2]
-# Device and dtype setup
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-# Initialize model with memory optimizations
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch_dtype,
-    low_cpu_mem_usage=True,
-    use_safetensors=True
-)
-model.to(device)
-# Initialize processor
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-# Single pipeline initialization with all components
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    chunk_length_s=CHUNK_LENGTH_S,
-    stride_length_s=STRIDE_LENGTH_S,
-    batch_size=BATCH_SIZE,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-# Define the generation arguments
 # Define optimized generation arguments
-def get_generate_kwargs(is_short_audio=False):
     """
     Get appropriate generation parameters based on audio length.
     Short audio transcription benefits from different parameters.
@@ -72,29 +54,81 @@ def get_generate_kwargs(is_short_audio=False):
             "repetition_penalty": 1.2,              # Light penalty for repeated tokens
         }
-# IMPORTANT: Fix for forced_decoder_ids error
-# Remove forced_decoder_ids from the model's generation config
-if hasattr(model.generation_config, 'forced_decoder_ids'):
-    print("Removing forced_decoder_ids from generation config")
-    model.generation_config.forced_decoder_ids = None
-# Also check if it's in the model config
-if hasattr(model.config, 'forced_decoder_ids'):
-    print("Removing forced_decoder_ids from model config")
-    delattr(model.config, 'forced_decoder_ids')
 @spaces.GPU
-def transcribe(audio_input):
     if audio_input is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
         # Use the defined generate_kwargs dictionary
-        result = pipe(
             audio_input,
-            generate_kwargs=get_generate_kwargs()
         )
         return result["text"]
     except Exception as e:
         # More detailed error logging might be helpful here if issues persist
         print(f"Detailed Error: {e}")
@@ -116,6 +150,12 @@ file_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio file"),
     ],
     outputs=gr.Textbox(
         label="",
@@ -125,11 +165,11 @@ file_transcribe = gr.Interface(
     ),
     title="Transcribe Dhivehi Audio",
     description=(
-        "Upload an audio file or record using your microphone to transcribe."
     ),
     flagging_mode="never",
     examples=[
-        ["sample.mp3"]
     ],
     api_name=False,
     cache_examples=False

 import tempfile
 import os
+# Available models
+MODELS = {
+    "alakxender/whisper-small-dv-full": "Whisper Small DV Full",
+    #"alakxender/whisper-small-dv-mx02": "Whisper Small DV MX02"
+}
+# Model configuration constants
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 CHUNK_LENGTH_S = 10
 STRIDE_LENGTH_S = [3,2]
+# Global variables for device and model management
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+current_model_id = None
+current_model = None
+current_processor = None
+current_pipe = None
 # Define optimized generation arguments
+def get_generate_kwargs(model, is_short_audio=False):
     """
     Get appropriate generation parameters based on audio length.
     Short audio transcription benefits from different parameters.
             "repetition_penalty": 1.2,              # Light penalty for repeated tokens
         }
 @spaces.GPU
+def transcribe(audio_input, model_choice, progress=gr.Progress()):
+    global current_model_id, current_model, current_processor, current_pipe, device, torch_dtype
     if audio_input is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
+        # Load the selected model if not already loaded or different model selected
+        if current_model_id != model_choice or current_model is None:
+            progress(0, desc=f"Loading model: {MODELS[model_choice]}")
+            print(f"Loading model: {model_choice}")
+            # Initialize model with memory optimizations
+            progress(0.2, desc="Downloading model weights...")
+            model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_choice,
+                torch_dtype=torch_dtype,
+                low_cpu_mem_usage=True,
+                use_safetensors=True
+            )
+            progress(0.4, desc="Moving model to device...")
+            model.to(device)
+            # Initialize processor
+            progress(0.6, desc="Loading processor...")
+            processor = AutoProcessor.from_pretrained(model_choice)
+            # Single pipeline initialization with all components
+            progress(0.8, desc="Creating pipeline...")
+            pipe = pipeline(
+                "automatic-speech-recognition",
+                model=model,
+                tokenizer=processor.tokenizer,
+                feature_extractor=processor.feature_extractor,
+                chunk_length_s=CHUNK_LENGTH_S,
+                stride_length_s=STRIDE_LENGTH_S,
+                batch_size=BATCH_SIZE,
+                torch_dtype=torch_dtype,
+                device=device,
+            )
+            # IMPORTANT: Fix for forced_decoder_ids error
+            progress(0.9, desc="Configuring model...")
+            # Remove forced_decoder_ids from the model's generation config
+            if hasattr(model.generation_config, 'forced_decoder_ids'):
+                print("Removing forced_decoder_ids from generation config")
+                model.generation_config.forced_decoder_ids = None
+            # Also check if it's in the model config
+            if hasattr(model.config, 'forced_decoder_ids'):
+                print("Removing forced_decoder_ids from model config")
+                delattr(model.config, 'forced_decoder_ids')
+            # Update global variables
+            current_model_id = model_choice
+            current_model = model
+            current_processor = processor
+            current_pipe = pipe
+            print(f"Model {model_choice} loaded successfully on {device}")
+        # Start transcription
+        progress(0.95, desc="Processing audio...")
         # Use the defined generate_kwargs dictionary
+        result = current_pipe(
             audio_input,
+            generate_kwargs=get_generate_kwargs(current_model)
         )
+        progress(1.0, desc="Transcription complete!")
         return result["text"]
     except Exception as e:
         # More detailed error logging might be helpful here if issues persist
         print(f"Detailed Error: {e}")
     fn=transcribe,
     inputs=[
         gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio file"),
+        gr.Dropdown(
+            choices=list(MODELS.keys()),
+            value=list(MODELS.keys())[0],  # Default to first model
+            label="Select Model",
+            info="Choose the Whisper model for transcription"
+        )
     ],
     outputs=gr.Textbox(
         label="",
     ),
     title="Transcribe Dhivehi Audio",
     description=(
+        "Upload an audio file or record using your microphone to transcribe. Select your preferred model from the dropdown."
     ),
     flagging_mode="never",
     examples=[
+        ["sample.mp3", "alakxender/whisper-small-dv-full"]
     ],
     api_name=False,
     cache_examples=False