Spaces:

GavinHuang
/

asr-demo

Running

File size: 7,199 Bytes

import os
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
import time
import spaces
import librosa

# Important: Don't initialize CUDA in the main process for Spaces
# The model will be loaded in the worker process through the GPU decorator
model = None

def load_model():
    # This function will be called in the GPU worker process
    global model
    if model is None:
        print(f"Loading model in worker process")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
        print(f"Model loaded on device: {model.device}")
    return model

@spaces.GPU(duration=120)
def transcribe(audio, state="", audio_buffer=None, last_processed_time=0):
    # Load the model inside the GPU worker process
    import numpy as np
    import soundfile as sf
    import librosa
    import os
    model = load_model()
    
    if audio_buffer is None:
        audio_buffer = []
    
    if audio is None or isinstance(audio, int):
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state, audio_buffer, last_processed_time
    
    print(f"Received audio input of type: {type(audio)}")
    
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
        sample_rate, audio_data = audio
        print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
        
        # Append chunk to buffer
        audio_buffer.append(audio_data)
        
        # Calculate total duration in seconds
        total_samples = sum(arr.shape[0] for arr in audio_buffer)
        total_duration = total_samples / sample_rate
        print(f"Total buffered duration: {total_duration:.2f}s")
        
        # Process 3-second chunks with 1-second step size (2-second overlap)
        chunk_duration = 3.0  # seconds
        step_size = 1.0      # seconds
        min_samples = int(chunk_duration * 16000)  # 3s at 16kHz
        
        if total_duration < chunk_duration:
            print(f"Buffering audio, total duration: {total_duration:.2f}s")
            return state, state, audio_buffer, last_processed_time
        
        try:
            # Concatenate buffered chunks
            full_audio = np.concatenate(audio_buffer)
            
            # Resample to 16kHz if needed
            if sample_rate != 16000:
                print(f"Resampling from {sample_rate}Hz to 16000Hz")
                full_audio = librosa.resample(full_audio.astype(float), orig_sr=sample_rate, target_sr=16000)
                sample_rate = 16000
            else:
                full_audio = full_audio.astype(float)
            
            # Process 3-second chunks
            new_state = state
            current_time = last_processed_time
            total_samples_16k = len(full_audio)
            
            while current_time + chunk_duration <= total_duration:
                start_sample = int(current_time * sample_rate)
                end_sample = int((current_time + chunk_duration) * sample_rate)
                if end_sample > total_samples_16k:
                    break
                
                chunk = full_audio[start_sample:end_sample]
                print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
                
                # Save to temporary WAV file
                temp_file = "temp_audio.wav"
                sf.write(temp_file, chunk, samplerate=16000)
                
                # Transcribe
                hypothesis = model.transcribe([temp_file])[0]
                transcription = hypothesis.text
                print(f"Transcription: {transcription}")
                
                os.remove(temp_file)
                print("Temporary file removed.")
                
                # Append transcription if non-empty
                if transcription.strip():
                    new_state = new_state + " " + transcription if new_state else transcription
                
                current_time += step_size
            
            # Update last processed time
            last_processed_time = current_time
            
            # Trim buffer to keep only unprocessed audio
            keep_samples = int((total_duration - current_time) * sample_rate)
            if keep_samples > 0:
                audio_buffer = [full_audio[-keep_samples:]]
            else:
                audio_buffer = []
            
            print(f"New state: {new_state}")
            return new_state, new_state, audio_buffer, last_processed_time
        
        except Exception as e:
            print(f"Error processing audio: {e}")
            return state, state, audio_buffer, last_processed_time
    
    print(f"Invalid audio input format: {type(audio)}")
    return state, state, audio_buffer, last_processed_time

# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"], 
                type="numpy", 
                streaming=True,
                label="Speak into your microphone"
            )
            
            clear_btn = gr.Button("Clear Transcript")
            
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription", 
                placeholder="Your speech will appear here...",
                lines=10
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription", 
                placeholder="Real-time results will appear here...",
                lines=2
            )
    
    # State to store the ongoing transcription
    state = gr.State("")
    audio_buffer = gr.State(value=None)
    last_processed_time = gr.State(value=0)
    # Handle the audio stream
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state, audio_buffer, last_processed_time],
        outputs=[state, streaming_text, audio_buffer, last_processed_time],
    )

    # Clear the transcription
    def clear_transcription():
        return "", "", None, 0

    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, audio_buffer, last_processed_time]
    )
    
    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output]
    )
    
    gr.Markdown("## 📝 Instructions")
    gr.Markdown("""
    1. Click the microphone button to start recording
    2. Speak clearly into your microphone
    3. The transcription will appear in real-time
    4. Click 'Clear Transcript' to start a new transcription
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()