import os from pathlib import Path import numpy as np import sherpa_onnx import scipy.signal from opencc import OpenCC from huggingface_hub import hf_hub_download from typing import List import tempfile from sentencepiece import SentencePieceProcessor # Ensure Hugging Face cache is in a user-writable directory CACHE_DIR = Path(__file__).parent / "hf_cache" os.makedirs(CACHE_DIR, exist_ok=True) to_ZHTW = OpenCC('s2t') to_ZHCN = OpenCC('t2s') # Streaming Zipformer model registry: paths relative to repo root STREAMING_ZIPFORMER_MODELS = { # bilingual zh-en with char+BPE "csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": { "tokens": "data/lang_char_bpe/tokens.txt", "encoder_fp32": "exp/96/encoder-epoch-99-avg-1.onnx", "encoder_int8": "exp/96/encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "exp/96/decoder-epoch-99-avg-1.onnx", "decoder_int8": "exp/96/decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "exp/96/joiner-epoch-99-avg-1.onnx", "joiner_int8": "exp/96/joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"cjkchar+bpe", "bpe_model": "data/lang_char_bpe/bpe.model", }, # mixed Chinese+English (char+BPE) "pfluo/k2fsa-zipformer-chinese-english-mixed": { "tokens": "data/lang_char_bpe/tokens.txt", "encoder_fp32": "exp/encoder-epoch-99-avg-1.onnx", "encoder_int8": "exp/encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "exp/decoder-epoch-99-avg-1.onnx", "decoder_int8": None, "joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx", "joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"cjkchar+bpe", "bpe_model": "data/lang_char_bpe/bpe.model", }, # Korean-only (CJK chars) "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1.onnx", "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1.onnx", "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "joiner-epoch-99-avg-1.onnx", "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"cjkchar", "bpe_model": "bpe.model", }, # multi Chinese (Hans) (CJK chars) "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-20-avg-1-chunk-16-left-128.onnx", "encoder_int8": "encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx", "decoder_fp32": "decoder-epoch-20-avg-1-chunk-16-left-128.onnx", "decoder_int8": "decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx", "joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx", "joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx", "modeling_unit":"cjkchar", "bpe_model": "bpe.model", }, # wenetspeech streaming (CJK chars) "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": { "tokens": "data/lang_char/tokens.txt", "encoder_fp32": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx", "encoder_int8": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx", "decoder_fp32": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx", "decoder_int8": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx", "joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx", "joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx", "modeling_unit":"cjkchar", "bpe_model": None, }, # English-only (BPE) "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1-chunk-16-left-128.onnx", "encoder_int8": "encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1-chunk-16-left-128.onnx", "decoder_int8": None, "joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx", "joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx", "modeling_unit":"bpe", "bpe_model": "bpe.model", }, "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1.onnx", "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1.onnx", "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "joiner-epoch-99-avg-1.onnx", "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"bpe", "bpe_model": None, }, "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1.onnx", "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1.onnx", "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "joiner-epoch-99-avg-1.onnx", "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"bpe", "bpe_model": None, }, # older bilingual zh-en (cjkchar+BPE) – no bpe.vocab shipped "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1.onnx", "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1.onnx", "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "joiner-epoch-99-avg-1.onnx", "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"cjkchar+bpe", "bpe_model": "bpe.model", }, # French-only (BPE) "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-29-avg-9-with-averaged-model.onnx", "encoder_int8": "encoder-epoch-29-avg-9-with-averaged-model.int8.onnx", "decoder_fp32": "decoder-epoch-29-avg-9-with-averaged-model.onnx", "decoder_int8": "decoder-epoch-29-avg-9-with-averaged-model.int8.onnx", "joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx", "joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx", "modeling_unit":"bpe", "bpe_model": None, }, # Chinese-only small (CJK chars) "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1.onnx", "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1.onnx", "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "joiner-epoch-99-avg-1.onnx", "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"cjkchar", "bpe_model": None, }, # English-only 20M (BPE) "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-99-avg-1.onnx", "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx", "decoder_fp32": "decoder-epoch-99-avg-1.onnx", "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx", "joiner_fp32": "joiner-epoch-99-avg-1.onnx", "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx", "modeling_unit":"bpe", "bpe_model": None, }, "csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10": { "tokens": "tokens.txt", "encoder_fp32": "encoder-epoch-75-avg-11-chunk-16-left-128.int8.onnx", "encoder_int8": None, "decoder_fp32": "decoder-epoch-75-avg-11-chunk-16-left-128.onnx", "decoder_int8": None, "joiner_fp32": "joiner-epoch-75-avg-11-chunk-16-left-128.int8.onnx", "joiner_int8": None, "modeling_unit":"cjkchar+bpe", "bpe_model": "bpe.model", }, } # Audio resampling utility def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: return scipy.signal.resample_poly(audio, target_sr, orig_sr) # Create an online recognizer for a given model and precision # model_id: full HF repo ID # precision: "int8" or "fp32" def create_recognizer( model_id: str, precision: str, hotwords: List[str] = None, hotwords_score: float = 0.0, ep_rule1: float = 2.4, ep_rule2: float = 1.2, ep_rule3: int = 300, ): if model_id not in STREAMING_ZIPFORMER_MODELS: raise ValueError(f"Model '{model_id}' is not registered.") entry = STREAMING_ZIPFORMER_MODELS[model_id] tokens_file = entry['tokens'] encoder_file = entry['encoder_int8'] if precision == 'int8' and entry['encoder_int8'] else entry['encoder_fp32'] decoder_file = entry['decoder_int8'] if precision == 'int8' and entry['decoder_int8'] else entry['decoder_fp32'] joiner_file = entry['joiner_int8'] if precision == 'int8' and entry['joiner_int8'] else entry['joiner_fp32'] tokens_path = hf_hub_download(repo_id=model_id, filename=tokens_file, cache_dir=str(CACHE_DIR)) encoder_path = hf_hub_download(repo_id=model_id, filename=encoder_file, cache_dir=str(CACHE_DIR)) decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR)) joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR)) # Prepare BPE vocab from .model if provided modeling_unit = entry.get("modeling_unit") bpe_model_rel = entry.get("bpe_model") bpe_vocab_path = None if bpe_model_rel: try: bpe_model_path = hf_hub_download(model_id, bpe_model_rel, cache_dir=str(CACHE_DIR)) print(f"[DEBUG] Downloaded bpe model: {bpe_model_path}") # === export_bpe_vocab.py logic starts here === sp = SentencePieceProcessor() sp.Load(str(bpe_model_path)) vocab_file = Path(CACHE_DIR) / f"{Path(bpe_model_rel).stem}.vocab" with open(vocab_file, "w", encoding="utf-8") as vf: for idx in range(sp.get_piece_size()): piece = sp.id_to_piece(idx) score = sp.get_score(idx) vf.write(f"{piece}\t{score}\n") bpe_vocab_path = str(vocab_file) print(f"[DEBUG] Converted bpe model to vocab: {bpe_vocab_path}") # === export_bpe_vocab.py logic ends here === except Exception as e: print(f"[WARNING] Failed to build BPE vocab from '{bpe_model_rel}': {e}") bpe_vocab_path = None # Decide if we should use beam-search hotword biasing has_hot = bool(hotwords and hotwords_score > 0.0) use_beam = has_hot and ("bpe" not in modeling_unit or bpe_vocab_path is not None) if use_beam: # Write hotword list to a temp file (one entry per line) tf = tempfile.NamedTemporaryFile( mode="w", delete=False, suffix=".txt", dir=str(CACHE_DIR) ) for w in hotwords: # Remove backslashes and angle-bracket tokens clean = w.replace("\\", "").replace("", "").strip() clean = to_ZHCN.convert(clean) # convert all hotword into zh-cn for zh-cn models if clean: # only write non-empty lines tf.write(f"{clean}\n") tf.flush() tf.close() hotwords_file_path = tf.name print(f"[DEBUG asr_worker] Written {len(hotwords)} hotwords to {hotwords_file_path} with score {hotwords_score}") # Create beam-search recognizer with biasing :contentReference[oaicite:0]{index=0} return sherpa_onnx.OnlineRecognizer.from_transducer( tokens=tokens_path, encoder=encoder_path, decoder=decoder_path, joiner=joiner_path, provider="cpu", num_threads=1, sample_rate=16000, feature_dim=80, decoding_method="modified_beam_search", hotwords_file=hotwords_file_path, hotwords_score=hotwords_score, modeling_unit=modeling_unit, bpe_vocab=bpe_vocab_path, # endpoint detection parameters enable_endpoint_detection=True, rule1_min_trailing_silence=ep_rule1, rule2_min_trailing_silence=ep_rule2, rule3_min_utterance_length=ep_rule3, ) # ——— Fallback to original greedy-search (no hotword biasing) ——— return sherpa_onnx.OnlineRecognizer.from_transducer( tokens=tokens_path, encoder=encoder_path, decoder=decoder_path, joiner=joiner_path, provider="cpu", num_threads=1, sample_rate=16000, feature_dim=80, decoding_method="greedy_search", # endpoint detection parameters enable_endpoint_detection=True, rule1_min_trailing_silence=ep_rule1, rule2_min_trailing_silence=ep_rule2, rule3_min_utterance_length=ep_rule3, ) def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr): audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32) if audio.size == 0: return "", 0.0 resampled = resample_audio(audio, orig_sr, 16000) rms = float(np.sqrt(np.mean(resampled ** 2))) stream.accept_waveform(16000, resampled) if recognizer.is_ready(stream): recognizer.decode_streams([stream]) result = recognizer.get_result(stream) return to_ZHTW.convert(result), rms