bugfix on hotword biasing
app/asr_worker.py  (+38 -24)
@@ -7,6 +7,7 @@ from opencc import OpenCC
 from huggingface_hub import hf_hub_download
 from typing import List
 import tempfile
+from sentencepiece import SentencePieceProcessor
 
 # Ensure Hugging Face cache is in a user-writable directory
 CACHE_DIR = Path(__file__).parent / "hf_cache"
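Note that the new top-level import makes sentencepiece a hard runtime dependency of the worker. A defensive variant of the import, shown below as a sketch that is not part of this commit, would fail fast with a clearer message if the package is missing from the Space's environment:

# Sketch only, not part of this commit: fail fast with a clear message if the
# new runtime dependency is not installed in the Space.
try:
    from sentencepiece import SentencePieceProcessor
except ImportError as exc:
    raise RuntimeError(
        "sentencepiece is required to build BPE vocabs for hotword biasing; "
        "add it to the Space's requirements"
    ) from exc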
@@ -26,7 +27,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "exp/96/joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "exp/96/joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"cjkchar+bpe",
-        "
+        "bpe_model": "data/lang_char_bpe/bpe.model",
     },
     # mixed Chinese+English (char+BPE)
     "pfluo/k2fsa-zipformer-chinese-english-mixed": {
@@ -38,7 +39,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"cjkchar+bpe",
-        "
+        "bpe_model": "data/lang_char_bpe/bpe.model",
     },
     # Korean-only (CJK chars)
     "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
@@ -50,7 +51,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"cjkchar",
-        "
+        "bpe_model": "bpe.model",
     },
     # multi Chinese (Hans) (CJK chars)
     "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
@@ -62,7 +63,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx",
         "joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
         "modeling_unit":"cjkchar",
-        "
+        "bpe_model": "bpe.model",
     },
     # wenetspeech streaming (CJK chars)
     "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
@@ -74,7 +75,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
         "joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
         "modeling_unit":"cjkchar",
-        "
+        "bpe_model": None,
     },
     # English-only (BPE)
     "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
@@ -86,7 +87,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
         "modeling_unit":"bpe",
-        "
+        "bpe_model": "bpe.model",
     },
     "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
         "tokens": "tokens.txt",
@@ -97,7 +98,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"bpe",
-        "
+        "bpe_model": None,
     },
     "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
         "tokens": "tokens.txt",
@@ -108,7 +109,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"bpe",
-        "
+        "bpe_model": None,
     },
     # older bilingual zh-en (cjkchar+BPE) - no bpe.vocab shipped
     "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
@@ -120,7 +121,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"cjkchar+bpe",
-        "
+        "bpe_model": "bpe.model",
     },
     # French-only (BPE)
     "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
@@ -132,7 +133,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx",
         "joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx",
         "modeling_unit":"bpe",
-        "
+        "bpe_model": None,
     },
     # Chinese-only small (CJK chars)
     "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
@@ -144,7 +145,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"cjkchar",
-        "
+        "bpe_model": None,
     },
     # English-only 20M (BPE)
     "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
@@ -156,7 +157,7 @@ STREAMING_ZIPFORMER_MODELS = {
         "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
         "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
         "modeling_unit":"bpe",
-        "
+        "bpe_model": None,
     },
 }
 
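The practical effect of the new bpe_model field: models whose modeling_unit contains "bpe" can only bias hotwords when a SentencePiece bpe.model ships with the repo, because sherpa-onnx needs a BPE vocab to encode the hotword strings; pure cjkchar models never need one. The sketch below mirrors the use_beam gate added in the create_recognizer hunk that follows, and assumes the registry is importable as app.asr_worker:

# Sketch only: lists which registry entries can still do hotword biasing after
# this change, using the same gate as create_recognizer below.
from app.asr_worker import STREAMING_ZIPFORMER_MODELS

for model_id, entry in STREAMING_ZIPFORMER_MODELS.items():
    unit = entry["modeling_unit"]
    # cjkchar-only models never need a BPE vocab; BPE-based models need a
    # shipped bpe.model, otherwise hotwords fall back to greedy search.
    can_bias = ("bpe" not in unit) or bool(entry.get("bpe_model"))
    print(f"{model_id}: unit={unit}, hotword biasing={'yes' if can_bias else 'no'}")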
@@ -187,24 +188,37 @@ def create_recognizer(
     decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR))
     joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR))
 
-    #
+    # Prepare BPE vocab from .model if provided
     modeling_unit = entry.get("modeling_unit")
-
+    bpe_model_rel = entry.get("bpe_model")
     bpe_vocab_path = None
-    if
+    if bpe_model_rel:
         try:
-
-
-
-
-            )
-
+            bpe_model_path = hf_hub_download(model_id, bpe_model_rel, cache_dir=str(CACHE_DIR))
+            print(f"[DEBUG] Downloaded bpe model: {bpe_model_path}")
+
+            # === export_bpe_vocab.py logic starts here ===
+            sp = SentencePieceProcessor()
+            sp.Load(str(bpe_model_path))
+
+            vocab_file = Path(CACHE_DIR) / f"{Path(bpe_model_rel).stem}.vocab"
+            with open(vocab_file, "w", encoding="utf-8") as vf:
+                for idx in range(sp.get_piece_size()):
+                    piece = sp.id_to_piece(idx)
+                    score = sp.get_score(idx)
+                    vf.write(f"{piece}\t{score}\n")
+            bpe_vocab_path = str(vocab_file)
+            print(f"[DEBUG] Converted bpe model to vocab: {bpe_vocab_path}")
+            # === export_bpe_vocab.py logic ends here ===
+
         except Exception as e:
-            print(f"[WARNING
+            print(f"[WARNING] Failed to build BPE vocab from '{bpe_model_rel}': {e}")
             bpe_vocab_path = None
 
-    #
-
+    # Decide if we should use beam-search hotword biasing
+    has_hot = bool(hotwords and hotwords_score > 0.0)
+    use_beam = has_hot and ("bpe" not in modeling_unit or bpe_vocab_path is not None)
+
     if use_beam:
         # Write hotword list to a temp file (one entry per line)
         tf = tempfile.NamedTemporaryFile(
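The hunk is cut off at the NamedTemporaryFile call, so the hand-off to sherpa-onnx is not visible here. For context, a minimal sketch of what that hand-off presumably looks like is shown below; it assumes the Space builds the recognizer with sherpa_onnx.OnlineRecognizer.from_transducer, that the installed sherpa-onnx version accepts the hotwords_file / hotwords_score / modeling_unit / bpe_vocab keyword arguments, and that tokens_path and encoder_path are resolved earlier in create_recognizer (they are not shown in this diff):

import sherpa_onnx

# Sketch only: the rest of create_recognizer is not part of this diff.
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
    tokens=tokens_path,
    encoder=encoder_path,
    decoder=decoder_path,
    joiner=joiner_path,
    num_threads=1,
    sample_rate=16000,
    feature_dim=80,
    # Hotword biasing requires modified_beam_search; otherwise stay on greedy.
    decoding_method="modified_beam_search" if use_beam else "greedy_search",
    hotwords_file=tf.name if use_beam else "",
    hotwords_score=hotwords_score if use_beam else 0.0,
    # For BPE or mixed units, the exported vocab lets sherpa-onnx encode hotwords.
    modeling_unit=modeling_unit or "",
    bpe_vocab=bpe_vocab_path or "",
)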