<!--
Luigi's picture
Revert "adjust eprule1 and 2 defaults"
43609f1
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<!-- Without a viewport declaration, mobile browsers lay the page out at desktop
     width and scale it down, which defeats the auto-fit controls grid. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>🎤 Real-Time ASR Demo</title>
<style>
  /* ===== Page shell: one centred column of cards ===== */
  body {
    font-family: "Segoe UI", sans-serif;
    background-color: #f5f6fa;
    margin: 0;
    padding: 2rem;
    color: #2f3640;
    display: flex;
    flex-direction: column;
    align-items: center;
  }

  h1 {
    font-size: 2rem;
    margin-bottom: 1rem;
  }

  /* Every <section> is rendered as a white card, capped at 900px wide. */
  section {
    width: 100%;
    max-width: 900px;
    margin-bottom: 1.5rem;
    background: white;
    border-radius: 8px;
    padding: 1rem;
    box-shadow: 0 0 8px rgba(0,0,0,0.1);
  }

  /* Card title with a thin divider underneath. */
  section h2 {
    margin-top: 0;
    font-size: 1.2rem;
    border-bottom: 1px solid #dcdde1;
    padding-bottom: 0.5rem;
    color: #2f3640;
  }

  /* ===== Form controls: as many >=200px columns as fit ===== */
  .controls-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
    gap: 1rem;
    margin-top: 1rem;
  }

  /* One grid cell: label stacked above its control. */
  .control-item {
    display: flex;
    flex-direction: column;
  }

  .control-item > label {
    font-weight: 600;
    margin-bottom: 0.3rem;
  }

  /* Shared look for the editable controls. */
  .control-item > select,
  .control-item > input,
  .control-item > textarea {
    padding: 0.5rem;
    border: 1px solid #dcdde1;
    border-radius: 5px;
    font-size: 1rem;
    background: white;
  }

  .control-item > textarea {
    resize: vertical;
    min-height: 4rem;
  }

  /* Green "apply" buttons, slightly lighter on hover. */
  .control-item > button {
    margin-top: 0.5rem;
    padding: 0.5rem;
    border: none;
    border-radius: 5px;
    background-color: #44bd32;
    color: white;
    font-size: 1rem;
    cursor: pointer;
    transition: background-color 0.2s;
  }

  .control-item > button:hover {
    background-color: #4cd137;
  }

  /* ===== Hotword on/off status line ===== */
  #hotwordStatus {
    margin-top: 0.5rem;
    font-size: 0.9rem;
    color: #e1b12c;
    font-weight: bold;
    text-align: center;
  }

  /* ===== Microphone details and level meter ===== */
  .mic-info {
    font-size: 0.9rem;
    color: #353b48;
    margin-top: 1rem;
  }

  .mic-info .label {
    font-weight: bold;
  }

  /* Native <progress> with the platform look switched off so the
     vendor pseudo-elements below can paint the track and the fill. */
  #vol {
    width: 100%;
    max-width: 500px;
    height: 20px;
    margin-top: 0.5rem;
    appearance: none;
  }

  /* WebKit/Blink: track. */
  #vol::-webkit-progress-bar {
    background-color: #dcdde1;
    border-radius: 8px;
  }

  /* WebKit/Blink: filled portion. */
  #vol::-webkit-progress-value {
    background-color: #44bd32;
    border-radius: 8px;
    transition: width 0.2s;
  }

  /* Gecko: filled portion. Kept as its own rule because a selector list
     containing a pseudo-element the browser does not know is dropped whole. */
  #vol::-moz-progress-bar {
    background-color: #44bd32;
    border-radius: 8px;
    transition: width 0.2s;
  }

  /* ===== Transcript box: scrolls after 300px, whitespace is preserved ===== */
  .transcript-container {
    margin-top: 0.5rem;
    padding: 0.5rem;
    background: #fff;
    border: 1px solid #dcdde1;
    border-radius: 8px;
    max-height: 300px;
    overflow-y: auto;
    white-space: pre-wrap;
    font-size: 1.1rem;
    color: #353b48;
  }

  /* Finalised utterances: green, spaced apart by a right margin. */
  .transcript-container .final {
    color: green;
    display: inline;
    margin-right: 0.5em;
  }

  /* Hypothesis still in progress: red. */
  .transcript-container .interim {
    color: red;
    display: inline;
  }
</style>
</head>
<body>
<h1>🎤 Speak into Your Microphone</h1>
<section class="section--settings">
  <h2>Recognition Settings</h2>
  <div class="controls-grid">
    <!-- Each option value is also a key of MODEL_METADATA in the script below. -->
    <div class="control-item">
      <label for="modelSelect">Model</label>
      <select id="modelSelect">
        <option value="csukuangfj/k2fsa-zipformer-bilingual-zh-en-t">k2fsa-small-bilingual-zh-en</option>
        <option value="pfluo/k2fsa-zipformer-chinese-english-mixed">k2fsa-chinese-english-mixed</option>
        <option value="k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16">sherpa-onnx-zipformer-korean</option>
        <option value="k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12">zipformer-multi-zh-hans</option>
        <option value="pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615">icefall-zipformer-wenetspeech</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26">zipformer-en-06-26</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21">zipformer-en-06-21</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21">zipformer-en-02-21</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20">zipformer-zh-en</option>
        <option value="shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14">zipformer-fr</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23">zipformer-zh-14M</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17">zipformer-en-20M</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10">zipformer-ar_en_id_ja_ru_th_vi_zh</option>
      </select>
    </div>
    <div class="control-item">
      <label for="precisionSelect">Precision</label>
      <select id="precisionSelect">
        <option value="fp32">FP32</option>
        <option value="int8">INT8</option>
      </select>
    </div>
  </div>
  <!-- Both spans are filled in by updateModelInfo(). -->
  <div class="model-info">
    Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
  </div>
</section>
<section class="section--hotwords">
  <h2>Hotword Settings</h2>
  <div class="controls-grid">
    <!-- One hotword per line; the script splits the value on line breaks. -->
    <div class="control-item">
      <label for="hotwordsList">Hotwords</label>
      <textarea id="hotwordsList" placeholder="One per line"></textarea>
    </div>
    <!-- The span inside the label mirrors the slider value on every input event. -->
    <div class="control-item">
      <label for="boostScore">Boost Score: <span id="boostValue">2.0</span></label>
      <input type="range" id="boostScore" min="0" max="10" step="0.1" value="2.0" />
    </div>
    <div class="control-item">
      <!-- Explicit type: this is an in-page action, never a form submit. -->
      <button type="button" id="applyHotwords">Apply Hotwords</button>
    </div>
  </div>
  <!-- Text is rewritten by updateHotwordStatus(); role="status" makes the
       On/Off change perceivable to assistive technology. -->
  <div id="hotwordStatus" role="status">Hotword Bias: Off</div>
</section>
<section class="section--endpoint">
  <h2>Endpoint Detection</h2>
  <!-- The three values are sent to the server as epRule1/epRule2/epRule3 by
       sendConfig(). They are durations, so min="0" keeps the spinner from
       stepping into negative values. -->
  <div class="controls-grid">
    <div class="control-item">
      <label for="epRule1">Rule 1 (silence ≥ s)</label>
      <input type="number" id="epRule1" min="0" step="0.1" value="2.4" />
    </div>
    <div class="control-item">
      <label for="epRule2">Rule 2 (silence ≥ s)</label>
      <input type="number" id="epRule2" min="0" step="0.1" value="1.2" />
    </div>
    <div class="control-item">
      <label for="epRule3">Rule 3 (min utt ms)</label>
      <input type="number" id="epRule3" min="0" step="50" value="300" />
    </div>
    <div class="control-item">
      <!-- Explicit type: this is an in-page action, never a form submit. -->
      <button type="button" id="applyEndpointConfig">Apply Endpoint Config</button>
    </div>
  </div>
</section>
<section class="section--mic">
  <h2>Microphone</h2>
  <!-- Both spans are filled in once the microphone stream is available. -->
  <div class="mic-info">
    <span class="label">Device:</span> <span id="micName">Detecting…</span><br>
    <span class="label">Sample Rate:</span> <span id="sampleRate">-</span> Hz
  </div>
  <!-- Level meter driven by the `volume` field of server messages (capped at 1).
       A <progress> has no visible label here, so it gets an accessible name. -->
  <progress id="vol" max="1" value="0" aria-label="Microphone input level"></progress>
</section>
<section class="section--transcript">
  <h2>Transcript</h2>
  <!-- Filled by the WebSocket message handler. Keep the element literally empty:
       .transcript-container uses white-space: pre-wrap, so any whitespace placed
       between the tags would be rendered. -->
  <div id="transcript" class="transcript-container"></div>
</section>
<script>
// Display metadata for every entry of the model <select>, keyed by option value.
//   language: a single label or an array of labels (updateModelInfo joins arrays)
//   size:     number shown in front of "MB" in the settings card
const MODEL_METADATA = {
  "csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": {
    language: ["zh", "en"],
    size: 115
  },
  "pfluo/k2fsa-zipformer-chinese-english-mixed": {
    language: ["zh", "en"],
    size: 342
  },
  "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
    language: "korean",
    size: 300
  },
  "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
    language: "zh-Hans",
    size: 258
  },
  "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
    language: "zh (WenetSpeech)",
    size: 273
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
    language: "english",
    size: 340
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
    language: "english",
    size: 340
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
    language: "english",
    size: 341
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
    language: ["zh", "en"],
    size: 342
  },
  "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
    language: "french",
    size: 282
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
    language: "zh",
    size: 53
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
    language: "en",
    size: 88
  },
  "csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10": {
    language: ["ar", "en", "id", "ja", "ru", "th", "vi", "zh"],
    size: 338
  }
};
// Set once the microphone stream is available: the AudioContext sample rate
// and the WebSocket to the recogniser. Both stay undefined until then.
let orig_sample_rate, ws;

// Cached references to the elements the script reads from or writes to.
const vol = document.querySelector("#vol");
const transcript = document.querySelector("#transcript");
const modelSelect = document.querySelector("#modelSelect");
const precisionSelect = document.querySelector("#precisionSelect");
const hotwordsList = document.querySelector("#hotwordsList");
const boostScore = document.querySelector("#boostScore");
const boostValue = document.querySelector("#boostValue");
const applyBtn = document.querySelector("#applyHotwords");
const hotwordStatus = document.querySelector("#hotwordStatus");
const modelLangs = document.querySelector("#modelLangs");
const modelSize = document.querySelector("#modelSize");
const micNameElem = document.querySelector("#micName");
const sampleRateElem = document.querySelector("#sampleRate");
/**
 * Refresh the "Hotword Bias" status line from the current form contents.
 * Shows "On" only when the textarea has at least one non-empty line and the
 * boost slider is above zero; otherwise shows "Off".
 */
function updateHotwordStatus() {
  const hasHotword = hotwordsList.value
    .split(/\r?\n/)
    .some(line => line.length > 0);
  let label = "Hotword Bias: Off";
  if (hasHotword && parseFloat(boostScore.value) > 0) {
    label = "Hotword Bias: On";
  }
  hotwordStatus.textContent = label;
}
/**
 * Show the language(s) and size of the currently selected model.
 * Looks the selection up in MODEL_METADATA; a language array is rendered as a
 * comma-separated list, a single label is shown as-is.
 */
function updateModelInfo() {
  const { language, size } = MODEL_METADATA[modelSelect.value];
  modelLangs.textContent = Array.isArray(language)
    ? language.join(", ")
    : language;
  modelSize.textContent = size;
}
// ---------------------------------------------------------------------------
// UI wiring
// Registered up front rather than inside the microphone callback, so the model
// info, the boost read-out and the status line keep working while the
// permission prompt is pending or after it was denied. sendConfig() does
// nothing until the socket is open, so calling it early is harmless.
// ---------------------------------------------------------------------------
updateModelInfo();
modelSelect.addEventListener("change", () => {
  updateModelInfo();
  sendConfig();
  updateHotwordStatus();
});
precisionSelect.addEventListener("change", () => {
  sendConfig();
  updateHotwordStatus();
});
applyBtn.addEventListener("click", () => {
  sendConfig();
  updateHotwordStatus();
});
// Mirror the slider value next to its label and refresh the status line.
boostScore.addEventListener("input", () => {
  boostValue.textContent = boostScore.value;
  updateHotwordStatus();
});

// ---------------------------------------------------------------------------
// Microphone capture -> WebSocket streaming
// ---------------------------------------------------------------------------
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
  // Without this guard the property access below would throw synchronously and
  // abort the rest of the script.
  micNameElem.textContent = "Unavailable";
  console.error("navigator.mediaDevices.getUserMedia is not available in this context");
} else {
  navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
    const context = new AudioContext();
    orig_sample_rate = context.sampleRate;

    // Show which device is being captured and at what rate.
    const track = stream.getAudioTracks()[0];
    micNameElem.textContent = track.label || 'Unknown';
    sampleRateElem.textContent = orig_sample_rate;

    // The sample rate is known now, so the socket can be opened. Follow the
    // page's own scheme: an http:// page (e.g. local development) needs ws://.
    const wsScheme = location.protocol === "https:" ? "wss" : "ws";
    ws = new WebSocket(`${wsScheme}://${location.host}/ws`);
    ws.onopen = () => sendConfig();
    ws.onerror = err => console.error("WebSocket error:", err);
    ws.onclose = () => console.log("WebSocket closed");

    // Recognised text is data, not markup: escape it before it goes into
    // innerHTML so characters such as "<" or "&" cannot inject elements.
    const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
    const escapeHtml = text => String(text).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);

    // Single handler for volume, partial and final messages.
    ws.onmessage = e => {
      const msg = JSON.parse(e.data);

      // 1) level meter, capped at the <progress> maximum of 1
      if (msg.volume !== undefined) {
        vol.value = Math.min(msg.volume, 1.0);
      }

      // 2) a final result is appended and clears the interim text;
      //    a partial result replaces the interim text
      if (msg.final !== undefined) {
        finalUtterances.push(msg.final.trim());
        currentInterim = "";
      } else if (msg.partial !== undefined) {
        currentInterim = msg.partial;
      }

      // 3) rebuild the transcript: all finals, then the interim if there is one
      transcript.innerHTML =
        finalUtterances
          .map(u => `<span class="final">${escapeHtml(u)}</span>`)
          .join("") /* margin in CSS handles spacing */
        + (currentInterim
          ? ` <span class="interim">${escapeHtml(currentInterim)}</span>`
          : "");

      // 4) keep the newest text in view
      transcript.scrollTop = transcript.scrollHeight;
    };

    // Feed raw microphone samples to the server in 4096-frame mono chunks.
    // NOTE(review): createScriptProcessor is deprecated in favour of
    // AudioWorklet; kept because a worklet needs a separate module.
    const source = context.createMediaStreamSource(stream);
    const processor = context.createScriptProcessor(4096, 1, 1);
    source.connect(processor);
    processor.connect(context.destination);
    processor.onaudioprocess = e => {
      // Audio callbacks start before the handshake completes, and send() throws
      // InvalidStateError while the socket is still CONNECTING.
      if (ws.readyState !== WebSocket.OPEN) {
        return;
      }
      const input = e.inputBuffer.getChannelData(0);
      // Copy the samples before sending instead of handing over the audio buffer itself.
      ws.send(new Float32Array(input).buffer);
    };
  }, err => {
    // Permission denied, no input device, etc. Replace the "Detecting…"
    // placeholder instead of leaving it there indefinitely.
    micNameElem.textContent = "Unavailable";
    console.error("Microphone access failed:", err);
  });
}
// Transcript state used by the WebSocket message handler: the utterances that
// have been finalised so far, and the text of the one still in progress.
const finalUtterances = [];
let currentInterim = "";

// Endpoint-detection inputs and their "apply" button.
const epRule1Input = document.querySelector("#epRule1"),
  epRule2Input = document.querySelector("#epRule2"),
  epRule3Input = document.querySelector("#epRule3"),
  applyEndpointBtn = document.querySelector("#applyEndpointConfig");
/**
 * Send the current recognition settings to the server as one JSON "config"
 * message: sample rate, model, precision, hotwords with their boost score and
 * the three endpoint rules. Does nothing unless the WebSocket exists and is open.
 */
function sendConfig() {
  if (!ws || ws.readyState !== WebSocket.OPEN) {
    return;
  }
  // Non-empty textarea lines, one hotword each.
  const hotwords = hotwordsList.value.split(/\r?\n/).filter(Boolean);
  const payload = {
    type: "config",
    sampleRate: orig_sample_rate,
    model: modelSelect.value,
    precision: precisionSelect.value,
    hotwords: hotwords,
    hotwordsScore: parseFloat(boostScore.value),
    // endpoint-detection fields
    epRule1: parseFloat(epRule1Input.value),
    epRule2: parseFloat(epRule2Input.value),
    epRule3: parseInt(epRule3Input.value, 10)
  };
  ws.send(JSON.stringify(payload));
}
// Re-send the configuration when the user clicks "Apply Endpoint Config".
applyEndpointBtn.addEventListener("click", () => {
  sendConfig();
});

// A second top-level `ws.onmessage = …` assignment used to follow here. It has
// been removed: `ws` is only created inside the asynchronous getUserMedia
// callback, so it is still undefined when this line of the script runs and the
// assignment threw a TypeError on every page load. The message handler is
// installed where the socket is constructed, so nothing is lost.
</script>
</body>
</html>