<!doctype html>
<html lang="en"> | |
<head> | |
<meta charset="utf-8">
<!-- Without a viewport declaration mobile browsers lay the page out at desktop width and scale it down. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>🎤 Real-Time ASR Demo</title>
<style> | |
/* Page layout */ | |
body { | |
font-family: "Segoe UI", sans-serif; | |
background-color: #f5f6fa; | |
margin: 0; | |
padding: 2rem; | |
color: #2f3640; | |
display: flex; | |
flex-direction: column; | |
align-items: center; | |
} | |
h1 { | |
font-size: 2rem; | |
margin-bottom: 1rem; | |
} | |
section { | |
width: 100%; | |
max-width: 900px; | |
margin-bottom: 1.5rem; | |
background: white; | |
border-radius: 8px; | |
padding: 1rem; | |
box-shadow: 0 0 8px rgba(0,0,0,0.1); | |
} | |
section h2 { | |
margin-top: 0; | |
font-size: 1.2rem; | |
border-bottom: 1px solid #dcdde1; | |
padding-bottom: 0.5rem; | |
color: #2f3640; | |
} | |
/* Grid for controls */ | |
.controls-grid { | |
display: grid; | |
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); | |
gap: 1rem; | |
margin-top: 1rem; | |
} | |
.control-item { | |
display: flex; | |
flex-direction: column; | |
} | |
.control-item > label { | |
font-weight: 600; | |
margin-bottom: 0.3rem; | |
} | |
.control-item > select, | |
.control-item > input, | |
.control-item > textarea { | |
padding: 0.5rem; | |
border: 1px solid #dcdde1; | |
border-radius: 5px; | |
font-size: 1rem; | |
background: white; | |
} | |
.control-item > textarea { | |
resize: vertical; | |
min-height: 4rem; | |
} | |
.control-item > button { | |
margin-top: 0.5rem; | |
padding: 0.5rem; | |
border: none; | |
border-radius: 5px; | |
background-color: #44bd32; | |
color: white; | |
font-size: 1rem; | |
cursor: pointer; | |
transition: background-color 0.2s; | |
} | |
.control-item > button:hover { | |
background-color: #4cd137; | |
} | |
/* Status text */ | |
#hotwordStatus { | |
margin-top: 0.5rem; | |
font-size: 0.9rem; | |
color: #e1b12c; | |
font-weight: bold; | |
text-align: center; | |
} | |
/* Mic info and volume */ | |
.mic-info { | |
font-size: 0.9rem; | |
color: #353b48; | |
margin-top: 1rem; | |
} | |
.mic-info .label { | |
font-weight: bold; | |
} | |
/* Volume meter (<progress id="vol">).
   The track colour is declared twice on purpose: WebKit/Blink expose the
   track as ::-webkit-progress-bar, whereas Gecko has no track pseudo-element
   and paints the track from the element's own box. */
#vol {
  width: 100%;
  max-width: 500px;
  height: 20px;
  margin-top: 0.5rem;
  -webkit-appearance: none;
  appearance: none;
  /* Track styling for Gecko; also removes the default border. */
  border: none;
  border-radius: 8px;
  background-color: #dcdde1;
}
/* Track (WebKit/Blink) */
#vol::-webkit-progress-bar {
  background-color: #dcdde1;
  border-radius: 8px;
}
/* Filled portion (WebKit/Blink) */
#vol::-webkit-progress-value {
  background-color: #44bd32;
  border-radius: 8px;
  transition: width 0.2s;
}
/* Filled portion (Gecko) */
#vol::-moz-progress-bar {
  background-color: #44bd32;
  border-radius: 8px;
  transition: width 0.2s;
}
/* Transcript */ | |
.transcript-container { | |
margin-top: 0.5rem; | |
padding: 0.5rem; | |
background: #fff; | |
border: 1px solid #dcdde1; | |
border-radius: 8px; | |
max-height: 300px; | |
overflow-y: auto; | |
white-space: pre-wrap; | |
font-size: 1.1rem; | |
color: #353b48; | |
} | |
.transcript-container .final { | |
color: green; | |
display: inline; | |
margin-right: 0.5em; | |
} | |
.transcript-container .interim { | |
color: red; | |
display: inline; | |
} | |
</style> | |
</head> | |
<body> | |
<h1>🎤 Speak into Your Microphone</h1> | |
<section class="section--settings"> | |
<h2>Recognition Settings</h2> | |
<div class="controls-grid"> | |
<div class="control-item"> | |
<label for="modelSelect">Model</label> | |
<select id="modelSelect"> | |
<option value="csukuangfj/k2fsa-zipformer-bilingual-zh-en-t">k2fsa-small-bilingual-zh-en</option> | |
<option value="pfluo/k2fsa-zipformer-chinese-english-mixed">k2fsa-chinese-english-mixed</option> | |
<option value="k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16">sherpa-onnx-zipformer-korean</option> | |
<option value="k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12">zipformer-multi-zh-hans</option> | |
<option value="pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615">icefall-zipformer-wenetspeech</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26">zipformer-en-06-26</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21">zipformer-en-06-21</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21">zipformer-en-02-21</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20">zipformer-zh-en</option> | |
<option value="shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14">zipformer-fr</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23">zipformer-zh-14M</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17">zipformer-en-20M</option> | |
<option value="csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10">zipformer-ar_en_id_ja_ru_th_vi_zh</option> | |
</select> | |
</div> | |
<div class="control-item"> | |
<label for="precisionSelect">Precision</label> | |
<select id="precisionSelect"> | |
<option value="fp32">FP32</option> | |
<option value="int8">INT8</option> | |
</select> | |
</div> | |
</div> | |
<div class="model-info"> | |
Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB | |
</div> | |
</section> | |
<section class="section--hotwords"> | |
<h2>Hotword Settings</h2> | |
<div class="controls-grid"> | |
<div class="control-item"> | |
<label for="hotwordsList">Hotwords</label> | |
<textarea id="hotwordsList" placeholder="One per line"></textarea> | |
</div> | |
<div class="control-item"> | |
<label for="boostScore">Boost Score: <span id="boostValue">2.0</span></label> | |
<input type="range" id="boostScore" min="0" max="10" step="0.1" value="2.0" /> | |
</div> | |
<div class="control-item"> | |
<button id="applyHotwords">Apply Hotwords</button> | |
</div> | |
</div> | |
<div id="hotwordStatus">Hotword Bias: Off</div> | |
</section> | |
<!-- Endpoint-detection thresholds. sendConfig() reads these three inputs every time it builds a config message. -->
<section class="section--endpoint">
  <h2>Endpoint Detection</h2>
  <div class="controls-grid">
    <!-- min="0" keeps the spinner from stepping into negative durations and marks typed negatives as invalid. -->
    <div class="control-item">
      <label for="epRule1">Rule 1 (silence ≥ s)</label>
      <input type="number" id="epRule1" min="0" step="0.1" value="2.4">
    </div>
    <div class="control-item">
      <label for="epRule2">Rule 2 (silence ≥ s)</label>
      <input type="number" id="epRule2" min="0" step="0.1" value="1.2">
    </div>
    <div class="control-item">
      <label for="epRule3">Rule 3 (min utt ms)</label>
      <input type="number" id="epRule3" min="0" step="50" value="300">
    </div>
    <div class="control-item">
      <button type="button" id="applyEndpointConfig">Apply Endpoint Config</button>
    </div>
  </div>
</section>
<!-- Microphone details (filled in by the script once access is granted) and the input level meter. -->
<section class="section--mic">
  <h2>Microphone</h2>
  <div class="mic-info">
    <span class="label">Device:</span> <span id="micName">Detecting…</span><br>
    <span class="label">Sample Rate:</span> <span id="sampleRate">-</span> Hz
  </div>
  <!-- Driven by the "volume" field of server messages. It has no <label>, so it needs an explicit accessible name. -->
  <progress id="vol" max="1" value="0" aria-label="Microphone input level"></progress>
</section>
<section class="section--transcript"> | |
<h2>Transcript</h2> | |
<div id="transcript" class="transcript-container">…</div> | |
</section> | |
<script> | |
// Per-model display metadata, keyed by the <option> values of #modelSelect.
//   language: array of language labels, always an array (even for a single
//             language) so consumers can join it without type checks
//   size:     number shown in the "Size: … MB" line of the settings panel
const MODEL_METADATA = {
  "csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": { language: ["zh", "en"], size: 115 },
  "pfluo/k2fsa-zipformer-chinese-english-mixed": { language: ["zh", "en"], size: 342 },
  "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": { language: ["korean"], size: 300 },
  "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": { language: ["zh-Hans"], size: 258 },
  "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": { language: ["zh (WenetSpeech)"], size: 273 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": { language: ["english"], size: 340 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": { language: ["english"], size: 340 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": { language: ["english"], size: 341 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": { language: ["zh", "en"], size: 342 },
  "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": { language: ["french"], size: 282 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": { language: ["zh"], size: 53 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": { language: ["en"], size: 88 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10": { language: ["ar", "en", "id", "ja", "ru", "th", "vi", "zh"], size: 338 }
};
// Connection state. Both stay undefined until microphone access is granted:
// the AudioContext sample rate and the WebSocket opened to `/ws`.
let orig_sample_rate, ws;

// Cached references to the elements the script reads from or writes to.
const
  vol = document.getElementById("vol"),
  transcript = document.getElementById("transcript"),
  modelSelect = document.getElementById("modelSelect"),
  precisionSelect = document.getElementById("precisionSelect"),
  hotwordsList = document.getElementById("hotwordsList"),
  boostScore = document.getElementById("boostScore"),
  boostValue = document.getElementById("boostValue"),
  applyBtn = document.getElementById("applyHotwords"),
  hotwordStatus = document.getElementById("hotwordStatus"),
  modelLangs = document.getElementById("modelLangs"),
  modelSize = document.getElementById("modelSize"),
  micNameElem = document.getElementById("micName"),
  sampleRateElem = document.getElementById("sampleRate");
/**
 * Refresh the "Hotword Bias" indicator from the current form values.
 *
 * The indicator reads "On" only when the hotword textarea contains at least
 * one non-empty line and the boost slider is above zero; otherwise "Off".
 */
function updateHotwordStatus() {
  const hasHotwords = hotwordsList.value
    .split(/\r?\n/)
    .some(line => line !== "");
  const biasActive = hasHotwords && parseFloat(boostScore.value) > 0;

  if (biasActive) {
    hotwordStatus.textContent = "Hotword Bias: On";
  } else {
    hotwordStatus.textContent = "Hotword Bias: Off";
  }
}
/**
 * Show the languages and size of the currently selected model.
 *
 * A metadata entry's `language` may be a single string or an array of
 * strings; arrays are rendered comma-separated. When the selected value has
 * no entry in MODEL_METADATA both fields show "?" instead of throwing, so a
 * newly added <option> without metadata cannot abort the caller.
 */
function updateModelInfo() {
  const meta = MODEL_METADATA[modelSelect.value];
  if (!meta) {
    modelLangs.textContent = "?";
    modelSize.textContent = "?";
    return;
  }
  // [].concat() wraps a lone string and leaves an array as it is.
  modelLangs.textContent = [].concat(meta.language).join(", ");
  modelSize.textContent = meta.size;
}
// Entry point: request microphone access, then connect to the recognition
// WebSocket, wire up the controls and start streaming audio frames.
(navigator.mediaDevices
  ? navigator.mediaDevices.getUserMedia({ audio: true })
  // mediaDevices is undefined outside secure contexts; reject instead of
  // throwing synchronously so the rest of the script still runs.
  : Promise.reject(new Error("navigator.mediaDevices is unavailable"))
).then(stream => {
  const context = new AudioContext();
  orig_sample_rate = context.sampleRate;

  // Update mic info in UI
  const track = stream.getAudioTracks()[0];
  micNameElem.textContent = (track && track.label) || 'Unknown';
  sampleRateElem.textContent = orig_sample_rate;
  updateModelInfo();

  // Now that the sample rate is known, open the socket. The scheme follows
  // the page so the demo also works when served over plain http.
  const wsScheme = location.protocol === "https:" ? "wss" : "ws";
  ws = new WebSocket(`${wsScheme}://${location.host}/ws`);
  ws.onopen = () => sendConfig();
  ws.onerror = err => console.error("WebSocket error:", err);
  ws.onclose = () => console.log("WebSocket closed");

  // Rebuild the transcript from DOM nodes (never innerHTML) so recognised
  // text containing "<" or "&" is shown literally instead of parsed as markup.
  const renderTranscript = () => {
    const fragment = document.createDocumentFragment();
    for (const utterance of finalUtterances) {
      const span = document.createElement("span");
      span.className = "final";
      span.textContent = utterance;
      fragment.appendChild(span); // spacing between finals comes from CSS margin
    }
    if (currentInterim) {
      const span = document.createElement("span");
      span.className = "interim";
      span.textContent = currentInterim;
      fragment.append(" ", span);
    }
    transcript.textContent = "";
    transcript.appendChild(fragment);
    // auto-scroll to newest text
    transcript.scrollTop = transcript.scrollHeight;
  };

  // Unified handler for volume, partial and final messages
  ws.onmessage = e => {
    let msg;
    try {
      msg = JSON.parse(e.data);
    } catch (parseError) {
      console.error("Ignoring malformed server message:", parseError);
      return;
    }
    if (msg === null || typeof msg !== "object") {
      return;
    }
    // 1) update volume bar (clamped to the <progress> maximum of 1)
    if (msg.volume !== undefined) {
      vol.value = Math.min(msg.volume, 1.0);
    }
    // 2) a "final" locks in an utterance and clears the interim text;
    //    a "partial" replaces the interim text
    if (msg.final !== undefined) {
      finalUtterances.push(msg.final.trim());
      currentInterim = "";
    } else if (msg.partial !== undefined) {
      currentInterim = msg.partial;
    }
    // 3) redraw
    renderTranscript();
  };

  // Any settings change re-sends the full config and refreshes the indicator.
  modelSelect.addEventListener("change", () => {
    updateModelInfo();
    sendConfig();
    updateHotwordStatus();
  });
  precisionSelect.addEventListener("change", () => {
    sendConfig();
    updateHotwordStatus();
  });
  applyBtn.addEventListener("click", () => {
    sendConfig();
    updateHotwordStatus();
  });
  // Update boost display and status on slider input
  boostScore.addEventListener("input", () => {
    boostValue.textContent = boostScore.value;
    updateHotwordStatus();
  });

  // Capture pipeline: microphone -> ScriptProcessorNode -> destination.
  // NOTE(review): ScriptProcessorNode is deprecated in favour of AudioWorklet;
  // kept because a worklet needs a separate module file.
  const source = context.createMediaStreamSource(stream);
  const processor = context.createScriptProcessor(4096, 1, 1);
  source.connect(processor);
  processor.connect(context.destination);
  processor.onaudioprocess = e => {
    // send() throws while the socket is still CONNECTING; frames captured
    // before it opens or after it closes are dropped.
    if (ws.readyState !== WebSocket.OPEN) {
      return;
    }
    // Copy the samples into a fresh buffer and send them as raw float32.
    const input = e.inputBuffer.getChannelData(0);
    ws.send(new Float32Array(input).buffer);
  };
}, err => {
  // Permission denied, no input device, or an unsupported/insecure context.
  console.error("Microphone unavailable:", err);
  micNameElem.textContent = "Unavailable";
});
// Transcript state: utterances the server has already finalised, plus the
// interim hypothesis that is still being revised.
const finalUtterances = [];
let currentInterim = "";

// Endpoint-detection inputs and the button that re-sends them.
const [epRule1Input, epRule2Input, epRule3Input, applyEndpointBtn] =
  ["epRule1", "epRule2", "epRule3", "applyEndpointConfig"]
    .map(id => document.getElementById(id));
/**
 * Send the current recognition settings to the server as one JSON message
 * of type "config": sample rate, model, precision, hotwords (one per
 * non-empty textarea line), hotword boost score and the three endpoint rules.
 *
 * Does nothing while the socket does not exist or is not open; the socket's
 * onopen handler calls this function again once the connection is up.
 */
function sendConfig() {
  if (!ws || ws.readyState !== WebSocket.OPEN) {
    return;
  }

  // JSON cannot represent NaN: a blank or unparsable field would be
  // serialised as null. Fall back to the value declared in the markup.
  const numberFrom = (input, parse) => {
    const parsed = parse(input.value);
    return Number.isNaN(parsed) ? parse(input.defaultValue) : parsed;
  };
  const toInt = text => parseInt(text, 10);

  ws.send(JSON.stringify({
    type: "config",
    sampleRate: orig_sample_rate,
    model: modelSelect.value,
    precision: precisionSelect.value,
    hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
    hotwordsScore: numberFrom(boostScore, parseFloat),
    // endpoint-detection fields
    epRule1: numberFrom(epRule1Input, parseFloat),
    epRule2: numberFrom(epRule2Input, parseFloat),
    epRule3: numberFrom(epRule3Input, toInt),
  }));
}
// Re-send the config when the user clicks "Apply Endpoint Config".
applyEndpointBtn.addEventListener("click", () => {
  sendConfig();
});

/**
 * Handle one JSON message from the server.
 *
 * Recognised fields: `volume` (level for the meter, clamped to 1), `final`
 * (a finished utterance, which also clears the interim text) and `partial`
 * (the new interim text). The transcript is then rebuilt — finals first,
 * interim last — and scrolled to the bottom.
 *
 * @param {MessageEvent} e WebSocket message event whose `data` is JSON text.
 */
function handleServerMessage(e) {
  const msg = JSON.parse(e.data);
  if (msg.volume !== undefined) {
    vol.value = Math.min(msg.volume, 1.0);
  }
  if (msg.final !== undefined) {
    // endpoint fired: lock in the final utterance
    finalUtterances.push(msg.final.trim());
    currentInterim = "";
  } else if (msg.partial !== undefined) {
    // update the rolling interim
    currentInterim = msg.partial;
  }

  // Build spans with textContent rather than innerHTML so recognised text
  // containing "<" or "&" is displayed literally, not parsed as markup.
  const fragment = document.createDocumentFragment();
  for (const utterance of finalUtterances) {
    const span = document.createElement("span");
    span.className = "final";
    span.textContent = utterance;
    fragment.appendChild(span); // no explicit space, CSS margin handles it
  }
  if (currentInterim) {
    const span = document.createElement("span");
    span.className = "interim";
    span.textContent = currentInterim;
    fragment.appendChild(span);
  }
  transcript.textContent = "";
  transcript.appendChild(fragment);

  // always scroll to bottom
  transcript.scrollTop = transcript.scrollHeight;
}

// `ws` is created asynchronously, after microphone access is granted, so it
// is normally still undefined when this line runs. Assigning to it
// unconditionally threw a TypeError on every page load.
if (ws) {
  ws.onmessage = handleServerMessage;
}
</script> | |
</body> | |
</html> | |