Spaces:

Luigi
/

Streaming-Zipformer

Running

App Files Files Community

Streaming-Zipformer / app /static /index.html

Luigi

Revert "adjust eprule1 and 2 defaults"

43609f1 about 1 month ago

raw

history blame contribute delete

14.9 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8" />
	<!-- Size the layout viewport to the device so the rem-based layout below
	     is not rendered at desktop width and shrunk on phones. -->
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<title>🎤 Real-Time ASR Demo</title>
	<style>
	/* ---- Base page layout ----
	   The body is a centered flex column; every <section> is a white card
	   capped at 900px wide. */
	body {
	  display: flex;
	  flex-direction: column;
	  align-items: center;
	  margin: 0;
	  padding: 2rem;
	  font-family: "Segoe UI", sans-serif;
	  color: #2f3640;
	  background-color: #f5f6fa;
	}

	h1 {
	  margin-bottom: 1rem;
	  font-size: 2rem;
	}

	/* Card container shared by all panels */
	section {
	  width: 100%;
	  max-width: 900px;
	  margin-bottom: 1.5rem;
	  padding: 1rem;
	  border-radius: 8px;
	  background: white;
	  box-shadow: 0 0 8px rgba(0,0,0,0.1);
	}

	/* Card title with a thin divider underneath */
	section h2 {
	  margin-top: 0;
	  padding-bottom: 0.5rem;
	  border-bottom: 1px solid #dcdde1;
	  font-size: 1.2rem;
	  color: #2f3640;
	}

	/* ---- Form controls ----
	   Controls sit in a responsive grid: as many columns as fit, each at
	   least 200px wide. */
	.controls-grid {
	  display: grid;
	  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
	  gap: 1rem;
	  margin-top: 1rem;
	}

	/* One grid cell: label stacked above its control */
	.control-item {
	  display: flex;
	  flex-direction: column;
	}

	.control-item > label {
	  margin-bottom: 0.3rem;
	  font-weight: 600;
	}

	/* Shared look for the text-like controls */
	.control-item > select,
	.control-item > input,
	.control-item > textarea {
	  padding: 0.5rem;
	  border: 1px solid #dcdde1;
	  border-radius: 5px;
	  background: white;
	  font-size: 1rem;
	}

	/* The hotword list may only grow vertically */
	.control-item > textarea {
	  min-height: 4rem;
	  resize: vertical;
	}

	/* Green action buttons; hover switches to a lighter green */
	.control-item > button {
	  margin-top: 0.5rem;
	  padding: 0.5rem;
	  border: none;
	  border-radius: 5px;
	  background-color: #44bd32;
	  color: white;
	  font-size: 1rem;
	  cursor: pointer;
	  transition: background-color 0.2s;
	}

	.control-item > button:hover {
	  background-color: #4cd137;
	}

	/* ---- Hotword bias indicator ----
	   Centered, bold, amber line shown under the hotword controls. */
	#hotwordStatus {
	  margin-top: 0.5rem;
	  text-align: center;
	  font-size: 0.9rem;
	  font-weight: bold;
	  color: #e1b12c;
	}

	/* ---- Microphone details and level meter ---- */
	.mic-info {
	  margin-top: 1rem;
	  font-size: 0.9rem;
	  color: #353b48;
	}

	.mic-info .label {
	  font-weight: bold;
	}

	/* Native look is removed so the vendor pseudo-elements below can style
	   the track and the fill. */
	#vol {
	  appearance: none;
	  width: 100%;
	  max-width: 500px;
	  height: 20px;
	  margin-top: 0.5rem;
	}

	/* WebKit/Blink: track */
	#vol::-webkit-progress-bar {
	  border-radius: 8px;
	  background-color: #dcdde1;
	}

	/* WebKit/Blink: fill */
	#vol::-webkit-progress-value {
	  border-radius: 8px;
	  background-color: #44bd32;
	  transition: width 0.2s;
	}

	/* Gecko: fill. Kept as its own rule because a selector list containing a
	   pseudo-element the browser does not recognize is dropped entirely. */
	#vol::-moz-progress-bar {
	  border-radius: 8px;
	  background-color: #44bd32;
	  transition: width 0.2s;
	}

	/* ---- Transcript panel ----
	   Scrollable box (max 300px tall); pre-wrap keeps whitespace between
	   the inline utterance spans significant. */
	.transcript-container {
	  max-height: 300px;
	  overflow-y: auto;
	  margin-top: 0.5rem;
	  padding: 0.5rem;
	  border: 1px solid #dcdde1;
	  border-radius: 8px;
	  background: #fff;
	  white-space: pre-wrap;
	  font-size: 1.1rem;
	  color: #353b48;
	}

	/* Finalized utterances: green, spaced apart by a right margin */
	.transcript-container .final {
	  display: inline;
	  margin-right: 0.5em;
	  color: green;
	}

	/* In-progress hypothesis: red */
	.transcript-container .interim {
	  display: inline;
	  color: red;
	}
	</style>
	</head>
	<body>
	<!-- Main page heading -->
	<h1>🎤 Speak into Your Microphone</h1>

	<!-- Recognition settings: model and numeric precision.
	     Each option value is also a key of MODEL_METADATA in the script below,
	     which fills in #modelLangs and #modelSize. -->
	<section class="section--settings">
	  <h2>Recognition Settings</h2>
	  <div class="controls-grid">
	    <div class="control-item">
	      <label for="modelSelect">Model</label>
	      <select id="modelSelect">
	        <option value="csukuangfj/k2fsa-zipformer-bilingual-zh-en-t">k2fsa-small-bilingual-zh-en</option>
	        <option value="pfluo/k2fsa-zipformer-chinese-english-mixed">k2fsa-chinese-english-mixed</option>
	        <option value="k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16">sherpa-onnx-zipformer-korean</option>
	        <option value="k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12">zipformer-multi-zh-hans</option>
	        <option value="pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615">icefall-zipformer-wenetspeech</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26">zipformer-en-06-26</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21">zipformer-en-06-21</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21">zipformer-en-02-21</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20">zipformer-zh-en</option>
	        <option value="shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14">zipformer-fr</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23">zipformer-zh-14M</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17">zipformer-en-20M</option>
	        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10">zipformer-ar_en_id_ja_ru_th_vi_zh</option>
	      </select>
	    </div>
	    <div class="control-item">
	      <label for="precisionSelect">Precision</label>
	      <select id="precisionSelect">
	        <option value="fp32">FP32</option>
	        <option value="int8">INT8</option>
	      </select>
	    </div>
	  </div>
	  <!-- Plain "|" separator: a backslash here would be rendered literally. -->
	  <div class="model-info">
	    Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
	  </div>
	</section>

	<!-- Hotword biasing: word list, boost slider, apply button, and an
	     On/Off indicator that the script keeps up to date. -->
	<section class="section--hotwords">
	  <h2>Hotword Settings</h2>
	  <div class="controls-grid">
	    <div class="control-item">
	      <label for="hotwordsList">Hotwords</label>
	      <textarea id="hotwordsList" placeholder="One per line"></textarea>
	    </div>
	    <div class="control-item">
	      <!-- #boostValue mirrors the slider's current value -->
	      <label for="boostScore">Boost Score: <span id="boostValue">2.0</span></label>
	      <input id="boostScore" type="range" min="0" max="10" step="0.1" value="2.0">
	    </div>
	    <div class="control-item">
	      <button id="applyHotwords">Apply Hotwords</button>
	    </div>
	  </div>
	  <div id="hotwordStatus">Hotword Bias: Off</div>
	</section>

	<!-- Endpoint detection thresholds. The three values are sent to the server
	     as epRule1 / epRule2 / epRule3 by sendConfig() in the script below.
	     min="0" keeps the spinner controls from stepping into negative
	     durations. -->
	<section class="section--endpoint">
	  <h2>Endpoint Detection</h2>
	  <div class="controls-grid">
	    <div class="control-item">
	      <label for="epRule1">Rule 1 (silence ≥ s)</label>
	      <input type="number" id="epRule1" min="0" step="0.1" value="2.4" />
	    </div>
	    <div class="control-item">
	      <label for="epRule2">Rule 2 (silence ≥ s)</label>
	      <input type="number" id="epRule2" min="0" step="0.1" value="1.2" />
	    </div>
	    <div class="control-item">
	      <label for="epRule3">Rule 3 (min utt ms)</label>
	      <input type="number" id="epRule3" min="0" step="50" value="300" />
	    </div>
	    <div class="control-item">
	      <!-- Explicit type: this button triggers a script action, never a form submit -->
	      <button type="button" id="applyEndpointConfig">Apply Endpoint Config</button>
	    </div>
	  </div>
	</section>

	<!-- Microphone details plus a live input-level meter. The script fills in
	     #micName and #sampleRate and drives #vol from server "volume" values. -->
	<section class="section--mic">
	  <h2>Microphone</h2>
	  <div class="mic-info">
	    <span class="label">Device:</span> <span id="micName">Detecting…</span><br>
	    <span class="label">Sample Rate:</span> <span id="sampleRate">-</span> Hz
	  </div>
	  <!-- aria-label gives the meter an accessible name; it has no visible label -->
	  <progress id="vol" max="1" value="0" aria-label="Microphone input level"></progress>
	</section>

	<!-- Transcript output: the script replaces the placeholder with one span
	     per finalized utterance plus the current interim hypothesis. -->
	<section class="section--transcript">
	  <h2>Transcript</h2>
	  <div class="transcript-container" id="transcript">…</div>
	</section>

	<script>
	// Display metadata for every model offered in #modelSelect, keyed by the
	// option value. `language` is either a single string or an array of
	// strings; `size` is shown in the UI followed by "MB".
	const MODEL_METADATA = {
	  "csukuangfj/k2fsa-zipformer-bilingual-zh-en-t":
	    { language: ["zh", "en"], size: 115 },
	  "pfluo/k2fsa-zipformer-chinese-english-mixed":
	    { language: ["zh", "en"], size: 342 },
	  "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16":
	    { language: "korean", size: 300 },
	  "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12":
	    { language: "zh-Hans", size: 258 },
	  "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615":
	    { language: "zh (WenetSpeech)", size: 273 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26":
	    { language: "english", size: 340 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21":
	    { language: "english", size: 340 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21":
	    { language: "english", size: 341 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20":
	    { language: ["zh", "en"], size: 342 },
	  "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14":
	    { language: "french", size: 282 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23":
	    { language: "zh", size: 53 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17":
	    { language: "en", size: 88 },
	  "csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10":
	    { language: ["ar", "en", "id", "ja", "ru", "th", "vi", "zh"], size: 338 }
	};

	// Runtime state. Both stay undefined until microphone access is granted.
	let orig_sample_rate; // sample rate reported by the AudioContext
	let ws;               // WebSocket connection to the server

	// Recognition settings
	const modelSelect = document.getElementById("modelSelect");
	const precisionSelect = document.getElementById("precisionSelect");
	const modelLangs = document.getElementById("modelLangs");
	const modelSize = document.getElementById("modelSize");

	// Hotword settings
	const hotwordsList = document.getElementById("hotwordsList");
	const boostScore = document.getElementById("boostScore");
	const boostValue = document.getElementById("boostValue");
	const applyBtn = document.getElementById("applyHotwords");
	const hotwordStatus = document.getElementById("hotwordStatus");

	// Microphone panel
	const micNameElem = document.getElementById("micName");
	const sampleRateElem = document.getElementById("sampleRate");
	const vol = document.getElementById("vol");

	// Transcript output
	const transcript = document.getElementById("transcript");

	// Refresh the "Hotword Bias" indicator. It reads "On" only when the
	// textarea holds at least one non-empty line and the boost slider is
	// above zero; otherwise "Off".
	function updateHotwordStatus() {
	  const hasHotwords = hotwordsList.value.split(/\r?\n/).some(Boolean);
	  const boostIsPositive = parseFloat(boostScore.value) > 0;

	  let label = "Hotword Bias: Off";
	  if (hasHotwords && boostIsPositive) {
	    label = "Hotword Bias: On";
	  }
	  hotwordStatus.textContent = label;
	}

	// Show the selected model's languages and size. A `language` array is
	// rendered as a comma-separated list; a plain string is shown as-is.
	function updateModelInfo() {
	  const { language, size } = MODEL_METADATA[modelSelect.value];
	  modelLangs.textContent = Array.isArray(language)
	    ? language.join(", ")
	    : language;
	  modelSize.textContent = size;
	}

	// Request the microphone, then wire up everything that depends on it: the
	// mic info display, the WebSocket, the settings listeners and the audio
	// capture that streams raw samples to the server.
	//
	// navigator.mediaDevices is undefined in insecure contexts; turning that
	// case into a rejected promise keeps the rest of this script running and
	// routes the failure through the same handler as a denied permission.
	const micRequest = navigator.mediaDevices && navigator.mediaDevices.getUserMedia
	  ? navigator.mediaDevices.getUserMedia({ audio: true })
	  : Promise.reject(new Error("getUserMedia is not available in this browser or context"));

	micRequest.then(stream => {
	  const context = new AudioContext();
	  orig_sample_rate = context.sampleRate;

	  // Update mic info in UI
	  const track = stream.getAudioTracks()[0];
	  micNameElem.textContent = track.label || 'Unknown';
	  sampleRateElem.textContent = orig_sample_rate;

	  updateModelInfo();

	  // Now that we know the sample rate, open the WS. The scheme follows the
	  // page: wss on https, ws on plain http (e.g. a local dev server), where a
	  // hard-coded wss:// URL would fail to connect.
	  const wsScheme = location.protocol === "https:" ? "wss" : "ws";
	  ws = new WebSocket(`${wsScheme}://${location.host}/ws`);
	  ws.onopen = () => sendConfig();
	  ws.onerror = err => console.error("WebSocket error:", err);
	  ws.onclose = () => console.log("WebSocket closed");

	  // Rebuild the transcript from DOM nodes rather than an HTML string, so
	  // recognized text containing "<" or "&" is displayed literally instead of
	  // being parsed as markup.
	  const renderTranscript = () => {
	    const fragment = document.createDocumentFragment();

	    for (const utterance of finalUtterances) {
	      const finalSpan = document.createElement("span");
	      finalSpan.className = "final";
	      finalSpan.textContent = utterance;
	      fragment.appendChild(finalSpan);
	    }

	    if (currentInterim) {
	      // Single space before the interim text, as in the original markup
	      fragment.appendChild(document.createTextNode(" "));
	      const interimSpan = document.createElement("span");
	      interimSpan.className = "interim";
	      interimSpan.textContent = currentInterim;
	      fragment.appendChild(interimSpan);
	    }

	    transcript.textContent = "";
	    transcript.appendChild(fragment);
	  };

	  // Unified handler for volume, partial and final messages
	  ws.onmessage = e => {
	    let msg;
	    try {
	      msg = JSON.parse(e.data);
	    } catch (parseError) {
	      // A malformed frame should not break handling of later frames
	      console.error("Ignoring non-JSON WebSocket message:", parseError);
	      return;
	    }

	    // 1) update volume bar (clamped to the <progress> max of 1)
	    if (msg.volume !== undefined) {
	      vol.value = Math.min(msg.volume, 1.0);
	    }

	    // 2) a "final" locks in an utterance and clears the interim text;
	    //    a "partial" replaces the interim text
	    if (msg.final !== undefined) {
	      finalUtterances.push(msg.final.trim());
	      currentInterim = "";
	    } else if (msg.partial !== undefined) {
	      currentInterim = msg.partial;
	    }

	    // 3) redraw, then 4) auto-scroll to the newest text
	    renderTranscript();
	    transcript.scrollTop = transcript.scrollHeight;
	  };

	  // Any settings change re-sends the full config and refreshes the indicator
	  modelSelect.addEventListener("change", () => {
	    updateModelInfo();
	    sendConfig();
	    updateHotwordStatus();
	  });
	  precisionSelect.addEventListener("change", () => {
	    sendConfig();
	    updateHotwordStatus();
	  });
	  applyBtn.addEventListener("click", () => {
	    sendConfig();
	    updateHotwordStatus();
	  });

	  // Update boost display and status on slider input
	  boostScore.addEventListener("input", () => {
	    boostValue.textContent = boostScore.value;
	    updateHotwordStatus();
	  });

	  // Capture mono audio in 4096-frame buffers and forward each one as raw
	  // Float32 samples. NOTE(review): createScriptProcessor is deprecated in
	  // favour of AudioWorklet; left as-is because a worklet needs a separate
	  // module file.
	  const source = context.createMediaStreamSource(stream);
	  const processor = context.createScriptProcessor(4096, 1, 1);
	  source.connect(processor);
	  processor.connect(context.destination);
	  processor.onaudioprocess = e => {
	    // send() throws InvalidStateError while the socket is still CONNECTING,
	    // and audio starts flowing before the handshake completes, so drop
	    // buffers until the socket is open.
	    if (ws.readyState !== WebSocket.OPEN) {
	      return;
	    }
	    const input = e.inputBuffer.getChannelData(0);
	    ws.send(new Float32Array(input).buffer);
	  };
	}).catch(err => {
	  // Permission denied, no device, or setup failure: report it rather than
	  // leaving "Detecting…" on screen indefinitely.
	  console.error("Microphone setup failed:", err);
	  micNameElem.textContent = "Unavailable";
	});

	// Transcript state shared with the WebSocket message handler:
	// utterances already finalized, and the current in-progress hypothesis.
	const finalUtterances = [];
	let currentInterim = "";

	// Endpoint-detection inputs and their apply button
	const epRule1Input = document.getElementById("epRule1");
	const epRule2Input = document.getElementById("epRule2");
	const epRule3Input = document.getElementById("epRule3");
	const applyEndpointBtn = document.getElementById("applyEndpointConfig");

	// Send the current UI settings to the server as a single JSON "config"
	// message. Does nothing unless the WebSocket exists and is open.
	function sendConfig() {
	  const socketIsOpen = ws && ws.readyState === WebSocket.OPEN;
	  if (!socketIsOpen) {
	    return;
	  }

	  const payload = {
	    type: "config",
	    sampleRate: orig_sample_rate,
	    model: modelSelect.value,
	    precision: precisionSelect.value,
	    // one hotword per non-empty line
	    hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
	    hotwordsScore: parseFloat(boostScore.value),

	    // endpoint-detection rules
	    epRule1: parseFloat(epRule1Input.value),
	    epRule2: parseFloat(epRule2Input.value),
	    epRule3: parseInt(epRule3Input.value, 10),
	  };

	  ws.send(JSON.stringify(payload));
	}

	// Clicking "Apply Endpoint Config" re-sends the full config, which
	// includes the three endpoint rule values.
	applyEndpointBtn.addEventListener("click", () => sendConfig());

	// NOTE: the server-message handler is attached inside the getUserMedia
	// callback above, immediately after the WebSocket is created. It must not
	// be assigned here: this point is reached during initial script
	// evaluation, when `ws` is still undefined, so a top-level
	// `ws.onmessage = ...` throws a TypeError on every page load.

	</script>
	</body>
	</html>