nisten committed
Commit f7fbf77 · verified · 1 Parent(s): 12d1f78

Update src/worker.js

Files changed (1)
  1. src/worker.js +177 -102
src/worker.js CHANGED
@@ -26,11 +26,24 @@ import {
   MIN_SPEECH_DURATION_SAMPLES,
 } from "./constants";
 
+// WebGPU availability check - fail fast
+if (!navigator.gpu) {
+  self.postMessage({
+    type: "error",
+    error: new Error("WebGPU not supported. This app requires Chrome 113+, Edge 113+, or Chrome Canary with WebGPU enabled."),
+  });
+  throw new Error("WebGPU not available");
+}
+
+// TTS Configuration
 const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
 let voice;
 const tts = await KokoroTTS.from_pretrained(model_id, {
-  dtype: "fp32",
+  dtype: "fp16", // Keep fp16 for memory efficiency
   device: "webgpu",
+}).catch((error) => {
+  self.postMessage({ error: new Error(`TTS loading failed: ${error.message}`) });
+  throw error;
 });
 
 const device = "webgpu";
@@ -41,18 +54,19 @@ self.postMessage({
   duration: "until_next",
 });
 
-// Load models
+// Load VAD model
 const silero_vad = await AutoModel.from_pretrained(
   "onnx-community/silero-vad",
   {
     config: { model_type: "custom" },
-    dtype: "fp32", // Full-precision
+    dtype: "fp32",
   },
 ).catch((error) => {
-  self.postMessage({ error });
+  self.postMessage({ error: new Error(`VAD loading failed: ${error.message}`) });
   throw error;
 });
 
+// Whisper configuration
 const DEVICE_DTYPE_CONFIGS = {
   webgpu: {
     encoder_model: "fp32",
@@ -63,37 +77,66 @@ const DEVICE_DTYPE_CONFIGS = {
     decoder_model_merged: "q8",
   },
 };
+
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
+  "onnx-community/whisper-base",
   {
     device,
     dtype: DEVICE_DTYPE_CONFIGS[device],
+    // Specify language to avoid warnings
+    language: "en",
+    task: "transcribe",
   },
 ).catch((error) => {
-  self.postMessage({ error });
+  self.postMessage({ error: new Error(`Whisper loading failed: ${error.message}`) });
   throw error;
 });
 
-await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
+// Warm up the transcriber
+await transcriber(new Float32Array(INPUT_SAMPLE_RATE));
 
-const llm_model_id = "onnx-community/Qwen3-1.7B-ONNX";
-const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
-const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
+// LLM Configuration - Split tokenizer and model sources
+const TOKENIZER_MODEL_ID = "Qwen/Qwen3-1.7B"; // Original repo has tokenizer
+const ONNX_MODEL_ID = "onnx-community/Qwen3-1.7B-ONNX"; // ONNX weights
+
+// Load tokenizer from original repo
+const tokenizer = await AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID).catch((error) => {
+  self.postMessage({ error: new Error(`Tokenizer loading failed: ${error.message}`) });
+  throw error;
+});
+
+// Load ONNX model weights
+const llm = await AutoModelForCausalLM.from_pretrained(ONNX_MODEL_ID, {
   dtype: "q4f16",
   device: "webgpu",
+  // Add model-specific config for Qwen3
+  model_config: {
+    use_cache: true,
+    attention_bias: false,
+  },
+}).catch((error) => {
+  self.postMessage({ error: new Error(`LLM loading failed: ${error.message}`) });
+  throw error;
 });
 
+// System prompt optimized for conversational AI
 const SYSTEM_MESSAGE = {
   role: "system",
   content:
-    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
+    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual. Focus on being natural and engaging in conversation.",
 };
-await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
 
+// Warm up the LLM
+await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 });
+
+// Conversation state
 let messages = [SYSTEM_MESSAGE];
 let past_key_values_cache;
 let stopping_criteria;
+const MAX_CONTEXT_MESSAGES = 20; // Prevent unbounded memory growth
+
+// Send ready signal with available voices
 self.postMessage({
   type: "status",
   status: "ready",
@@ -101,17 +144,17 @@ self.postMessage({
   voices: tts.voices,
 });
 
-// Global audio buffer to store incoming audio
+// Audio processing state
 const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
 let bufferPointer = 0;
 
-// Initial state for VAD
+// VAD state
 const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
 let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
 
-// Whether we are in the process of adding audio to the buffer
+// Recording state
 let isRecording = false;
-let isPlaying = false; // new flag
+let isPlaying = false;
 
 /**
  * Perform Voice Activity Detection (VAD)
@@ -122,86 +165,126 @@ async function vad(buffer) {
   const input = new Tensor("float32", buffer, [1, buffer.length]);
 
   const { stateN, output } = await silero_vad({ input, sr, state });
-  state = stateN; // Update state
+  state = stateN;
 
   const isSpeech = output.data[0];
 
-  // Use heuristics to determine if the buffer is speech or not
   return (
-    // Case 1: We are above the threshold (definitely speech)
     isSpeech > SPEECH_THRESHOLD ||
-    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
     (isRecording && isSpeech >= EXIT_THRESHOLD)
   );
 }
 
 /**
- * Transcribe the audio buffer
+ * Handle speech-to-speech pipeline
  * @param {Float32Array} buffer The audio buffer
- * @param {Object} data Additional data
+ * @param {Object} data Additional timing data
  */
 const speechToSpeech = async (buffer, data) => {
   isPlaying = true;
 
-  // 1. Transcribe the audio from the user
-  const text = await transcriber(buffer).then(({ text }) => text.trim());
-  if (["", "[BLANK_AUDIO]"].includes(text)) {
-    // If the transcription is empty or a blank audio, we skip the rest of the processing
-    return;
-  }
-  messages.push({ role: "user", content: text });
-
-  // Set up text-to-speech streaming
-  const splitter = new TextSplitterStream();
-  const stream = tts.stream(splitter, {
-    voice,
-  });
-  (async () => {
-    for await (const { text, phonemes, audio } of stream) {
-      self.postMessage({ type: "output", text, result: audio });
-    }
-  })();
-
-  // 2. Generate a response using the LLM
-  const inputs = tokenizer.apply_chat_template(messages, {
-    add_generation_prompt: true,
-    return_dict: true,
-  });
-  const streamer = new TextStreamer(tokenizer, {
-    skip_prompt: true,
-    skip_special_tokens: true,
-    callback_function: (text) => {
-      splitter.push(text);
-    },
-    token_callback_function: () => {},
-  });
-
-  stopping_criteria = new InterruptableStoppingCriteria();
-  const { past_key_values, sequences } = await llm.generate({
-    ...inputs,
-    past_key_values: past_key_values_cache,
-
-    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
-    max_new_tokens: 1024,
-    streamer,
-    stopping_criteria,
-    return_dict_in_generate: true,
-  });
-  past_key_values_cache = past_key_values;
-
-  // Finally, close the stream to signal that no more text will be added.
-  splitter.close();
-
-  const decoded = tokenizer.batch_decode(
-    sequences.slice(null, [inputs.input_ids.dims[1], null]),
-    { skip_special_tokens: true },
-  );
-
-  messages.push({ role: "assistant", content: decoded[0] });
+  try {
+    // 1. Transcribe audio
+    const transcription = await transcriber(buffer);
+    const text = transcription.text?.trim() || "";
+
+    if (!text || text === "[BLANK_AUDIO]") {
+      isPlaying = false;
+      return;
+    }
+
+    // Add user message
+    messages.push({ role: "user", content: text });
+
+    // Manage context window
+    if (messages.length > MAX_CONTEXT_MESSAGES) {
+      messages = [SYSTEM_MESSAGE, ...messages.slice(-(MAX_CONTEXT_MESSAGES - 1))];
+      past_key_values_cache = null; // Reset cache when context changes
+    }
+
+    // Set up TTS streaming
+    const splitter = new TextSplitterStream();
+    const stream = tts.stream(splitter, { voice });
+
+    // Stream TTS output
+    (async () => {
+      try {
+        for await (const { text, phonemes, audio } of stream) {
+          self.postMessage({ type: "output", text, result: audio });
+        }
+      } catch (error) {
+        console.error("TTS streaming error:", error);
+      }
+    })();
+
+    // 2. Generate LLM response
+    const inputs = tokenizer.apply_chat_template(messages, {
+      add_generation_prompt: true,
+      return_dict: true,
+      // Qwen3 specific - disable thinking mode for conversational use
+      enable_thinking: false,
+    });
+
+    const streamer = new TextStreamer(tokenizer, {
+      skip_prompt: true,
+      skip_special_tokens: true,
+      callback_function: (text) => {
+        splitter.push(text);
+      },
+      token_callback_function: () => {},
+    });
+
+    stopping_criteria = new InterruptableStoppingCriteria();
+
+    // Generate with appropriate settings for Qwen3
+    const { past_key_values, sequences } = await llm.generate({
+      ...inputs,
+      past_key_values: past_key_values_cache,
+
+      // Qwen3 optimal settings for non-thinking mode
+      do_sample: true,
+      temperature: 0.7,
+      top_p: 0.8,
+      top_k: 20,
+      max_new_tokens: 512, // Keep responses concise for voice
+
+      streamer,
+      stopping_criteria,
+      return_dict_in_generate: true,
+
+      // Ensure proper EOS handling for Qwen3
+      eos_token_id: [151643, 151645],
+      pad_token_id: tokenizer.pad_token_id,
+    });
+
+    past_key_values_cache = past_key_values;
+
+    // Close the TTS stream
+    splitter.close();
+
+    // Decode and store assistant response
+    const decoded = tokenizer.batch_decode(
+      sequences.slice(null, [inputs.input_ids.dims[1], null]),
+      { skip_special_tokens: true },
+    );
+
+    messages.push({ role: "assistant", content: decoded[0] });
+
+  } catch (error) {
+    console.error("Speech-to-speech error:", error);
+    self.postMessage({
+      type: "error",
+      error: new Error(`Processing failed: ${error.message}`),
+    });
+  } finally {
+    isPlaying = false;
+  }
 };
 
-// Track the number of samples after the last speech chunk
+// Audio buffer management
 let postSpeechSamples = 0;
+let prevBuffers = [];
+
 const resetAfterRecording = (offset = 0) => {
   self.postMessage({
     type: "status",
@@ -216,39 +299,39 @@ const resetAfterRecording = (offset = 0) => {
 };
 
 const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
-  // Get start and end time of the speech segment, minus the padding
   const now = Date.now();
-  const end =
-    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+  const end = now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
   const duration = end - start;
   const overflowLength = overflow?.length ?? 0;
 
-  // Send the audio buffer to the worker
+  // Prepare padded buffer
   const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
-
   const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
   const paddedBuffer = new Float32Array(prevLength + buffer.length);
+
   let offset = 0;
   for (const prev of prevBuffers) {
     paddedBuffer.set(prev, offset);
    offset += prev.length;
   }
   paddedBuffer.set(buffer, offset);
+
+  // Process speech
   speechToSpeech(paddedBuffer, { start, end, duration });
 
-  // Set overflow (if present) and reset the rest of the audio buffer
+  // Handle overflow
   if (overflow) {
     BUFFER.set(overflow, 0);
   }
   resetAfterRecording(overflowLength);
 };
 
-let prevBuffers = [];
+// Message handler
 self.onmessage = async (event) => {
   const { type, buffer } = event.data;
 
-  // refuse new audio while playing back
+  // Block audio during playback
   if (type === "audio" && isPlaying) return;
 
   switch (type) {
@@ -260,6 +343,7 @@ self.onmessage = async (event) => {
     case "end_call":
       messages = [SYSTEM_MESSAGE];
       past_key_values_cache = null;
+    // Fall through to interrupt
    case "interrupt":
      stopping_criteria?.interrupt();
      return;
@@ -271,15 +355,13 @@ self.onmessage = async (event) => {
      return;
  }
 
-  const wasRecording = isRecording; // Save current state
+  // Process audio buffer
+  const wasRecording = isRecording;
  const isSpeech = await vad(buffer);
 
  if (!wasRecording && !isSpeech) {
-    // We are not recording, and the buffer is not speech,
-    // so we will probably discard the buffer. So, we insert
-    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
+    // Queue non-speech buffers for padding
    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
-      // If the queue is full, we discard the oldest buffer
      prevBuffers.shift();
    }
    prevBuffers.push(buffer);
@@ -288,25 +370,21 @@ self.onmessage = async (event) => {
 
  const remaining = BUFFER.length - bufferPointer;
  if (buffer.length >= remaining) {
-    // The buffer is larger than (or equal to) the remaining space in the global buffer,
-    // so we perform transcription and copy the overflow to the global buffer
+    // Buffer overflow - trigger transcription
    BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;
 
-    // Dispatch the audio buffer
    const overflow = buffer.subarray(remaining);
    dispatchForTranscriptionAndResetAudioBuffer(overflow);
    return;
  } else {
-    // The buffer is smaller than the remaining space in the global buffer,
-    // so we copy it to the global buffer
+    // Add to buffer
    BUFFER.set(buffer, bufferPointer);
    bufferPointer += buffer.length;
  }
 
  if (isSpeech) {
    if (!isRecording) {
-      // Indicate start of recording
      self.postMessage({
        type: "status",
        status: "recording_start",
@@ -314,25 +392,19 @@ self.onmessage = async (event) => {
        duration: "until_next",
      });
    }
-    // Start or continue recording
    isRecording = true;
-    postSpeechSamples = 0; // Reset the post-speech samples
+    postSpeechSamples = 0;
    return;
  }
 
  postSpeechSamples += buffer.length;
 
-  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
-  // So, we check whether we have reached the end of the current audio chunk.
+  // Check for end of speech
  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
-    // There was a short pause, but not long enough to consider the end of a speech chunk
-    // (e.g., the speaker took a breath), so we continue recording
    return;
  }
 
  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
-    // The entire buffer (including the new chunk) is smaller than the minimum
-    // duration of a speech chunk, so we can safely discard the buffer.
    resetAfterRecording();
    return;
  }
@@ -340,16 +412,19 @@ self.onmessage = async (event) => {
  dispatchForTranscriptionAndResetAudioBuffer();
 };
 
+// Greeting function
 function greet(text) {
  isPlaying = true;
  const splitter = new TextSplitterStream();
  const stream = tts.stream(splitter, { voice });
+
  (async () => {
    for await (const { text: chunkText, audio } of stream) {
      self.postMessage({ type: "output", text: chunkText, result: audio });
    }
  })();
+
  splitter.push(text);
  splitter.close();
  messages.push({ role: "assistant", content: text });
-}
+}
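
For reference, a minimal sketch of how a main-thread script might drive this worker. Only the message shapes come from worker.js above (posting { type: "audio", buffer }, { type: "interrupt" }, and { type: "end_call" }, and receiving "status", "output", and "error" replies); the worker URL, the 16 kHz sample rate, and the ScriptProcessorNode capture path are illustrative assumptions, not part of this commit.

// main.js - hypothetical harness, not part of this commit.
// Only the message types ("audio", "interrupt", "end_call", "status", "output",
// "error") are taken from worker.js above; everything else is an assumption.
const worker = new Worker(new URL("./worker.js", import.meta.url), { type: "module" });

const INPUT_SAMPLE_RATE = 16000; // assumed to match ./constants

worker.onmessage = ({ data }) => {
  switch (data.type) {
    case "status":
      // e.g. "ready" (with data.voices) or "recording_start"
      console.log("worker status:", data.status);
      break;
    case "output":
      // data.text is the spoken text chunk, data.result the TTS audio for it
      console.log("assistant:", data.text);
      break;
    case "error":
      console.error("worker error:", data.error);
      break;
    default:
      // Some load failures are posted as bare { error } objects
      if (data.error) console.error("worker error:", data.error);
  }
};

async function startMicrophone() {
  const media = await navigator.mediaDevices.getUserMedia({ audio: true });
  const ctx = new AudioContext({ sampleRate: INPUT_SAMPLE_RATE });
  const source = ctx.createMediaStreamSource(media);

  // ScriptProcessorNode keeps the sketch short; a real app would use an
  // AudioWorklet to produce Float32Array frames off the main thread.
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  processor.onaudioprocess = (event) => {
    const buffer = new Float32Array(event.inputBuffer.getChannelData(0));
    worker.postMessage({ type: "audio", buffer });
  };
  source.connect(processor);
  processor.connect(ctx.destination);
}

// Controls map directly onto the worker's onmessage switch.
const interrupt = () => worker.postMessage({ type: "interrupt" });
const endCall = () => worker.postMessage({ type: "end_call" });

startMicrophone();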