|
|
|
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@latest' |
|
import { KokoroTTS } from 'https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.js' |
|
|
|
class MyTextToSpeechPipeline {

  // transformers.js task identifier used when constructing the pipeline.
  static task = 'text-to-speech'

  // Cached pipeline instance; loading a model is expensive, so it is reused.
  static instance = null

  // Identifies which model/dtype the cached instance was built for.
  static instanceKey = null

  /**
   * Lazily create (and cache) a text-to-speech pipeline.
   * Tries the WebGPU backend first and falls back to WASM on failure.
   *
   * @param {string} model - Model id/path passed to transformers.js `pipeline`.
   * @param {string} [dtype='fp32'] - Weight precision (e.g. 'fp32', 'q8').
   * @param {?Function} [progress_callback=null] - Receives loading/fallback status objects.
   * @returns {Promise<*>} The synthesizer pipeline function.
   * @throws {Error} When both WebGPU and WASM initialization fail.
   */
  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
    // BUG FIX: the cached instance was never consulted, so every call
    // re-downloaded and re-initialized the model. Reuse it when the same
    // model/dtype is requested again.
    const key = `${model}|${dtype}`
    if (this.instance && this.instanceKey === key) {
      return this.instance
    }

    try {
      this.instance = await pipeline(this.task, model, {
        dtype,
        device: 'webgpu',
        progress_callback,
        quantized: false
      })
      this.instanceKey = key
      return this.instance
    } catch (webgpuError) {
      // Inform the caller that we are retrying on the slower WASM backend.
      progress_callback?.({
        status: 'fallback',
        message: 'WebGPU failed, falling back to WASM'
      })

      try {
        this.instance = await pipeline(this.task, model, {
          dtype,
          device: 'wasm',
          progress_callback,
          quantized: false
        })
        this.instanceKey = key
        return this.instance
      } catch (wasmError) {
        throw new Error(
          `Both WebGPU and WASM failed. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`
        )
      }
    }
  }
}
|
|
|
class MyKokoroTTSPipeline {

  // Cached Kokoro TTS instance; model loading is expensive, so it is reused.
  static instance = null

  // Identifies which model/dtype the cached instance was built for.
  static instanceKey = null

  /**
   * Lazily create (and cache) a Kokoro TTS instance.
   * Tries the WebGPU backend first and falls back to WASM on failure.
   *
   * @param {string} model - Model id/path passed to `KokoroTTS.from_pretrained`.
   * @param {string} [dtype='fp32'] - Weight precision (e.g. 'fp32', 'q8').
   * @param {?Function} [progress_callback=null] - Receives loading/fallback status objects.
   * @returns {Promise<*>} The KokoroTTS instance.
   * @throws {Error} When both WebGPU and WASM initialization fail.
   */
  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
    // BUG FIX: the cached instance was never consulted, so every call
    // re-downloaded and re-initialized the model. Reuse it when the same
    // model/dtype is requested again.
    const key = `${model}|${dtype}`
    if (this.instance && this.instanceKey === key) {
      return this.instance
    }

    // Wrap the caller's callback so library progress events carry a
    // consistent `status: 'loading'` tag. Defined once instead of being
    // duplicated in both backend branches.
    const wrappedProgress = progress_callback
      ? (data) => {
          progress_callback({
            status: 'loading',
            ...data
          })
        }
      : null

    try {
      const device = 'webgpu'
      progress_callback?.({
        status: 'loading',
        message: `Loading Kokoro TTS model with ${device} device`
      })

      this.instance = await KokoroTTS.from_pretrained(model, {
        dtype,
        device,
        progress_callback: wrappedProgress
      })
      this.instanceKey = key
      return this.instance
    } catch (webgpuError) {
      // Inform the caller that we are retrying on the slower WASM backend.
      progress_callback?.({
        status: 'fallback',
        message: 'WebGPU failed, falling back to WASM'
      })

      try {
        this.instance = await KokoroTTS.from_pretrained(model, {
          dtype,
          device: 'wasm',
          progress_callback: wrappedProgress
        })
        this.instanceKey = key
        return this.instance
      } catch (wasmError) {
        throw new Error(
          `Both WebGPU and WASM failed for Kokoro TTS. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`
        )
      }
    }
  }
}
|
|
|
// Worker message protocol:
//   in:  { type: 'load' | 'synthesize', model, dtype?, text?, isStyleTTS2?, config? }
//   out: { status: 'loading' | 'ready' | 'output' | 'error' | 'fallback', output? }
self.addEventListener('message', async (event) => {
  try {
    const { type, model, dtype, text, isStyleTTS2, config } = event.data

    if (!model) {
      self.postMessage({
        status: 'error',
        output: 'No model provided'
      })
      return
    }

    // Forward model-loading progress to the main thread. Defined once and
    // shared by both engines instead of duplicating the lambda.
    const reportProgress = (x) => {
      self.postMessage({ status: 'loading', output: x })
    }

    // Resolve the synthesizer for the requested engine (cached after first load).
    const synthesizer = isStyleTTS2
      ? await MyKokoroTTSPipeline.getInstance(model, dtype || 'q8', reportProgress)
      : await MyTextToSpeechPipeline.getInstance(model, dtype || 'fp32', reportProgress)

    if (type === 'load') {
      self.postMessage({
        status: 'ready',
        output: `Model ${model}${isStyleTTS2 ? ' StyleTTS2' : ''}, dtype ${dtype} loaded`
      })
      return
    }

    if (type === 'synthesize') {
      if (!text || typeof text !== 'string' || text.trim() === '') {
        self.postMessage({
          status: 'error',
          output: 'No text provided for synthesis'
        })
        return
      }

      // NOTE: the original wrapped this section in a try/catch that only
      // rethrew into the outer catch; that no-op handler was removed.
      let output

      if (isStyleTTS2) {
        const options = {}

        // BUG FIX: `config.voice` was read without a guard, so a message
        // with no `config` threw a TypeError. Fall back to the engine's
        // default voice when none is supplied.
        if (config?.voice) {
          options.voice = config.voice
        }
        const audioResult = await synthesizer.generate(text.trim(), options)

        output = {
          audio: Array.from(audioResult.audio),
          sampling_rate: audioResult.sampling_rate || 24000
        }
      } else {
        const options = {}

        if (config?.speakerEmbeddings) {
          try {
            const response = await fetch(config.speakerEmbeddings)
            if (response.ok) {
              const embeddings = await response.arrayBuffer()
              options.speaker_embeddings = new Float32Array(embeddings)
            }
          } catch (error) {
            // Best-effort: synthesis proceeds with the default speaker.
            console.warn('Failed to load speaker embeddings:', error)
          }
        }

        const result = await synthesizer(text.trim(), options)
        output = {
          audio: Array.from(result.audio),
          sampling_rate: result.sampling_rate
        }
      }

      self.postMessage({
        status: 'output',
        output
      })

      // Signal that the worker is ready for the next request.
      self.postMessage({ status: 'ready' })
    }
  } catch (error) {
    self.postMessage({
      status: 'error',
      output:
        error.message || 'An error occurred during text-to-speech synthesis'
    })
  }
})
|
|