Spaces:

Vokturz
/

transformers-js-playground

Running

File size: 5,453 Bytes

/* eslint-disable no-restricted-globals */
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@latest'
import { KokoroTTS } from 'https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.js'

/**
 * Lazily created, cached text-to-speech pipeline.
 *
 * The pipeline is built once per (model, dtype) pair and reused by later
 * calls; asking for a different model or dtype replaces the cached instance.
 */
class MyTextToSpeechPipeline {
  static task = 'text-to-speech'
  static instance = null
  // Identifies the (model, dtype) pair that `instance` was created for.
  static instanceKey = null

  /**
   * Returns the pipeline for `model`, creating it on first use.
   *
   * WebGPU is attempted first. If that attempt throws, `progress_callback`
   * (when provided) receives a `{ status: 'fallback' }` notice and creation
   * is retried on WASM.
   *
   * @param {string} model - Model identifier handed to `pipeline`.
   * @param {string} [dtype='fp32'] - `dtype` option handed to `pipeline`.
   * @param {?Function} [progress_callback=null] - Passed through to
   *   `pipeline`; also called directly with the fallback notice.
   * @returns {Promise<*>} Whatever `pipeline` resolves to.
   * @throws {Error} When both attempts fail. The message contains both
   *   underlying messages and `cause` holds the WASM error.
   */
  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
    const key = JSON.stringify([model, dtype])

    // Reuse the already-built pipeline instead of rebuilding it on every
    // call (the worker calls this for each incoming message).
    if (this.instance !== null && this.instanceKey === key) {
      return this.instance
    }

    const create = (device) =>
      pipeline(this.task, model, {
        dtype,
        device,
        progress_callback,
        quantized: false
      })

    let created
    try {
      // Try WebGPU first
      created = await create('webgpu')
    } catch (webgpuError) {
      // Fallback to WASM if WebGPU fails
      if (progress_callback) {
        progress_callback({
          status: 'fallback',
          message: 'WebGPU failed, falling back to WASM'
        })
      }
      try {
        created = await create('wasm')
      } catch (wasmError) {
        throw new Error(
          `Both WebGPU and WASM failed. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`,
          { cause: wasmError }
        )
      }
    }

    // Only publish the cache entry once creation has fully succeeded, so a
    // failed attempt never leaves a key pointing at a stale instance.
    this.instance = created
    this.instanceKey = key
    return created
  }
}

/**
 * Lazily created, cached Kokoro TTS model.
 *
 * The model is loaded once per (model, dtype) pair and reused by later
 * calls; asking for a different model or dtype replaces the cached instance.
 */
class MyKokoroTTSPipeline {
  static instance = null
  // Identifies the (model, dtype) pair that `instance` was created for.
  static instanceKey = null

  /**
   * Returns the Kokoro TTS instance for `model`, loading it on first use.
   *
   * WebGPU is attempted first. If that attempt throws, `progress_callback`
   * (when provided) receives a `{ status: 'fallback' }` notice and loading
   * is retried on WASM.
   *
   * @param {string} model - Model identifier handed to
   *   `KokoroTTS.from_pretrained`.
   * @param {string} [dtype='fp32'] - `dtype` option handed to
   *   `KokoroTTS.from_pretrained`.
   * @param {?Function} [progress_callback=null] - Receives a start notice,
   *   the fallback notice, and every loader event merged over
   *   `{ status: 'loading' }`.
   * @returns {Promise<*>} Whatever `KokoroTTS.from_pretrained` resolves to.
   * @throws {Error} When both attempts fail. The message contains both
   *   underlying messages and `cause` holds the WASM error.
   */
  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
    const key = JSON.stringify([model, dtype])

    // Reuse the already-loaded model instead of reloading it on every call
    // (the worker calls this for each incoming message).
    if (this.instance !== null && this.instanceKey === key) {
      return this.instance
    }

    // Loader events are forwarded with a default 'loading' status; a status
    // carried by the event itself wins because it is spread last.
    const forwardProgress = progress_callback
      ? (data) => {
          progress_callback({
            status: 'loading',
            ...data
          })
        }
      : null

    const load = (device) =>
      KokoroTTS.from_pretrained(model, {
        dtype,
        device,
        progress_callback: forwardProgress
      })

    let loaded
    try {
      const device = 'webgpu'
      if (progress_callback) {
        progress_callback({
          status: 'loading',
          message: `Loading Kokoro TTS model with ${device} device`
        })
      }
      loaded = await load(device)
    } catch (webgpuError) {
      // Fallback to WASM if WebGPU fails
      if (progress_callback) {
        progress_callback({
          status: 'fallback',
          message: 'WebGPU failed, falling back to WASM'
        })
      }
      try {
        loaded = await load('wasm')
      } catch (wasmError) {
        throw new Error(
          `Both WebGPU and WASM failed for Kokoro TTS. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`,
          { cause: wasmError }
        )
      }
    }

    // Only publish the cache entry once loading has fully succeeded, so a
    // failed attempt never leaves a key pointing at a stale instance.
    this.instance = loaded
    this.instanceKey = key
    return loaded
  }
}

/**
 * Best-effort download of speaker embeddings.
 *
 * @param {string} url - Location of the raw embeddings, read as float32.
 * @returns {Promise<?Float32Array>} The embeddings, or null when the request
 *   throws, answers with a non-OK status, or the payload cannot be viewed as
 *   float32. Failures are logged as warnings rather than raised, so that
 *   synthesis can still run without embeddings.
 */
async function loadSpeakerEmbeddings(url) {
  try {
    const response = await fetch(url)
    if (!response.ok) {
      console.warn(
        `Failed to load speaker embeddings: HTTP ${response.status}`
      )
      return null
    }
    return new Float32Array(await response.arrayBuffer())
  } catch (error) {
    console.warn('Failed to load speaker embeddings:', error)
    return null
  }
}

/**
 * Worker entry point.
 *
 * Expects `event.data` of the form
 * `{ type, model, dtype, text, isStyleTTS2, config }` and replies through
 * `self.postMessage` with `{ status, output }` messages:
 *   - 'loading' while the model is being fetched/initialised,
 *   - 'ready'   after a 'load' request, and after each synthesis,
 *   - 'output'  with `{ audio, sampling_rate }` for a 'synthesize' request,
 *   - 'error'   with a message string when anything fails.
 */
self.addEventListener('message', async (event) => {
  try {
    const { type, model, dtype, text, isStyleTTS2, config } = event.data

    if (!model) {
      self.postMessage({
        status: 'error',
        output: 'No model provided'
      })
      return
    }

    // Resolve the default up front so the dtype that is reported in the
    // 'ready' message is the one that was actually used.
    const resolvedDtype = dtype || (isStyleTTS2 ? 'q8' : 'fp32')
    const reportProgress = (x) => {
      self.postMessage({ status: 'loading', output: x })
    }

    // Kokoro TTS handles StyleTTS2 models; everything else goes through the
    // standard transformers pipeline.
    const synthesizer = isStyleTTS2
      ? await MyKokoroTTSPipeline.getInstance(
          model,
          resolvedDtype,
          reportProgress
        )
      : await MyTextToSpeechPipeline.getInstance(
          model,
          resolvedDtype,
          reportProgress
        )

    if (type === 'load') {
      self.postMessage({
        status: 'ready',
        output: `Model ${model}${isStyleTTS2 ? ' StyleTTS2' : ''}, dtype ${resolvedDtype} loaded`
      })
      return
    }

    if (type !== 'synthesize') {
      return
    }

    if (!text || typeof text !== 'string' || text.trim() === '') {
      self.postMessage({
        status: 'error',
        output: 'No text provided for synthesis'
      })
      return
    }

    const input = text.trim()
    let output

    if (isStyleTTS2) {
      // `config` is optional in the message; read it defensively, the same
      // way the speaker-embeddings branch below does. With no voice the
      // option is simply undefined (presumably the library then applies its
      // own default voice; see the kokoro-js docs).
      const options = { voice: config?.voice }
      const audioResult = await synthesizer.generate(input, options)

      output = {
        audio: Array.from(audioResult.audio),
        sampling_rate: audioResult.sampling_rate || 24000 // Default for Kokoro
      }
    } else {
      const options = {}

      if (config?.speakerEmbeddings) {
        const embeddings = await loadSpeakerEmbeddings(
          config.speakerEmbeddings
        )
        if (embeddings) {
          options.speaker_embeddings = embeddings
        }
      }

      const result = await synthesizer(input, options)
      output = {
        audio: Array.from(result.audio),
        sampling_rate: result.sampling_rate
      }
    }

    self.postMessage({
      status: 'output',
      output
    })

    self.postMessage({ status: 'ready' })
  } catch (error) {
    // `error` may be any thrown value, including null/undefined.
    self.postMessage({
      status: 'error',
      output:
        error?.message || 'An error occurred during text-to-speech synthesis'
    })
  }
})