Vokturz committed
Commit 25647ae · 1 Parent(s): 63dbafb

add text-to-speech support
package.json CHANGED
@@ -5,6 +5,7 @@
   "private": true,
   "dependencies": {
     "@headlessui/react": "^2.2.4",
+    "@radix-ui/react-label": "^2.1.7",
     "@radix-ui/react-select": "^2.2.5",
     "@radix-ui/react-separator": "^1.1.7",
     "@radix-ui/react-slider": "^1.3.5",

pnpm-lock.yaml CHANGED
@@ -11,6 +11,9 @@ importers:
       '@headlessui/react':
         specifier: ^2.2.4
         version: 2.2.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+      '@radix-ui/react-label':
+        specifier: ^2.1.7
+        version: 2.1.7(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@radix-ui/react-select':
         specifier: ^2.2.5
         version: 2.2.5(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
@@ -540,6 +543,19 @@ packages:
       '@types/react':
         optional: true
 
+  '@radix-ui/react-label@2.1.7':
+    resolution: {integrity: sha512-YT1GqPSL8kJn20djelMX7/cTRp/Y9w5IZHvfxQTVHrOqa2yMl7i/UfMqKRU5V7mEyKTrUVgJXhNQPVCG8PBLoQ==}
+    peerDependencies:
+      '@types/react': '*'
+      '@types/react-dom': '*'
+      react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc
+      react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc
+    peerDependenciesMeta:
+      '@types/react':
+        optional: true
+      '@types/react-dom':
+        optional: true
+
   '@radix-ui/react-popper@1.2.7':
     resolution: {integrity: sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==}
     peerDependencies:
@@ -2696,6 +2712,15 @@ snapshots:
       optionalDependencies:
         '@types/react': 19.1.8
 
+  '@radix-ui/react-label@2.1.7(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
+    dependencies:
+      '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+      react: 19.1.0
+      react-dom: 19.1.0(react@19.1.0)
+    optionalDependencies:
+      '@types/react': 19.1.8
+      '@types/react-dom': 19.1.6(@types/react@19.1.8)
+
   '@radix-ui/react-popper@1.2.7(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
     dependencies:
       '@floating-ui/react-dom': 2.1.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0)

public/workers/text-to-speech.js ADDED
@@ -0,0 +1,120 @@
+/* eslint-disable no-restricted-globals */
+import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@latest'
+
+class MyTextToSpeechPipeline {
+  static task = 'text-to-speech'
+  static instance = null
+
+  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
+    try {
+      // Try WebGPU first
+      this.instance = await pipeline(this.task, model, {
+        dtype,
+        device: 'webgpu',
+        progress_callback,
+        quantized: false
+      })
+      return this.instance
+    } catch (webgpuError) {
+      // Fallback to WASM if WebGPU fails
+      if (progress_callback) {
+        progress_callback({
+          status: 'fallback',
+          message: 'WebGPU failed, falling back to WASM'
+        })
+      }
+      try {
+        this.instance = await pipeline(this.task, model, {
+          dtype,
+          device: 'wasm',
+          progress_callback,
+          quantized: false
+        })
+        return this.instance
+      } catch (wasmError) {
+        throw new Error(
+          `Both WebGPU and WASM failed. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`
+        )
+      }
+    }
+  }
+}
+
+// Listen for messages from the main thread
+self.addEventListener('message', async (event) => {
+  try {
+    const { type, model, dtype, text, config } = event.data
+
+    if (!model) {
+      self.postMessage({
+        status: 'error',
+        output: 'No model provided'
+      })
+      return
+    }
+
+    // Retrieve the pipeline. This will download the model if not already cached.
+    const synthesizer = await MyTextToSpeechPipeline.getInstance(
+      model,
+      dtype,
+      (x) => {
+        self.postMessage({ status: 'loading', output: x })
+      }
+    )
+
+    if (type === 'load') {
+      self.postMessage({
+        status: 'ready',
+        output: `Model ${model}, dtype ${dtype} loaded`
+      })
+      return
+    }
+
+    if (type === 'synthesize') {
+      if (!text || typeof text !== 'string' || text.trim() === '') {
+        self.postMessage({
+          status: 'error',
+          output: 'No text provided for synthesis'
+        })
+        return
+      }
+
+      const options = {}
+
+      // Add speaker embeddings if provided
+      if (config?.speakerEmbeddings) {
+        try {
+          const response = await fetch(config.speakerEmbeddings)
+          if (response.ok) {
+            const embeddings = await response.arrayBuffer()
+            options.speaker_embeddings = new Float32Array(embeddings)
+          }
+        } catch (error) {
+          console.warn('Failed to load speaker embeddings:', error)
+          // Continue without speaker embeddings
+        }
+      }
+
+      try {
+        const output = await synthesizer(text.trim(), options)
+
+        self.postMessage({
+          status: 'output',
+          output: {
+            audio: Array.from(output.audio),
+            sampling_rate: output.sampling_rate
+          }
+        })
+
+        self.postMessage({ status: 'ready' })
+      } catch (error) {
+        throw error
+      }
+    }
+  } catch (error) {
+    self.postMessage({
+      status: 'error',
+      output: error.message || 'An error occurred during text-to-speech synthesis'
+    })
+  }
+})
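
For reference, a minimal sketch of how the main thread can drive this worker. The message shape mirrors the TextToSpeechWorkerInput type and the status values handled above; the { type: 'module' } worker option (needed because the worker uses an ESM import) and the example model id are assumptions, not part of this commit.

const ttsWorker = new Worker('/workers/text-to-speech.js', { type: 'module' }) // module worker assumed

ttsWorker.addEventListener('message', (e: MessageEvent) => {
  const { status, output } = e.data
  if (status === 'output') {
    // The worker posts audio as a plain number[]; rebuild the Float32Array for playback
    const audio = new Float32Array(output.audio)
    console.log(`${audio.length} samples at ${output.sampling_rate} Hz`)
  } else if (status === 'error') {
    console.error(output)
  }
})

ttsWorker.postMessage({
  type: 'synthesize',
  text: 'Hello from the main thread',
  model: 'Xenova/speecht5_tts', // hypothetical example model id
  dtype: 'fp32',
  config: {
    speakerEmbeddings:
      'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'
  }
})
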
src/components/AudioPlayer.tsx ADDED
@@ -0,0 +1,434 @@
+import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
+import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
+
+interface AudioPlayerProps {
+  audio: Float32Array
+  samplingRate: number
+  text: string
+  index: number
+}
+
+function createWavBuffer(
+  audioData: Float32Array,
+  sampleRate: number
+): ArrayBuffer {
+  const length = audioData.length
+  const buffer = new ArrayBuffer(44 + length * 2)
+  const view = new DataView(buffer)
+
+  // WAV header
+  const writeString = (offset: number, string: string) => {
+    for (let i = 0; i < string.length; i++) {
+      view.setUint8(offset + i, string.charCodeAt(i))
+    }
+  }
+
+  writeString(0, 'RIFF')
+  view.setUint32(4, 36 + length * 2, true)
+  writeString(8, 'WAVE')
+  writeString(12, 'fmt ')
+  view.setUint32(16, 16, true)
+  view.setUint16(20, 1, true)
+  view.setUint16(22, 1, true)
+  view.setUint32(24, sampleRate, true)
+  view.setUint32(28, sampleRate * 2, true)
+  view.setUint16(32, 2, true)
+  view.setUint16(34, 16, true)
+  writeString(36, 'data')
+  view.setUint32(40, length * 2, true)
+
+  // Convert float32 to int16
+  let offset = 44
+  for (let i = 0; i < length; i++) {
+    const sample = Math.max(-1, Math.min(1, audioData[i]))
+    view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true)
+    offset += 2
+  }
+
+  return buffer
+}
+
+interface CustomAudioVisualizerProps {
+  audio: Float32Array
+  isPlaying: boolean
+  currentTime: number
+  duration: number
+  height?: number
+}
+
+function CustomAudioVisualizer({
+  audio,
+  isPlaying,
+  currentTime,
+  duration,
+  height = 80
+}: CustomAudioVisualizerProps) {
+  const canvasRef = useRef<HTMLCanvasElement>(null)
+  const containerRef = useRef<HTMLDivElement>(null)
+
+  // Memoize expensive calculations with smoothing
+  const waveformData = useMemo(() => {
+    if (!audio || audio.length === 0) return []
+
+    const samples = Math.min(audio.length, 1600) // Fixed high resolution
+    const step = audio.length / samples
+    const data = []
+
+    for (let i = 0; i < samples; i++) {
+      const sample = Math.abs(audio[Math.floor(i * step)])
+      data.push(sample)
+    }
+
+    // Apply smoothing filter
+    const smoothedData = []
+    const smoothingWindow = 3
+    for (let i = 0; i < data.length; i++) {
+      let sum = 0
+      let count = 0
+      for (
+        let j = Math.max(0, i - smoothingWindow);
+        j <= Math.min(data.length - 1, i + smoothingWindow);
+        j++
+      ) {
+        sum += data[j]
+        count++
+      }
+      smoothedData.push(sum / count)
+    }
+
+    return smoothedData
+  }, [audio])
+
+  // Add resize observer for true responsiveness
+  const [containerWidth, setContainerWidth] = useState(800)
+
+  useEffect(() => {
+    const container = containerRef.current
+    if (!container) return
+
+    const resizeObserver = new ResizeObserver((entries) => {
+      for (const entry of entries) {
+        setContainerWidth(entry.contentRect.width)
+      }
+    })
+
+    resizeObserver.observe(container)
+    setContainerWidth(container.getBoundingClientRect().width)
+
+    return () => resizeObserver.disconnect()
+  }, [])
+
+  useEffect(() => {
+    const canvas = canvasRef.current
+    const container = containerRef.current
+    if (!canvas || !container || waveformData.length === 0) return
+
+    const ctx = canvas.getContext('2d')
+    if (!ctx) return
+
+    // Use state-tracked container width for responsive behavior
+    const displayWidth = containerWidth
+    const displayHeight = height
+
+    // Set high DPI for sharper rendering
+    const dpr = window.devicePixelRatio || 1
+
+    canvas.width = displayWidth * dpr
+    canvas.height = displayHeight * dpr
+    canvas.style.width = `${displayWidth}px`
+    canvas.style.height = `${displayHeight}px`
+
+    ctx.scale(dpr, dpr)
+
+    const samples = waveformData.length
+
+    // Clear canvas
+    ctx.clearRect(0, 0, displayWidth, displayHeight)
+
+    // Enable smoothing for better quality
+    ctx.imageSmoothingEnabled = true
+    ctx.imageSmoothingQuality = 'high'
+
+    // Draw waveform bars with full width and logarithmic scaling
+    const actualBars = Math.min(samples, displayWidth) // Use full width
+    const barWidth = displayWidth / actualBars
+    const gap = Math.max(0, barWidth * 0.1)
+    const effectiveBarWidth = Math.max(1, barWidth - gap)
+
+    for (let i = 0; i < actualBars; i++) {
+      const x = i * barWidth
+      const dataIndex = Math.floor((i / actualBars) * samples)
+      const sample = waveformData[dataIndex] || 0
+
+      // Logarithmic scaling for better speech visualization
+      // This makes quiet sounds (silence) very small and speech much more prominent
+      const minThreshold = 0.01 // Silence threshold
+      const logSample =
+        sample > minThreshold
+          ? Math.log10(sample * 9 + 1) // Log scale: log10(sample * 9 + 1) ranges from 0 to 1
+          : sample * 0.1 // Very quiet for near-silence
+
+      const amplified = Math.pow(logSample * 3, 1.5) // Further emphasize speech
+      const barHeight = Math.max(2, amplified * displayHeight * 0.9)
+
+      const y = (displayHeight - barHeight) / 2
+
+      let barColor = '#6B7280'
+      if (duration > 0) {
+        const timePosition = (i / actualBars) * duration
+        if (isPlaying && timePosition <= currentTime) {
+          barColor = '#3B82F6'
+        } else if (isPlaying) {
+          barColor = '#9CA3AF'
+        }
+      }
+
+      ctx.fillStyle = barColor
+      ctx.fillRect(x, y, effectiveBarWidth, barHeight)
+
+      // Optional: Add rounded corners if supported
+      if (typeof ctx.roundRect === 'function') {
+        ctx.clearRect(x, y, effectiveBarWidth, barHeight)
+        ctx.beginPath()
+        const radius = Math.min(effectiveBarWidth / 4, 2)
+        ctx.roundRect(x, y, effectiveBarWidth, barHeight, radius)
+        ctx.fill()
+      }
+    }
+
+    // Draw progress line with gradient
+    if (isPlaying && duration > 0 && currentTime >= 0) {
+      const progressX = Math.min(
+        (currentTime / duration) * displayWidth,
+        displayWidth
+      )
+
+      // Create gradient for progress line
+      const gradient = ctx.createLinearGradient(0, 0, 0, displayHeight)
+      gradient.addColorStop(0, '#EF4444')
+      gradient.addColorStop(0.5, '#DC2626')
+      gradient.addColorStop(1, '#EF4444')
+
+      ctx.strokeStyle = gradient
+      ctx.lineWidth = 3
+      ctx.lineCap = 'round'
+      ctx.beginPath()
+      ctx.moveTo(progressX, 4)
+      ctx.lineTo(progressX, displayHeight - 4)
+      ctx.stroke()
+    }
+  }, [waveformData, isPlaying, currentTime, duration, height, containerWidth])
+
+  return (
+    <div ref={containerRef} className="w-full">
+      <canvas
+        ref={canvasRef}
+        className="w-full block"
+        style={{
+          width: '100%',
+          height: `${height}px`,
+          maxWidth: '100%',
+          display: 'block'
+        }}
+      />
+    </div>
+  )
+}
+
+function AudioPlayer({ audio, samplingRate, text, index }: AudioPlayerProps) {
+  const [isPlaying, setIsPlaying] = useState(false)
+  const [currentTime, setCurrentTime] = useState(0)
+  const [duration, setDuration] = useState(0)
+
+  const audioRef = useRef<HTMLAudioElement>(null)
+  const audioContextRef = useRef<AudioContext | null>(null)
+  const sourceRef = useRef<AudioBufferSourceNode | null>(null)
+  const startTimeRef = useRef<number>(0)
+  const animationFrameRef = useRef<number | null>(null)
+  const visualizerRef = useRef<HTMLCanvasElement>(null)
+
+  const stopAudio = useCallback(() => {
+    if (sourceRef.current) {
+      sourceRef.current.stop()
+      sourceRef.current = null
+    }
+    if (animationFrameRef.current) {
+      cancelAnimationFrame(animationFrameRef.current)
+      animationFrameRef.current = null
+    }
+    setIsPlaying(false)
+    setCurrentTime(0)
+  }, [])
+
+  const playAudio = useCallback(async () => {
+    if (!audio || audio.length === 0) return
+
+    // Stop current audio if playing
+    if (isPlaying) {
+      stopAudio()
+      return
+    }
+
+    try {
+      // Create audio context if it doesn't exist
+      if (!audioContextRef.current) {
+        audioContextRef.current = new (window.AudioContext ||
+          (window as any).webkitAudioContext)()
+      }
+
+      const audioContext = audioContextRef.current
+
+      // Resume audio context if suspended
+      if (audioContext.state === 'suspended') {
+        await audioContext.resume()
+      }
+
+      // Create audio buffer
+      const audioBuffer = audioContext.createBuffer(
+        1,
+        audio.length,
+        samplingRate
+      )
+      audioBuffer.getChannelData(0).set(audio)
+
+      // Create audio source
+      const source = audioContext.createBufferSource()
+      source.buffer = audioBuffer
+      source.connect(audioContext.destination)
+      sourceRef.current = source
+
+      const audioDuration = audio.length / samplingRate
+      setDuration(audioDuration)
+      setIsPlaying(true)
+      startTimeRef.current = audioContext.currentTime
+
+      // Update current time during playback
+      const updateTime = () => {
+        if (sourceRef.current && audioContextRef.current) {
+          const elapsed =
+            audioContextRef.current.currentTime - startTimeRef.current
+          const newCurrentTime = Math.min(elapsed, audioDuration)
+          setCurrentTime(newCurrentTime)
+
+          if (elapsed < audioDuration) {
+            animationFrameRef.current = requestAnimationFrame(updateTime)
+          } else {
+            // Audio finished naturally
+            setIsPlaying(false)
+            setCurrentTime(0)
+            sourceRef.current = null
+            animationFrameRef.current = null
+          }
+        }
+      }
+      animationFrameRef.current = requestAnimationFrame(updateTime)
+
+      source.onended = () => {
+        if (animationFrameRef.current) {
+          cancelAnimationFrame(animationFrameRef.current)
+          animationFrameRef.current = null
+        }
+        setIsPlaying(false)
+        setCurrentTime(0)
+        sourceRef.current = null
+      }
+
+      source.start()
+    } catch (error) {
+      console.error('Error playing audio:', error)
+      setIsPlaying(false)
+      setCurrentTime(0)
+    }
+  }, [audio, samplingRate, isPlaying, stopAudio])
+
+  // Cleanup animation frame on component unmount
+  useEffect(() => {
+    return () => {
+      if (animationFrameRef.current) {
+        cancelAnimationFrame(animationFrameRef.current)
+      }
+    }
+  }, [])
+
+  const downloadAudio = useCallback(() => {
+    if (!audio || audio.length === 0) return
+
+    try {
+      const wavBuffer = createWavBuffer(audio, samplingRate)
+      const blob = new Blob([wavBuffer], { type: 'audio/wav' })
+      const url = URL.createObjectURL(blob)
+
+      const a = document.createElement('a')
+      a.href = url
+      a.download = `tts-output-${index + 1}.wav`
+      document.body.appendChild(a)
+      a.click()
+      document.body.removeChild(a)
+      URL.revokeObjectURL(url)
+    } catch (error) {
+      console.error('Error downloading audio:', error)
+    }
+  }, [audio, samplingRate, index])

+  return (
+    <div className="border border-gray-200 rounded-lg p-4 bg-gray-50">
+      <div className="mb-3">
+        <p className="text-sm text-gray-700 font-medium mb-2">Prompt:</p>
+        <p className="text-sm text-gray-600 italic bg-white p-2 rounded border">
+          "{text}"
+        </p>
+      </div>
+
+      <div className="mb-3">
+        <div className="w-full border border-gray-200 rounded bg-gray-50 overflow-hidden">
+          {audio && audio.length > 0 ? (
+            <CustomAudioVisualizer
+              audio={audio}
+              isPlaying={isPlaying}
+              currentTime={currentTime}
+              duration={duration}
+              height={80}
+            />
+          ) : (
+            <div className="w-full h-20 flex items-center justify-center">
+              <span className="text-gray-400 text-sm">Loading waveform...</span>
+            </div>
+          )}
+        </div>
+      </div>
+
+      <div className="flex items-center gap-2">
+        <button
+          onClick={playAudio}
+          className="flex items-center gap-1 px-3 py-1 bg-blue-500 hover:bg-blue-600 text-white rounded text-sm transition-colors"
+        >
+          {isPlaying ? (
+            <>
+              <Square className="w-4 h-4" />
+              Stop
+            </>
+          ) : (
+            <>
+              <Play className="w-4 h-4" />
+              Play
+            </>
+          )}
+        </button>
+        <button
+          onClick={downloadAudio}
+          className="flex items-center gap-1 px-3 py-1 bg-green-500 hover:bg-green-600 text-white rounded text-sm transition-colors"
+        >
+          <Download className="w-4 h-4" />
+          Download
+        </button>
+        {duration > 0 && (
+          <span className="text-xs text-gray-500 ml-2">
+            {currentTime.toFixed(1)}s / {duration.toFixed(1)}s
+          </span>
+        )}
+      </div>
+    </div>
+  )
+}
+
+export default AudioPlayer
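
The createWavBuffer helper above writes a standard 44-byte PCM WAV header for 16-bit mono audio: the byte rate at offset 28 is sampleRate * 2 (one channel, two bytes per sample) and the block align at offset 32 is 2. A minimal sketch, assuming the same DataView layout, that reads those fields back to sanity-check a generated buffer (inspectWavHeader is a hypothetical helper, not part of this commit):

// Hypothetical sanity check for a buffer produced by createWavBuffer above
function inspectWavHeader(buffer: ArrayBuffer): void {
  const view = new DataView(buffer)
  const channels = view.getUint16(22, true)      // expected: 1 (mono)
  const sampleRate = view.getUint32(24, true)    // e.g. 16000 for SpeechT5-style output
  const byteRate = view.getUint32(28, true)      // expected: sampleRate * channels * 2
  const bitsPerSample = view.getUint16(34, true) // expected: 16
  const dataBytes = view.getUint32(40, true)     // sample count * 2
  console.log({ channels, sampleRate, byteRate, bitsPerSample, dataBytes })
}
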
src/components/ModelLoader.tsx CHANGED
@@ -105,7 +105,7 @@ const ModelLoader = () => {
 
     return () => {
       newWorker.removeEventListener('message', onMessageReceived)
-      terminateWorker(pipeline)
+      // terminateWorker(pipeline)
     }
   }, [
     pipeline,

src/components/PipelineLayout.tsx CHANGED
@@ -4,6 +4,7 @@ import { FeatureExtractionProvider } from '../contexts/FeatureExtractionContext'
 import { ZeroShotClassificationProvider } from '../contexts/ZeroShotClassificationContext'
 import { ImageClassificationProvider } from '../contexts/ImageClassificationContext'
 import { TextClassificationProvider } from '../contexts/TextClassificationContext'
+import { TextToSpeechProvider } from '../contexts/TextToSpeechContext'
 
 export const PipelineLayout = ({ children }: { children: React.ReactNode }) => {
   const { pipeline } = useModel()
@@ -30,6 +31,9 @@ export const PipelineLayout = ({ children }: { children: React.ReactNode }) => {
     case 'text-classification':
       return <TextClassificationProvider>{children}</TextClassificationProvider>
 
+    case 'text-to-speech':
+      return <TextToSpeechProvider>{children}</TextToSpeechProvider>
+
     default:
       return <>{children}</>
   }

src/components/PipelineSelector.tsx CHANGED
@@ -12,6 +12,7 @@ export const supportedPipelines = [
   'image-classification',
   'text-generation',
   'text-classification',
+  'text-to-speech',
   'zero-shot-classification'
   // 'summarization',
   // 'translation'

src/components/Sidebar.tsx CHANGED
@@ -79,6 +79,17 @@ const Sidebar = ({
           </Tooltip>
         </span>
       )}
+      {pipeline === 'text-to-speech' && (
+        <span className="flex text-xs text-yellow-500 justify-center text-center">
+          Not fully supported{' '}
+          <Tooltip
+            content="Transformers.js has limited support for text-to-speech"
+            className="transform -translate-x-1/3 break-keep max-w-12"
+          >
+            <CircleQuestionMark className="inline w-4 h-4 ml-1" />
+          </Tooltip>
+        </span>
+      )}
     </div>
     <PipelineSelector pipeline={pipeline} setPipeline={setPipeline} />
   </div>

src/components/pipelines/TextToSpeech.tsx ADDED
@@ -0,0 +1,207 @@
+import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
+import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
+import { TextToSpeechWorkerInput, WorkerMessage } from '../../types'
+import { useModel } from '../../contexts/ModelContext'
+import {
+  useTextToSpeech,
+  AudioResult
+} from '../../contexts/TextToSpeechContext'
+import AudioPlayer from '../AudioPlayer'
+
+const SAMPLE_TEXTS = [
+  'Hello, this is a sample text for text-to-speech synthesis.',
+  'Transformers.js makes it easy to run machine learning models in the browser.',
+  'The quick brown fox jumps over the lazy dog.',
+  'Text-to-speech technology converts written text into spoken words using artificial intelligence.'
+]
+
+function TextToSpeech() {
+  const {
+    config,
+    audioResults,
+    currentText,
+    setCurrentText,
+    addAudioResult,
+    clearAudioResults
+  } = useTextToSpeech()
+
+  const [isSynthesizing, setIsSynthesizing] = useState<boolean>(false)
+
+  const {
+    activeWorker,
+    status,
+    modelInfo,
+    hasBeenLoaded,
+    selectedQuantization
+  } = useModel()
+
+  const handleSynthesize = useCallback(() => {
+    if (!currentText.trim() || !modelInfo || !activeWorker || isSynthesizing)
+      return
+
+    setIsSynthesizing(true)
+
+    const message: TextToSpeechWorkerInput = {
+      type: 'synthesize',
+      text: currentText.trim(),
+      model: modelInfo.id,
+      dtype: selectedQuantization ?? 'fp32',
+      config: {
+        speakerEmbeddings: config.speakerEmbeddings
+      }
+    }
+
+    activeWorker.postMessage(message)
+  }, [
+    currentText,
+    modelInfo,
+    activeWorker,
+    config,
+    isSynthesizing,
+    selectedQuantization
+  ])
+
+  useEffect(() => {
+    if (!activeWorker) return
+
+    const onMessageReceived = (e: MessageEvent<WorkerMessage>) => {
+      const { status, output } = e.data
+      if (status === 'output' && output) {
+        setIsSynthesizing(false)
+        const audioResult = {
+          audio: new Float32Array(output.audio),
+          sampling_rate: output.sampling_rate
+        }
+        addAudioResult(currentText, audioResult)
+      } else if (status === 'ready' || status === 'error') {
+        setIsSynthesizing(false)
+      }
+    }
+
+    activeWorker.addEventListener('message', onMessageReceived)
+    return () => activeWorker.removeEventListener('message', onMessageReceived)
+  }, [activeWorker, currentText, addAudioResult])
+
+  const handleKeyPress = (e: React.KeyboardEvent) => {
+    if (e.key === 'Enter' && !e.shiftKey) {
+      e.preventDefault()
+      handleSynthesize()
+    }
+  }
+
+  const busy = status !== 'ready' || isSynthesizing
+
+  return (
+    <div className="flex flex-col min-h-[30dvh] max-h-[calc(100dvh-128px)] w-full p-4">
+      <div className="flex items-center justify-between mb-4">
+        <h1 className="text-2xl font-bold">Text to Speech</h1>
+        <button
+          onClick={clearAudioResults}
+          className="p-2 bg-red-100 hover:bg-red-200 rounded-lg transition-colors"
+          title="Clear All Audio"
+        >
+          <Eraser className="w-4 h-4" />
+        </button>
+      </div>
+
+      <div className="mb-4">
+        <label className="block text-sm font-medium text-gray-700 mb-2">
+          Enter text to synthesize:
+        </label>
+        <textarea
+          value={currentText}
+          onChange={(e) => setCurrentText(e.target.value)}
+          onKeyPress={handleKeyPress}
+          placeholder="Enter your text here... (Press Enter to synthesize, Shift+Enter for new line)"
+          className="w-full p-3 border border-gray-300 rounded-lg resize-none focus:outline-hidden focus:ring-2 focus:ring-blue-500 focus:border-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
+          rows={4}
+          disabled={!hasBeenLoaded || isSynthesizing}
+        />
+      </div>
+
+      <div className="mb-4">
+        <div className="flex flex-wrap gap-2 mb-2">
+          <span className="text-sm font-medium text-gray-700">
+            Quick samples:
+          </span>
+          {SAMPLE_TEXTS.map((sampleText, index) => (
+            <button
+              key={index}
+              onClick={() => setCurrentText(sampleText)}
+              disabled={!hasBeenLoaded || isSynthesizing}
+              className="px-2 py-1 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:cursor-not-allowed text-gray-700 text-xs rounded transition-colors"
+            >
+              Sample {index + 1}
+            </button>
+          ))}
+        </div>
+      </div>
+
+      <div className="mb-4">
+        <button
+          onClick={handleSynthesize}
+          disabled={!currentText.trim() || busy || !hasBeenLoaded}
+          className="px-6 py-2 bg-green-500 hover:bg-green-600 disabled:bg-gray-300 disabled:cursor-not-allowed text-white rounded-lg transition-colors flex items-center gap-2"
+        >
+          {isSynthesizing ? (
+            <>
+              <Loader2 className="w-4 h-4 animate-spin" />
+              Synthesizing...
+            </>
+          ) : (
+            <>
+              <Volume2 className="w-4 h-4" />
+              Synthesize Speech
+            </>
+          )}
+        </button>
+      </div>
+
+      <div className="flex-1 overflow-y-auto">
+        <div className="mb-2">
+          <label className="block text-sm font-medium text-gray-700">
+            Generated Audio ({audioResults.length}):
+          </label>
+        </div>
+        {audioResults.length > 0 ? (
+          <div className="space-y-3">
+            {audioResults.map((result, index) => (
+              <AudioPlayer
+                key={index}
+                audio={result.audio}
+                samplingRate={result.sampling_rate}
+                text={result.text}
+                index={index}
+              />
+            ))}
+          </div>
+        ) : (
+          <div className="text-gray-500 italic flex flex-col items-center gap-3 p-8 border border-gray-200 rounded-lg bg-gray-50">
+            {isSynthesizing ? (
+              <>
+                <Loader2 className="w-6 h-6 animate-spin text-blue-500" />
+                <span>Synthesizing speech...</span>
+              </>
+            ) : (
+              <>
+                <Volume2 className="w-8 h-8 text-gray-400" />
+                <span>Generated audio will appear here</span>
+                <span className="text-xs text-gray-400">
+                  Enter text and click "Synthesize Speech" to get started
+                </span>
+              </>
+            )}
+          </div>
+        )}
+      </div>
+
+      {!hasBeenLoaded && (
+        <div className="text-center text-gray-500 text-sm mt-2">
+          Please load a model first to start synthesizing speech
+        </div>
+      )}
+    </div>
+  )
+}
+
+export default TextToSpeech

src/components/pipelines/TextToSpeechConfig.tsx ADDED
@@ -0,0 +1,42 @@
+import React from 'react'
+import { Label } from '@/components/ui/label'
+import { Input } from '@/components/ui/input'
+import { useTextToSpeech } from '../../contexts/TextToSpeechContext'
+
+interface TextToSpeechConfigProps {
+  className?: string
+}
+
+const TextToSpeechConfig: React.FC<TextToSpeechConfigProps> = ({
+  className = ''
+}) => {
+  const { config, setConfig } = useTextToSpeech()
+
+  return (
+    <div className={`space-y-4 ${className}`}>
+      <div className="space-y-2">
+        <Label htmlFor="speakerEmbeddings" className="text-sm font-medium">
+          Speaker Embeddings URL
+        </Label>
+        <Input
+          id="speakerEmbeddings"
+          type="url"
+          value={config.speakerEmbeddings}
+          onChange={(e) =>
+            setConfig((prev) => ({
+              ...prev,
+              speakerEmbeddings: e.target.value
+            }))
+          }
+          placeholder="https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
+          className="text-sm"
+        />
+        <p className="text-xs text-gray-500">
+          URL to speaker embeddings file for voice characteristics
+        </p>
+      </div>
+    </div>
+  )
+}
+
+export default TextToSpeechConfig

src/components/ui/label.tsx ADDED
@@ -0,0 +1,22 @@
+import * as React from "react"
+import * as LabelPrimitive from "@radix-ui/react-label"
+
+import { cn } from "@/lib/utils"
+
+function Label({
+  className,
+  ...props
+}: React.ComponentProps<typeof LabelPrimitive.Root>) {
+  return (
+    <LabelPrimitive.Root
+      data-slot="label"
+      className={cn(
+        "flex items-center gap-2 text-sm leading-none font-medium select-none group-data-[disabled=true]:pointer-events-none group-data-[disabled=true]:opacity-50 peer-disabled:cursor-not-allowed peer-disabled:opacity-50",
+        className
+      )}
+      {...props}
+    />
+  )
+}
+
+export { Label }

src/contexts/TextToSpeechContext.tsx ADDED
@@ -0,0 +1,73 @@
+import { createContext, useContext, useState, ReactNode } from 'react'
+
+export interface TextToSpeechConfigState {
+  speakerEmbeddings: string
+}
+
+export interface AudioResult {
+  audio: Float32Array
+  sampling_rate: number
+  text: string
+}
+
+interface TextToSpeechContextType {
+  config: TextToSpeechConfigState
+  setConfig: React.Dispatch<React.SetStateAction<TextToSpeechConfigState>>
+  audioResults: AudioResult[]
+  setAudioResults: React.Dispatch<React.SetStateAction<AudioResult[]>>
+  currentText: string
+  setCurrentText: React.Dispatch<React.SetStateAction<string>>
+  addAudioResult: (text: string, audio: Omit<AudioResult, 'text'>) => void
+  clearAudioResults: () => void
+}
+
+const TextToSpeechContext = createContext<TextToSpeechContextType | undefined>(
+  undefined
+)
+
+export function TextToSpeechProvider({ children }: { children: ReactNode }) {
+  const [config, setConfig] = useState<TextToSpeechConfigState>({
+    speakerEmbeddings:
+      'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'
+  })
+
+  const [audioResults, setAudioResults] = useState<AudioResult[]>([])
+  const [currentText, setCurrentText] = useState<string>('')
+
+  const addAudioResult = (text: string, audio: Omit<AudioResult, 'text'>) => {
+    const fullAudioResult: AudioResult = { ...audio, text }
+    setAudioResults((prev) => [...prev, fullAudioResult])
+  }
+
+  const clearAudioResults = () => {
+    setAudioResults([])
+    setCurrentText('')
+  }
+
+  const value = {
+    config,
+    setConfig,
+    audioResults,
+    setAudioResults,
+    currentText,
+    setCurrentText,
+    addAudioResult,
+    clearAudioResults
+  }
+
+  return (
+    <TextToSpeechContext.Provider value={value}>
+      {children}
+    </TextToSpeechContext.Provider>
+  )
+}
+
+export function useTextToSpeech() {
+  const context = useContext(TextToSpeechContext)
+  if (context === undefined) {
+    throw new Error(
+      'useTextToSpeech must be used within a TextToSpeechProvider'
+    )
+  }
+  return context
+}
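
The useTextToSpeech hook throws outside a provider, so consumers such as TextToSpeech and TextToSpeechConfig must render under the 'text-to-speech' branch added to PipelineLayout. A minimal sketch of that contract, with a hypothetical consumer component and an assumed sibling-file import path:

import { TextToSpeechProvider, useTextToSpeech } from './TextToSpeechContext'

// Hypothetical consumer, for illustration only
function SpeakerEmbeddingsUrl() {
  const { config } = useTextToSpeech() // would throw if rendered outside the provider
  return <span>{config.speakerEmbeddings}</span>
}

export function Example() {
  return (
    <TextToSpeechProvider>
      <SpeakerEmbeddingsUrl />
    </TextToSpeechProvider>
  )
}
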
src/lib/huggingface.ts CHANGED
@@ -176,6 +176,14 @@ const getModelsByPipeline = async (
           !model.id.includes('MiniLM')
       )
       .slice(0, 30)
+  } else if (pipelineTag === 'text-to-speech') {
+    return uniqueModels
+      .filter(
+        (model: ModelInfoResponse) =>
+          !model.tags.includes('style_text_to_speech_2') &&
+          !model.id.includes('qwen2')
+      )
+      .slice(0, 30)
   }
 
   return uniqueModels.slice(0, 30)

src/lib/workerManager.ts CHANGED
@@ -20,6 +20,9 @@ export const getWorker = (pipeline: string) => {
     case 'image-classification':
      workerUrl = `/workers/image-classification.js`
      break
+    case 'text-to-speech':
+      workerUrl = `/workers/text-to-speech.js`
+      break
    default:
      return null
  }

src/types.ts CHANGED
@@ -80,6 +80,16 @@ export interface FeatureExtractionWorkerInput {
   }
 }
 
+export interface TextToSpeechWorkerInput {
+  type: 'synthesize'
+  text: string
+  model: string
+  dtype: QuantizationType
+  config?: {
+    speakerEmbeddings?: string
+  }
+}
+
 export interface ImageClassificationWorkerInput {
   type: 'classify'
   image: string | ImageData | HTMLImageElement | HTMLCanvasElement