// Spaces: Vokturz / transformers-js-playground — Running — File size: 7,086 Bytes

import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
import { TextToSpeechWorkerInput, WorkerMessage } from '../../types'
import { useModel } from '../../contexts/ModelContext'
import {
  useTextToSpeech,
  AudioResult
} from '../../contexts/TextToSpeechContext'
import AudioPlayer from '../AudioPlayer'
import { preview } from 'vite'

/**
 * Canned sentences offered as "Quick samples" buttons; clicking one copies
 * the sentence into the text input. Read-only: the list is only iterated.
 */
const SAMPLE_TEXTS: readonly string[] = [
  'Hello, this is a sample text for text-to-speech synthesis.',
  'Transformers.js makes it easy to run machine learning models in the browser.',
  'The quick brown fox jumps over the lazy dog.',
  'Text-to-speech technology converts written text into spoken words using artificial intelligence.'
]

/**
 * Text-to-speech panel.
 *
 * Lets the user type (or pick) a text, posts a `synthesize` request to the
 * active model worker and lists every generated clip through `AudioPlayer`.
 *
 * Text, config and results come from `useTextToSpeech`; worker and model state
 * come from `useModel`. Only the "request in flight" flag is local state.
 */
function TextToSpeech() {
  const {
    config,
    setConfig,
    audioResults,
    currentText,
    setCurrentText,
    addAudioResult,
    clearAudioResults
  } = useTextToSpeech()

  const [isSynthesizing, setIsSynthesizing] = useState<boolean>(false)

  // Text and voice of the request that is currently in flight. Captured when
  // the request is sent so the resulting clip is labelled with what was
  // actually synthesized, even if the text or voice changes before the worker
  // answers.
  const pendingRequestRef = useRef<{
    text: string
    voice: typeof config.voice
  } | null>(null)

  const {
    activeWorker,
    status,
    modelInfo,
    hasBeenLoaded,
    selectedQuantization
  } = useModel()

  /**
   * Sends the trimmed input text to the worker.
   *
   * The guard mirrors the disabled state of the "Synthesize Speech" button so
   * that the Enter shortcut cannot start a request the button would refuse
   * (model not loaded, model not ready, or a request already running).
   */
  const handleSynthesize = useCallback(() => {
    const text = currentText.trim()
    if (
      !text ||
      !modelInfo ||
      !activeWorker ||
      !hasBeenLoaded ||
      status !== 'ready' ||
      isSynthesizing
    )
      return

    pendingRequestRef.current = { text, voice: config.voice }
    setIsSynthesizing(true)

    const message: TextToSpeechWorkerInput = {
      type: 'synthesize',
      text,
      model: modelInfo.id,
      dtype: selectedQuantization ?? 'fp32',
      isStyleTTS2: modelInfo.isStyleTTS2 ?? false,
      config: {
        speakerEmbeddings: config.speakerEmbeddings,
        voice: config.voice
      }
    }

    activeWorker.postMessage(message)
  }, [
    currentText,
    modelInfo,
    activeWorker,
    hasBeenLoaded,
    status,
    config,
    isSynthesizing,
    selectedQuantization
  ])

  // Listen for worker replies: store finished audio and clear the busy flag.
  useEffect(() => {
    if (!activeWorker) return

    const onMessageReceived = (e: MessageEvent<WorkerMessage>) => {
      // Renamed locally so it does not shadow the model `status` from context.
      const { status: workerStatus, output } = e.data

      if (workerStatus === 'output' && output) {
        setIsSynthesizing(false)

        // Prefer the values captured at send time; fall back to the current
        // text/voice for an output that arrives without a recorded request.
        const request = pendingRequestRef.current ?? {
          text: currentText,
          voice: config.voice
        }
        pendingRequestRef.current = null

        const audioResult = {
          audio: new Float32Array(output.audio),
          sampling_rate: output.sampling_rate
        }
        addAudioResult(request.text, audioResult, request.voice)
      } else if (workerStatus === 'ready' || workerStatus === 'error') {
        setIsSynthesizing(false)
        // A failed request will not produce audio, so forget its label.
        if (workerStatus === 'error') pendingRequestRef.current = null
      }
    }

    activeWorker.addEventListener('message', onMessageReceived)
    return () => {
      activeWorker.removeEventListener('message', onMessageReceived)
    }
  }, [activeWorker, currentText, config.voice, addAudioResult])

  // When the model changes, default the voice to the model's first voice.
  // Deliberately keyed on `modelInfo` only, so the default is applied once per
  // model and not every time `setConfig` gets a new identity.
  // NOTE(review): add `setConfig` to the deps only if the context guarantees
  // it is referentially stable.
  useEffect(() => {
    if (!modelInfo) return
    if (modelInfo.voices.length > 0)
      setConfig((prev) => ({
        ...prev,
        voice: modelInfo.voices[0]
      }))
  }, [modelInfo])

  /** Enter submits the text; Shift+Enter keeps the default newline. */
  const handleKeyDown = (e: React.KeyboardEvent) => {
    // Enter pressed while an IME composition is active confirms the
    // composition; it must not trigger synthesis.
    if (e.nativeEvent.isComposing) return

    if (e.key === 'Enter' && !e.shiftKey) {
      e.preventDefault()
      handleSynthesize()
    }
  }

  const busy = status !== 'ready' || isSynthesizing

  return (
    <div className="flex flex-col min-h-[30dvh] max-h-[calc(100dvh-128px)] w-full p-4">
      <div className="flex items-center justify-between mb-4">
        <h1 className="text-2xl font-bold">Text to Speech</h1>
        <button
          onClick={clearAudioResults}
          className="p-2 bg-red-100 hover:bg-red-200 rounded-lg transition-colors"
          title="Clear All Audio"
        >
          <Eraser className="w-4 h-4" />
        </button>
      </div>

      <div className="mb-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Enter text to synthesize:
        </label>
        <textarea
          value={currentText}
          onChange={(e) => setCurrentText(e.target.value)}
          onKeyDown={handleKeyDown}
          placeholder="Enter your text here... (Press Enter to synthesize, Shift+Enter for new line)"
          className="w-full p-3 border border-gray-300 rounded-lg resize-none focus:outline-hidden focus:ring-2 focus:ring-blue-500 focus:border-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
          rows={4}
          disabled={!hasBeenLoaded || isSynthesizing}
        />
      </div>

      <div className="mb-4">
        <div className="flex flex-wrap gap-2 mb-2">
          <span className="text-sm font-medium text-gray-700">
            Quick samples:
          </span>
          {SAMPLE_TEXTS.map((sampleText, index) => (
            <button
              key={index}
              onClick={() => setCurrentText(sampleText)}
              disabled={!hasBeenLoaded || isSynthesizing}
              className="px-2 py-1 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:cursor-not-allowed text-gray-700 text-xs rounded transition-colors"
            >
              Sample {index + 1}
            </button>
          ))}
        </div>
      </div>

      <div className="mb-4">
        <button
          onClick={handleSynthesize}
          disabled={!currentText.trim() || busy || !hasBeenLoaded}
          className="px-6 py-2 bg-green-500 hover:bg-green-600 disabled:bg-gray-300 disabled:cursor-not-allowed text-white rounded-lg transition-colors flex items-center gap-2"
        >
          {isSynthesizing ? (
            <>
              <Loader2 className="w-4 h-4 animate-spin" />
              Synthesizing...
            </>
          ) : (
            <>
              <Volume2 className="w-4 h-4" />
              Synthesize Speech
            </>
          )}
        </button>
      </div>

      <div className="flex-1 overflow-y-auto">
        <div className="mb-2">
          <label className="block text-sm font-medium text-gray-700">
            Generated Audio ({audioResults.length}):
          </label>
        </div>
        {audioResults.length > 0 ? (
          <div className="space-y-3">
            {audioResults.map((result, index) => (
              <AudioPlayer
                key={index}
                audio={result.audio}
                samplingRate={result.sampling_rate}
                text={result.text}
                index={index}
                voice={result.voice}
              />
            ))}
          </div>
        ) : (
          <div className="text-gray-500 italic flex flex-col items-center gap-3 p-8 border border-gray-200 rounded-lg bg-gray-50">
            {isSynthesizing ? (
              <>
                <Loader2 className="w-6 h-6 animate-spin text-blue-500" />
                <span>Synthesizing speech...</span>
              </>
            ) : (
              <>
                <Volume2 className="w-8 h-8 text-gray-400" />
                <span>Generated audio will appear here</span>
                <span className="text-xs text-gray-400">
                  Enter text and click "Synthesize Speech" to get started
                </span>
              </>
            )}
          </div>
        )}
      </div>

      {!hasBeenLoaded && (
        <div className="text-center text-gray-500 text-sm mt-2">
          Please load a model first to start synthesizing speech
        </div>
      )}
    </div>
  )
}

export default TextToSpeech