// Vokturz's picture
// Add StyleTTS2 support with KokoroTTS integration
// 79eafc9
import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
import { TextToSpeechWorkerInput, WorkerMessage } from '../../types'
import { useModel } from '../../contexts/ModelContext'
import {
useTextToSpeech,
AudioResult
} from '../../contexts/TextToSpeechContext'
import AudioPlayer from '../AudioPlayer'
import { preview } from 'vite'
/**
 * Canned sentences offered as "Quick samples" buttons; clicking one copies
 * the sentence into the text input.
 */
const SAMPLE_TEXTS: readonly string[] = [
  'Hello, this is a sample text for text-to-speech synthesis.',
  'Transformers.js makes it easy to run machine learning models in the browser.',
  'The quick brown fox jumps over the lazy dog.',
  'Text-to-speech technology converts written text into spoken words using artificial intelligence.'
]
/**
 * Text-to-speech panel.
 *
 * Lets the user type (or pick) a text, posts a `synthesize` request to the
 * active model worker, and lists every generated clip with an `AudioPlayer`.
 * Text, voice configuration and the result list come from
 * `useTextToSpeech()`; the worker, its status and the model metadata come
 * from `useModel()`.
 */
function TextToSpeech() {
  const {
    config,
    setConfig,
    audioResults,
    currentText,
    setCurrentText,
    addAudioResult,
    clearAudioResults
  } = useTextToSpeech()

  const [isSynthesizing, setIsSynthesizing] = useState<boolean>(false)

  // Snapshot of the text/voice belonging to the request that is currently in
  // flight. The worker answers asynchronously, so the result must be labelled
  // with the values that were sent, not with whatever the context holds when
  // the answer arrives.
  const pendingRequestRef = useRef<{
    text: string
    voice: typeof config.voice
  } | null>(null)

  const {
    activeWorker,
    status,
    modelInfo,
    hasBeenLoaded,
    selectedQuantization
  } = useModel()

  /** Posts a `synthesize` request for the current text to the active worker. */
  const handleSynthesize = useCallback(() => {
    const trimmedText = currentText.trim()
    if (!trimmedText || !modelInfo || !activeWorker || isSynthesizing) return

    pendingRequestRef.current = { text: currentText, voice: config.voice }
    setIsSynthesizing(true)

    const message: TextToSpeechWorkerInput = {
      type: 'synthesize',
      text: trimmedText,
      model: modelInfo.id,
      dtype: selectedQuantization ?? 'fp32',
      isStyleTTS2: modelInfo.isStyleTTS2 ?? false,
      config: {
        speakerEmbeddings: config.speakerEmbeddings,
        voice: config.voice
      }
    }
    activeWorker.postMessage(message)
  }, [
    currentText,
    modelInfo,
    activeWorker,
    config,
    isSynthesizing,
    selectedQuantization
  ])

  // Subscribe to worker messages: store finished audio and clear the busy flag.
  useEffect(() => {
    if (!activeWorker) return

    const onMessageReceived = (e: MessageEvent<WorkerMessage>) => {
      // Renamed on destructuring so it does not shadow the model `status`.
      const { status: messageStatus, output } = e.data

      if (messageStatus === 'output' && output) {
        const request = pendingRequestRef.current
        pendingRequestRef.current = null
        setIsSynthesizing(false)

        const audioResult = {
          audio: new Float32Array(output.audio),
          sampling_rate: output.sampling_rate
        }
        // Prefer the snapshot taken when the request was sent; fall back to
        // the current values if this component did not issue the request.
        addAudioResult(
          request ? request.text : currentText,
          audioResult,
          request ? request.voice : config.voice
        )
      } else if (messageStatus === 'ready' || messageStatus === 'error') {
        // The snapshot is dropped only on failure; a `ready` message leaves
        // it in place in case an `output` for the same request still follows.
        if (messageStatus === 'error') pendingRequestRef.current = null
        setIsSynthesizing(false)
      }
    }

    activeWorker.addEventListener('message', onMessageReceived)
    return () => activeWorker.removeEventListener('message', onMessageReceived)
  }, [activeWorker, currentText, config.voice, addAudioResult])

  // When the model changes, preselect its first voice (if it offers any).
  // NOTE(review): `setConfig` is left out of the dependency list, as in the
  // original code — confirm that it is referentially stable.
  useEffect(() => {
    if (!modelInfo) return
    const defaultVoice = modelInfo.voices?.[0]
    if (defaultVoice === undefined) return
    setConfig((prev) => ({
      ...prev,
      voice: defaultVoice
    }))
  }, [modelInfo])

  /** Enter starts synthesis; Shift+Enter keeps the default new-line behavior. */
  const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
    // Enter that merely confirms an IME composition must not trigger synthesis.
    if (e.nativeEvent.isComposing) return
    if (e.key === 'Enter' && !e.shiftKey) {
      e.preventDefault()
      handleSynthesize()
    }
  }

  const busy = status !== 'ready' || isSynthesizing

  return (
    <div className="flex flex-col min-h-[30dvh] max-h-[calc(100dvh-128px)] w-full p-4">
      <div className="flex items-center justify-between mb-4">
        <h1 className="text-2xl font-bold">Text to Speech</h1>
        <button
          onClick={clearAudioResults}
          className="p-2 bg-red-100 hover:bg-red-200 rounded-lg transition-colors"
          title="Clear All Audio"
        >
          <Eraser className="w-4 h-4" />
        </button>
      </div>

      <div className="mb-4">
        <label className="block text-sm font-medium text-gray-700 mb-2">
          Enter text to synthesize:
        </label>
        <textarea
          value={currentText}
          onChange={(e) => setCurrentText(e.target.value)}
          onKeyDown={handleKeyDown}
          placeholder="Enter your text here... (Press Enter to synthesize, Shift+Enter for new line)"
          className="w-full p-3 border border-gray-300 rounded-lg resize-none focus:outline-hidden focus:ring-2 focus:ring-blue-500 focus:border-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
          rows={4}
          disabled={!hasBeenLoaded || isSynthesizing}
        />
      </div>

      <div className="mb-4">
        <div className="flex flex-wrap gap-2 mb-2">
          <span className="text-sm font-medium text-gray-700">
            Quick samples:
          </span>
          {SAMPLE_TEXTS.map((sampleText, index) => (
            <button
              key={index}
              onClick={() => setCurrentText(sampleText)}
              disabled={!hasBeenLoaded || isSynthesizing}
              className="px-2 py-1 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:cursor-not-allowed text-gray-700 text-xs rounded transition-colors"
            >
              Sample {index + 1}
            </button>
          ))}
        </div>
      </div>

      <div className="mb-4">
        <button
          onClick={handleSynthesize}
          disabled={!currentText.trim() || busy || !hasBeenLoaded}
          className="px-6 py-2 bg-green-500 hover:bg-green-600 disabled:bg-gray-300 disabled:cursor-not-allowed text-white rounded-lg transition-colors flex items-center gap-2"
        >
          {isSynthesizing ? (
            <>
              <Loader2 className="w-4 h-4 animate-spin" />
              Synthesizing...
            </>
          ) : (
            <>
              <Volume2 className="w-4 h-4" />
              Synthesize Speech
            </>
          )}
        </button>
      </div>

      <div className="flex-1 overflow-y-auto">
        <div className="mb-2">
          <label className="block text-sm font-medium text-gray-700">
            Generated Audio ({audioResults.length}):
          </label>
        </div>
        {audioResults.length > 0 ? (
          <div className="space-y-3">
            {audioResults.map((result, index) => (
              <AudioPlayer
                key={index}
                audio={result.audio}
                samplingRate={result.sampling_rate}
                text={result.text}
                index={index}
                voice={result.voice}
              />
            ))}
          </div>
        ) : (
          <div className="text-gray-500 italic flex flex-col items-center gap-3 p-8 border border-gray-200 rounded-lg bg-gray-50">
            {isSynthesizing ? (
              <>
                <Loader2 className="w-6 h-6 animate-spin text-blue-500" />
                <span>Synthesizing speech...</span>
              </>
            ) : (
              <>
                <Volume2 className="w-8 h-8 text-gray-400" />
                <span>Generated audio will appear here</span>
                <span className="text-xs text-gray-400">
                  Enter text and click "Synthesize Speech" to get started
                </span>
              </>
            )}
          </div>
        )}
      </div>

      {!hasBeenLoaded && (
        <div className="text-center text-gray-500 text-sm mt-2">
          Please load a model first to start synthesizing speech
        </div>
      )}
    </div>
  )
}
export default TextToSpeech