Spaces:

Vokturz
/

transformers-js-playground

Running

App Files Files Community

transformers-js-playground / src / components / pipelines / TextToSpeech.tsx

Vokturz

Add StyleTTS2 support with KokoroTTS integration

79eafc9 7 days ago

raw

history blame contribute delete

7.09 kB

	import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
	import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
	import { TextToSpeechWorkerInput, WorkerMessage } from '../../types'
	import { useModel } from '../../contexts/ModelContext'
	import {
	useTextToSpeech,
	AudioResult
	} from '../../contexts/TextToSpeechContext'
	import AudioPlayer from '../AudioPlayer'
	import { preview } from 'vite'

	/**
	 * Canned sentences offered as "Quick samples" buttons; clicking one copies
	 * the sentence into the text input.
	 */
	const SAMPLE_TEXTS = [
	  // Short greeting.
	  'Hello, this is a sample text for text-to-speech synthesis.',
	  // Library blurb.
	  'Transformers.js makes it easy to run machine learning models in the browser.',
	  // Classic pangram.
	  'The quick brown fox jumps over the lazy dog.',
	  // Longer descriptive sentence.
	  'Text-to-speech technology converts written text into spoken words using artificial intelligence.'
	]

	/**
	 * Text-to-speech pipeline panel.
	 *
	 * Reads the input text, voice configuration and generated clips from
	 * `useTextToSpeech()`, and the worker / model state from `useModel()`.
	 * Submitting posts a `synthesize` message to the active worker; every
	 * `output` message received back is stored through `addAudioResult` and
	 * rendered with `AudioPlayer`.
	 */
	function TextToSpeech() {
	  const {
	    config,
	    setConfig,
	    audioResults,
	    currentText,
	    setCurrentText,
	    addAudioResult,
	    clearAudioResults
	  } = useTextToSpeech()

	  const [isSynthesizing, setIsSynthesizing] = useState<boolean>(false)

	  const {
	    activeWorker,
	    status,
	    modelInfo,
	    hasBeenLoaded,
	    selectedQuantization
	  } = useModel()

	  // Links the visible <label> to the textarea for assistive technology.
	  const textInputId = 'tts-text-input'

	  const busy = status !== 'ready' || isSynthesizing

	  // Single gate shared by the button and the Enter shortcut, so the keyboard
	  // path cannot start a request while the button itself would be disabled.
	  const canSynthesize =
	    currentText.trim().length > 0 && !busy && Boolean(hasBeenLoaded)

	  /** Posts a `synthesize` request for the trimmed input text to the worker. */
	  const handleSynthesize = useCallback(() => {
	    if (!canSynthesize || !modelInfo || !activeWorker) return

	    setIsSynthesizing(true)

	    const message: TextToSpeechWorkerInput = {
	      type: 'synthesize',
	      text: currentText.trim(),
	      model: modelInfo.id,
	      dtype: selectedQuantization ?? 'fp32',
	      isStyleTTS2: modelInfo.isStyleTTS2 ?? false,
	      config: {
	        speakerEmbeddings: config.speakerEmbeddings,
	        voice: config.voice
	      }
	    }

	    activeWorker.postMessage(message)
	  }, [
	    canSynthesize,
	    currentText,
	    modelInfo,
	    activeWorker,
	    config.speakerEmbeddings,
	    config.voice,
	    selectedQuantization
	  ])

	  // Subscribe to worker messages. `config.voice` is part of the dependency
	  // list so the handler never labels a clip with a voice captured by an
	  // older render.
	  useEffect(() => {
	    if (!activeWorker) return

	    const onMessageReceived = (e: MessageEvent<WorkerMessage>) => {
	      const { status: messageStatus, output } = e.data

	      if (messageStatus === 'output' && output) {
	        setIsSynthesizing(false)
	        const audioResult = {
	          audio: new Float32Array(output.audio),
	          sampling_rate: output.sampling_rate
	        }
	        addAudioResult(currentText, audioResult, config.voice)
	        return
	      }

	      // 'ready' and 'error' both end the in-flight state without a clip.
	      if (messageStatus === 'ready' || messageStatus === 'error') {
	        setIsSynthesizing(false)
	      }
	    }

	    activeWorker.addEventListener('message', onMessageReceived)
	    return () => activeWorker.removeEventListener('message', onMessageReceived)
	  }, [activeWorker, currentText, config.voice, addAudioResult])

	  // When the model info changes, select its first voice (if it lists any).
	  // NOTE(review): `setConfig` is left out of the dependency list; confirm it
	  // is referentially stable before adding it, otherwise this effect would
	  // re-run on every render.
	  useEffect(() => {
	    const defaultVoice = modelInfo?.voices?.[0]
	    if (defaultVoice === undefined) return

	    setConfig((prev) => ({
	      ...prev,
	      voice: defaultVoice
	    }))
	  }, [modelInfo])

	  /** Enter submits; Shift+Enter keeps the default behaviour (new line). */
	  const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
	    // While an IME composition is active, Enter confirms the candidate text
	    // and must not be treated as a submit.
	    if (e.key !== 'Enter' || e.shiftKey || e.nativeEvent.isComposing) return

	    e.preventDefault()
	    handleSynthesize()
	  }

	  return (
	    <div className="flex flex-col min-h-[30dvh] max-h-[calc(100dvh-128px)] w-full p-4">
	      <div className="flex items-center justify-between mb-4">
	        <h1 className="text-2xl font-bold">Text to Speech</h1>
	        <button
	          type="button"
	          onClick={clearAudioResults}
	          className="p-2 bg-red-100 hover:bg-red-200 rounded-lg transition-colors"
	          title="Clear All Audio"
	          aria-label="Clear All Audio"
	        >
	          <Eraser className="w-4 h-4" />
	        </button>
	      </div>

	      <div className="mb-4">
	        <label
	          htmlFor={textInputId}
	          className="block text-sm font-medium text-gray-700 mb-2"
	        >
	          Enter text to synthesize:
	        </label>
	        <textarea
	          id={textInputId}
	          value={currentText}
	          onChange={(e) => setCurrentText(e.target.value)}
	          onKeyDown={handleKeyDown}
	          placeholder="Enter your text here... (Press Enter to synthesize, Shift+Enter for new line)"
	          className="w-full p-3 border border-gray-300 rounded-lg resize-none focus:outline-hidden focus:ring-2 focus:ring-blue-500 focus:border-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
	          rows={4}
	          disabled={!hasBeenLoaded || isSynthesizing}
	        />
	      </div>

	      <div className="mb-4">
	        <div className="flex flex-wrap gap-2 mb-2">
	          <span className="text-sm font-medium text-gray-700">
	            Quick samples:
	          </span>
	          {SAMPLE_TEXTS.map((sampleText, index) => (
	            <button
	              type="button"
	              key={index}
	              onClick={() => setCurrentText(sampleText)}
	              disabled={!hasBeenLoaded || isSynthesizing}
	              className="px-2 py-1 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:cursor-not-allowed text-gray-700 text-xs rounded transition-colors"
	            >
	              Sample {index + 1}
	            </button>
	          ))}
	        </div>
	      </div>

	      <div className="mb-4">
	        <button
	          type="button"
	          onClick={handleSynthesize}
	          disabled={!canSynthesize}
	          className="px-6 py-2 bg-green-500 hover:bg-green-600 disabled:bg-gray-300 disabled:cursor-not-allowed text-white rounded-lg transition-colors flex items-center gap-2"
	        >
	          {isSynthesizing ? (
	            <>
	              <Loader2 className="w-4 h-4 animate-spin" />
	              Synthesizing...
	            </>
	          ) : (
	            <>
	              <Volume2 className="w-4 h-4" />
	              Synthesize Speech
	            </>
	          )}
	        </button>
	      </div>

	      <div className="flex-1 overflow-y-auto">
	        <div className="mb-2">
	          <label className="block text-sm font-medium text-gray-700">
	            Generated Audio ({audioResults.length}):
	          </label>
	        </div>
	        {audioResults.length > 0 ? (
	          <div className="space-y-3">
	            {audioResults.map((result, index) => (
	              <AudioPlayer
	                key={index}
	                audio={result.audio}
	                samplingRate={result.sampling_rate}
	                text={result.text}
	                index={index}
	                voice={result.voice}
	              />
	            ))}
	          </div>
	        ) : (
	          <div className="text-gray-500 italic flex flex-col items-center gap-3 p-8 border border-gray-200 rounded-lg bg-gray-50">
	            {isSynthesizing ? (
	              <>
	                <Loader2 className="w-6 h-6 animate-spin text-blue-500" />
	                <span>Synthesizing speech...</span>
	              </>
	            ) : (
	              <>
	                <Volume2 className="w-8 h-8 text-gray-400" />
	                <span>Generated audio will appear here</span>
	                <span className="text-xs text-gray-400">
	                  Enter text and click "Synthesize Speech" to get started
	                </span>
	              </>
	            )}
	          </div>
	        )}
	      </div>

	      {!hasBeenLoaded && (
	        <div className="text-center text-gray-500 text-sm mt-2">
	          Please load a model first to start synthesizing speech
	        </div>
	      )}
	    </div>
	  )
	}

	export default TextToSpeech