Vokturz committed
Commit 25647ae · 1 Parent(s): 63dbafb

add text-to-speech support
package.json CHANGED
@@ -5,6 +5,7 @@
   "private": true,
   "dependencies": {
     "@headlessui/react": "^2.2.4",
+    "@radix-ui/react-label": "^2.1.7",
     "@radix-ui/react-select": "^2.2.5",
     "@radix-ui/react-separator": "^1.1.7",
     "@radix-ui/react-slider": "^1.3.5",

pnpm-lock.yaml CHANGED
@@ -11,6 +11,9 @@ importers:
       '@headlessui/react':
         specifier: ^2.2.4
         version: 2.2.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+      '@radix-ui/react-label':
+        specifier: ^2.1.7
+        version: 2.1.7(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@radix-ui/react-select':
         specifier: ^2.2.5
         version: 2.2.5(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
@@ -540,6 +543,19 @@ packages:
       '@types/react':
         optional: true
 
+  '@radix-ui/react-label@2.1.7':
+    resolution: {integrity: sha512-YT1GqPSL8kJn20djelMX7/cTRp/Y9w5IZHvfxQTVHrOqa2yMl7i/UfMqKRU5V7mEyKTrUVgJXhNQPVCG8PBLoQ==}
+    peerDependencies:
+      '@types/react': '*'
+      '@types/react-dom': '*'
+      react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc
+      react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc
+    peerDependenciesMeta:
+      '@types/react':
+        optional: true
+      '@types/react-dom':
+        optional: true
+
   '@radix-ui/react-popper@1.2.7':
     resolution: {integrity: sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==}
     peerDependencies:
@@ -2696,6 +2712,15 @@ snapshots:
       optionalDependencies:
         '@types/react': 19.1.8
 
+  '@radix-ui/react-label@2.1.7(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
+    dependencies:
+      '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+      react: 19.1.0
+      react-dom: 19.1.0(react@19.1.0)
+    optionalDependencies:
+      '@types/react': 19.1.8
+      '@types/react-dom': 19.1.6(@types/react@19.1.8)
+
   '@radix-ui/react-popper@1.2.7(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
     dependencies:
       '@floating-ui/react-dom': 2.1.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0)

public/workers/text-to-speech.js ADDED
@@ -0,0 +1,120 @@
+/* eslint-disable no-restricted-globals */
+import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@latest'
+
+class MyTextToSpeechPipeline {
+  static task = 'text-to-speech'
+  static instance = null
+
+  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
+    try {
+      // Try WebGPU first
+      this.instance = await pipeline(this.task, model, {
+        dtype,
+        device: 'webgpu',
+        progress_callback,
+        quantized: false
+      })
+      return this.instance
+    } catch (webgpuError) {
+      // Fallback to WASM if WebGPU fails
+      if (progress_callback) {
+        progress_callback({
+          status: 'fallback',
+          message: 'WebGPU failed, falling back to WASM'
+        })
+      }
+      try {
+        this.instance = await pipeline(this.task, model, {
+          dtype,
+          device: 'wasm',
+          progress_callback,
+          quantized: false
+        })
+        return this.instance
+      } catch (wasmError) {
+        throw new Error(
+          `Both WebGPU and WASM failed. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`
+        )
+      }
+    }
+  }
+}
+
+// Listen for messages from the main thread
+self.addEventListener('message', async (event) => {
+  try {
+    const { type, model, dtype, text, config } = event.data
+
+    if (!model) {
+      self.postMessage({
+        status: 'error',
+        output: 'No model provided'
+      })
+      return
+    }
+
+    // Retrieve the pipeline. This will download the model if not already cached.
+    const synthesizer = await MyTextToSpeechPipeline.getInstance(
+      model,
+      dtype,
+      (x) => {
+        self.postMessage({ status: 'loading', output: x })
+      }
+    )
+
+    if (type === 'load') {
+      self.postMessage({
+        status: 'ready',
+        output: `Model ${model}, dtype ${dtype} loaded`
+      })
+      return
+    }
+
+    if (type === 'synthesize') {
+      if (!text || typeof text !== 'string' || text.trim() === '') {
+        self.postMessage({
+          status: 'error',
+          output: 'No text provided for synthesis'
+        })
+        return
+      }
+
+      const options = {}
+
+      // Add speaker embeddings if provided
+      if (config?.speakerEmbeddings) {
+        try {
+          const response = await fetch(config.speakerEmbeddings)
+          if (response.ok) {
+            const embeddings = await response.arrayBuffer()
+            options.speaker_embeddings = new Float32Array(embeddings)
+          }
+        } catch (error) {
+          console.warn('Failed to load speaker embeddings:', error)
+          // Continue without speaker embeddings
+        }
+      }
+
+      try {
+        const output = await synthesizer(text.trim(), options)
+
+        self.postMessage({
+          status: 'output',
+          output: {
+            audio: Array.from(output.audio),
+            sampling_rate: output.sampling_rate
+          }
+        })
+
+        self.postMessage({ status: 'ready' })
+      } catch (error) {
+        throw error
+      }
+    }
+  } catch (error) {
+    self.postMessage({
+      status: 'error',
+      output: error.message || 'An error occurred during text-to-speech synthesis'
+    })
+  }
+})
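
For reference, a minimal sketch of how the main thread can drive this worker. The message shape mirrors the TextToSpeechWorkerInput type and the status values handled above; the { type: 'module' } worker option (needed because the worker uses an ESM import) and the example model id are assumptions, not part of this commit.

const ttsWorker = new Worker('/workers/text-to-speech.js', { type: 'module' }) // module worker assumed

ttsWorker.addEventListener('message', (e: MessageEvent) => {
  const { status, output } = e.data
  if (status === 'output') {
    // The worker posts audio as a plain number[]; rebuild the Float32Array for playback
    const audio = new Float32Array(output.audio)
    console.log(`${audio.length} samples at ${output.sampling_rate} Hz`)
  } else if (status === 'error') {
    console.error(output)
  }
})

ttsWorker.postMessage({
  type: 'synthesize',
  text: 'Hello from the main thread',
  model: 'Xenova/speecht5_tts', // hypothetical example model id
  dtype: 'fp32',
  config: {
    speakerEmbeddings:
      'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'
  }
})
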
src/components/AudioPlayer.tsx ADDED
@@ -0,0 +1,434 @@
+import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
+import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
+
+interface AudioPlayerProps {
+  audio: Float32Array
+  samplingRate: number
+  text: string
+  index: number
+}
+
+function createWavBuffer(
+  audioData: Float32Array,
+  sampleRate: number
+): ArrayBuffer {
+  const length = audioData.length
+  const buffer = new ArrayBuffer(44 + length * 2)
+  const view = new DataView(buffer)
+
+  // WAV header
+  const writeString = (offset: number, string: string) => {
+    for (let i = 0; i < string.length; i++) {
+      view.setUint8(offset + i, string.charCodeAt(i))
+    }
+  }
+
+  writeString(0, 'RIFF')
+  view.setUint32(4, 36 + length * 2, true)
+  writeString(8, 'WAVE')
+  writeString(12, 'fmt ')
+  view.setUint32(16, 16, true)
+  view.setUint16(20, 1, true)
+  view.setUint16(22, 1, true)
+  view.setUint32(24, sampleRate, true)
+  view.setUint32(28, sampleRate * 2, true)
+  view.setUint16(32, 2, true)
+  view.setUint16(34, 16, true)
+  writeString(36, 'data')
+  view.setUint32(40, length * 2, true)
+
+  // Convert float32 to int16
+  let offset = 44
+  for (let i = 0; i < length; i++) {
+    const sample = Math.max(-1, Math.min(1, audioData[i]))
+    view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true)
+    offset += 2
+  }
+
+  return buffer
+}
+
+interface CustomAudioVisualizerProps {
+  audio: Float32Array
+  isPlaying: boolean
+  currentTime: number
+  duration: number
+  height?: number
+}
+
+function CustomAudioVisualizer({
+  audio,
+  isPlaying,
+  currentTime,
+  duration,
+  height = 80
+}: CustomAudioVisualizerProps) {
+  const canvasRef = useRef<HTMLCanvasElement>(null)
+  const containerRef = useRef<HTMLDivElement>(null)
+
+  // Memoize expensive calculations with smoothing
+  const waveformData = useMemo(() => {
+    if (!audio || audio.length === 0) return []
+
+    const samples = Math.min(audio.length, 1600) // Fixed high resolution
+    const step = audio.length / samples
+    const data = []
+
+    for (let i = 0; i < samples; i++) {
+      const sample = Math.abs(audio[Math.floor(i * step)])
+      data.push(sample)
+    }
+
+    // Apply smoothing filter
+    const smoothedData = []
+    const smoothingWindow = 3
+    for (let i = 0; i < data.length; i++) {
+      let sum = 0
+      let count = 0
+      for (
+        let j = Math.max(0, i - smoothingWindow);
+        j <= Math.min(data.length - 1, i + smoothingWindow);
+        j++
+      ) {
+        sum += data[j]
+        count++
+      }
+      smoothedData.push(sum / count)
+    }
+
+    return smoothedData
+  }, [audio])
+
+  // Add resize observer for true responsiveness
+  const [containerWidth, setContainerWidth] = useState(800)
+
+  useEffect(() => {
+    const container = containerRef.current
+    if (!container) return
+
+    const resizeObserver = new ResizeObserver((entries) => {
+      for (const entry of entries) {
+        setContainerWidth(entry.contentRect.width)
+      }
+    })
+
+    resizeObserver.observe(container)
+    setContainerWidth(container.getBoundingClientRect().width)
+
+    return () => resizeObserver.disconnect()
+  }, [])
+
+  useEffect(() => {
+    const canvas = canvasRef.current
+    const container = containerRef.current
+    if (!canvas || !container || waveformData.length === 0) return
+
+    const ctx = canvas.getContext('2d')
+    if (!ctx) return
+
+    // Use state-tracked container width for responsive behavior
+    const displayWidth = containerWidth
+    const displayHeight = height
+
+    // Set high DPI for sharper rendering
+    const dpr = window.devicePixelRatio || 1
+
+    canvas.width = displayWidth * dpr
+    canvas.height = displayHeight * dpr
+    canvas.style.width = `${displayWidth}px`
+    canvas.style.height = `${displayHeight}px`
+
+    ctx.scale(dpr, dpr)
+
+    const samples = waveformData.length
+
+    // Clear canvas
+    ctx.clearRect(0, 0, displayWidth, displayHeight)
+
+    // Enable smoothing for better quality
+    ctx.imageSmoothingEnabled = true
+    ctx.imageSmoothingQuality = 'high'
+
+    // Draw waveform bars with full width and logarithmic scaling
+    const actualBars = Math.min(samples, displayWidth) // Use full width
+    const barWidth = displayWidth / actualBars
+    const gap = Math.max(0, barWidth * 0.1)
+    const effectiveBarWidth = Math.max(1, barWidth - gap)
+
+    for (let i = 0; i < actualBars; i++) {
+      const x = i * barWidth
+      const dataIndex = Math.floor((i / actualBars) * samples)
+      const sample = waveformData[dataIndex] || 0
+
+      // Logarithmic scaling for better speech visualization
+      // This makes quiet sounds (silence) very small and speech much more prominent
+      const minThreshold = 0.01 // Silence threshold
+      const logSample =
+        sample > minThreshold
+          ? Math.log10(sample * 9 + 1) // Log scale: log10(sample * 9 + 1) ranges from 0 to 1
+          : sample * 0.1 // Very quiet for near-silence
+
+      const amplified = Math.pow(logSample * 3, 1.5) // Further emphasize speech
+      const barHeight = Math.max(2, amplified * displayHeight * 0.9)
+
+      const y = (displayHeight - barHeight) / 2
+
+      let barColor = '#6B7280'
+      if (duration > 0) {
+        const timePosition = (i / actualBars) * duration
+        if (isPlaying && timePosition <= currentTime) {
+          barColor = '#3B82F6'
+        } else if (isPlaying) {
+          barColor = '#9CA3AF'
+        }
+      }
+
+      ctx.fillStyle = barColor
+      ctx.fillRect(x, y, effectiveBarWidth, barHeight)
+
+      // Optional: Add rounded corners if supported
+      if (typeof ctx.roundRect === 'function') {
+        ctx.clearRect(x, y, effectiveBarWidth, barHeight)
+        ctx.beginPath()
+        const radius = Math.min(effectiveBarWidth / 4, 2)
+        ctx.roundRect(x, y, effectiveBarWidth, barHeight, radius)
+        ctx.fill()
+      }
+    }
+
+    // Draw progress line with gradient
+    if (isPlaying && duration > 0 && currentTime >= 0) {
+      const progressX = Math.min(
+        (currentTime / duration) * displayWidth,
+        displayWidth
+      )
+
+      // Create gradient for progress line
+      const gradient = ctx.createLinearGradient(0, 0, 0, displayHeight)
+      gradient.addColorStop(0, '#EF4444')
+      gradient.addColorStop(0.5, '#DC2626')
+      gradient.addColorStop(1, '#EF4444')
+
+      ctx.strokeStyle = gradient
+      ctx.lineWidth = 3
+      ctx.lineCap = 'round'
+      ctx.beginPath()
+      ctx.moveTo(progressX, 4)
+      ctx.lineTo(progressX, displayHeight - 4)
+      ctx.stroke()
+    }
+  }, [waveformData, isPlaying, currentTime, duration, height, containerWidth])
+
+  return (
+    <div ref={containerRef} className="w-full">
+      <canvas
+        ref={canvasRef}
+        className="w-full block"
+        style={{
+          width: '100%',
+          height: `${height}px`,
+          maxWidth: '100%',
+          display: 'block'
+        }}
+      />
+    </div>
+  )
+}
+
+function AudioPlayer({ audio, samplingRate, text, index }: AudioPlayerProps) {
+  const [isPlaying, setIsPlaying] = useState(false)
+  const [currentTime, setCurrentTime] = useState(0)
+  const [duration, setDuration] = useState(0)
+
+  const audioRef = useRef<HTMLAudioElement>(null)
+  const audioContextRef = useRef<AudioContext | null>(null)
+  const sourceRef = useRef<AudioBufferSourceNode | null>(null)
+  const startTimeRef = useRef<number>(0)
+  const animationFrameRef = useRef<number | null>(null)
+  const visualizerRef = useRef<HTMLCanvasElement>(null)
+
+  const stopAudio = useCallback(() => {
+    if (sourceRef.current) {
+      sourceRef.current.stop()
+      sourceRef.current = null
+    }
+    if (animationFrameRef.current) {
+      cancelAnimationFrame(animationFrameRef.current)
+      animationFrameRef.current = null
+    }
+    setIsPlaying(false)
+    setCurrentTime(0)
+  }, [])
+
+  const playAudio = useCallback(async () => {
+    if (!audio || audio.length === 0) return
+
+    // Stop current audio if playing
+    if (isPlaying) {
+      stopAudio()
+      return
+    }
+
+    try {
+      // Create audio context if it doesn't exist
+      if (!audioContextRef.current) {
+        audioContextRef.current = new (window.AudioContext ||
+          (window as any).webkitAudioContext)()
+      }
+
+      const audioContext = audioContextRef.current
+
+      // Resume audio context if suspended
+      if (audioContext.state === 'suspended') {
+        await audioContext.resume()
+      }
+
+      // Create audio buffer
+      const audioBuffer = audioContext.createBuffer(
+        1,
+        audio.length,
+        samplingRate
+      )
+      audioBuffer.getChannelData(0).set(audio)
+
+      // Create audio source
+      const source = audioContext.createBufferSource()
+      source.buffer = audioBuffer
+      source.connect(audioContext.destination)
+      sourceRef.current = source
+
+      const audioDuration = audio.length / samplingRate
+      setDuration(audioDuration)
+      setIsPlaying(true)
+      startTimeRef.current = audioContext.currentTime
+
+      // Update current time during playback
+      const updateTime = () => {
+        if (sourceRef.current && audioContextRef.current) {
+          const elapsed =
+            audioContextRef.current.currentTime - startTimeRef.current
+          const newCurrentTime = Math.min(elapsed, audioDuration)
+          setCurrentTime(newCurrentTime)
+
+          if (elapsed < audioDuration) {
+            animationFrameRef.current = requestAnimationFrame(updateTime)
+          } else {
+            // Audio finished naturally
+            setIsPlaying(false)
+            setCurrentTime(0)
+            sourceRef.current = null
+            animationFrameRef.current = null
+          }
+        }
+      }
+      animationFrameRef.current = requestAnimationFrame(updateTime)
+
+      source.onended = () => {
+        if (animationFrameRef.current) {
+          cancelAnimationFrame(animationFrameRef.current)
+          animationFrameRef.current = null
+        }
+        setIsPlaying(false)
+        setCurrentTime(0)
+        sourceRef.current = null
+      }
+
+      source.start()
+    } catch (error) {
+      console.error('Error playing audio:', error)
+      setIsPlaying(false)
+      setCurrentTime(0)
+    }
+  }, [audio, samplingRate, isPlaying, stopAudio])
+
+  // Cleanup animation frame on component unmount
+  useEffect(() => {
+    return () => {
+      if (animationFrameRef.current) {
+        cancelAnimationFrame(animationFrameRef.current)
+      }
+    }
+  }, [])
+
+  const downloadAudio = useCallback(() => {
+    if (!audio || audio.length === 0) return
+
+    try {
+      const wavBuffer = createWavBuffer(audio, samplingRate)
+      const blob = new Blob([wavBuffer], { type: 'audio/wav' })
+      const url = URL.createObjectURL(blob)
+
+      const a = document.createElement('a')
+      a.href = url
+      a.download = `tts-output-${index + 1}.wav`
+      document.body.appendChild(a)
+      a.click()
+      document.body.removeChild(a)
+      URL.revokeObjectURL(url)
+    } catch (error) {
+      console.error('Error downloading audio:', error)
+    }
+  }, [audio, samplingRate, index])

+  return (
+    <div className="border border-gray-200 rounded-lg p-4 bg-gray-50">
+      <div className="mb-3">
+        <p className="text-sm text-gray-700 font-medium mb-2">Prompt:</p>
+        <p className="text-sm text-gray-600 italic bg-white p-2 rounded border">
+          "{text}"
+        </p>
+      </div>
+
+      <div className="mb-3">
+        <div className="w-full border border-gray-200 rounded bg-gray-50 overflow-hidden">
+          {audio && audio.length > 0 ? (
+            <CustomAudioVisualizer
+              audio={audio}
+              isPlaying={isPlaying}
+              currentTime={currentTime}
+              duration={duration}
+              height={80}
+            />
+          ) : (
+            <div className="w-full h-20 flex items-center justify-center">
+              <span className="text-gray-400 text-sm">Loading waveform...</span>
+            </div>
+          )}
+        </div>
+      </div>
+
+      <div className="flex items-center gap-2">
+        <button
+          onClick={playAudio}
+          className="flex items-center gap-1 px-3 py-1 bg-blue-500 hover:bg-blue-600 text-white rounded text-sm transition-colors"
+        >
+          {isPlaying ? (
+            <>
+              <Square className="w-4 h-4" />
+              Stop
+            </>
+          ) : (
+            <>
+              <Play className="w-4 h-4" />
+              Play
+            </>
+          )}
+        </button>
+        <button
+          onClick={downloadAudio}
+          className="flex items-center gap-1 px-3 py-1 bg-green-500 hover:bg-green-600 text-white rounded text-sm transition-colors"
+        >
+          <Download className="w-4 h-4" />
+          Download
+        </button>
+        {duration > 0 && (
+          <span className="text-xs text-gray-500 ml-2">
+            {currentTime.toFixed(1)}s / {duration.toFixed(1)}s
+          </span>
+        )}
+      </div>
+    </div>
+  )
+}
+
+export default AudioPlayer
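
The createWavBuffer helper above writes a standard 44-byte PCM WAV header for 16-bit mono audio: the byte rate at offset 28 is sampleRate * 2 (one channel, two bytes per sample) and the block align at offset 32 is 2. A minimal sketch, assuming the same DataView layout, that reads those fields back to sanity-check a generated buffer (inspectWavHeader is a hypothetical helper, not part of this commit):

// Hypothetical sanity check for a buffer produced by createWavBuffer above
function inspectWavHeader(buffer: ArrayBuffer): void {
  const view = new DataView(buffer)
  const channels = view.getUint16(22, true)      // expected: 1 (mono)
  const sampleRate = view.getUint32(24, true)    // e.g. 16000 for SpeechT5-style output
  const byteRate = view.getUint32(28, true)      // expected: sampleRate * channels * 2
  const bitsPerSample = view.getUint16(34, true) // expected: 16
  const dataBytes = view.getUint32(40, true)     // sample count * 2
  console.log({ channels, sampleRate, byteRate, bitsPerSample, dataBytes })
}
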
src/components/ModelLoader.tsx CHANGED
@@ -105,7 +105,7 @@ const ModelLoader = () => {
 
     return () => {
       newWorker.removeEventListener('message', onMessageReceived)
-      terminateWorker(pipeline)
+      // terminateWorker(pipeline)
     }
   }, [
     pipeline,

src/components/PipelineLayout.tsx CHANGED
@@ -4,6 +4,7 @@ import { FeatureExtractionProvider } from '../contexts/FeatureExtractionContext'
 import { ZeroShotClassificationProvider } from '../contexts/ZeroShotClassificationContext'
 import { ImageClassificationProvider } from '../contexts/ImageClassificationContext'
 import { TextClassificationProvider } from '../contexts/TextClassificationContext'
+import { TextToSpeechProvider } from '../contexts/TextToSpeechContext'
 
 export const PipelineLayout = ({ children }: { children: React.ReactNode }) => {
   const { pipeline } = useModel()
@@ -30,6 +31,9 @@ export const PipelineLayout = ({ children }: { children: React.ReactNode }) => {
     case 'text-classification':
       return <TextClassificationProvider>{children}</TextClassificationProvider>
 
+    case 'text-to-speech':
+      return <TextToSpeechProvider>{children}</TextToSpeechProvider>
+
     default:
       return <>{children}</>
   }

src/components/PipelineSelector.tsx CHANGED
@@ -12,6 +12,7 @@ export const supportedPipelines = [
   'image-classification',
   'text-generation',
   'text-classification',
+  'text-to-speech',
   'zero-shot-classification'
   // 'summarization',
   // 'translation'

src/components/Sidebar.tsx CHANGED
@@ -79,6 +79,17 @@ const Sidebar = ({
           </Tooltip>
         </span>
       )}
+      {pipeline === 'text-to-speech' && (
+        <span className="flex text-xs text-yellow-500 justify-center text-center">
+          Not fully supported{' '}
+          <Tooltip
+            content="Transformers.js has limited support for text-to-speech"
+            className="transform -translate-x-1/3 break-keep max-w-12"
+          >
+            <CircleQuestionMark className="inline w-4 h-4 ml-1" />
+          </Tooltip>
+        </span>
+      )}
     </div>
     <PipelineSelector pipeline={pipeline} setPipeline={setPipeline} />
   </div>

src/components/pipelines/TextToSpeech.tsx ADDED
@@ -0,0 +1,207 @@
+import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
+import { Play, Square, Download, Eraser, Loader2, Volume2 } from 'lucide-react'
+import { TextToSpeechWorkerInput, WorkerMessage } from '../../types'
+import { useModel } from '../../contexts/ModelContext'
+import {
+  useTextToSpeech,
+  AudioResult
+} from '../../contexts/TextToSpeechContext'
+import AudioPlayer from '../AudioPlayer'
+
+const SAMPLE_TEXTS = [
+  'Hello, this is a sample text for text-to-speech synthesis.',
+  'Transformers.js makes it easy to run machine learning models in the browser.',
+  'The quick brown fox jumps over the lazy dog.',
+  'Text-to-speech technology converts written text into spoken words using artificial intelligence.'
+]
+
+function TextToSpeech() {
+  const {
+    config,
+    audioResults,
+    currentText,
+    setCurrentText,
+    addAudioResult,
+    clearAudioResults
+  } = useTextToSpeech()
+
+  const [isSynthesizing, setIsSynthesizing] = useState<boolean>(false)
+
+  const {
+    activeWorker,
+    status,
+    modelInfo,
+    hasBeenLoaded,
+    selectedQuantization
+  } = useModel()
+
+  const handleSynthesize = useCallback(() => {
+    if (!currentText.trim() || !modelInfo || !activeWorker || isSynthesizing)
+      return
+
+    setIsSynthesizing(true)
+
+    const message: TextToSpeechWorkerInput = {
+      type: 'synthesize',
+      text: currentText.trim(),
+      model: modelInfo.id,
+      dtype: selectedQuantization ?? 'fp32',
+      config: {
+        speakerEmbeddings: config.speakerEmbeddings
+      }
+    }
+
+    activeWorker.postMessage(message)
+  }, [
+    currentText,
+    modelInfo,
+    activeWorker,
+    config,
+    isSynthesizing,
+    selectedQuantization
+  ])
+
+  useEffect(() => {
+    if (!activeWorker) return
+
+    const onMessageReceived = (e: MessageEvent<WorkerMessage>) => {
+      const { status, output } = e.data
+      if (status === 'output' && output) {
+        setIsSynthesizing(false)
+        const audioResult = {
+          audio: new Float32Array(output.audio),
+          sampling_rate: output.sampling_rate
+        }
+        addAudioResult(currentText, audioResult)
+      } else if (status === 'ready' || status === 'error') {
+        setIsSynthesizing(false)
+      }
+    }
+
+    activeWorker.addEventListener('message', onMessageReceived)
+    return () => activeWorker.removeEventListener('message', onMessageReceived)
+  }, [activeWorker, currentText, addAudioResult])
+
+  const handleKeyPress = (e: React.KeyboardEvent) => {
+    if (e.key === 'Enter' && !e.shiftKey) {
+      e.preventDefault()
+      handleSynthesize()
+    }
+  }
+
+  const busy = status !== 'ready' || isSynthesizing
+
+  return (
+    <div className="flex flex-col min-h-[30dvh] max-h-[calc(100dvh-128px)] w-full p-4">
+      <div className="flex items-center justify-between mb-4">
+        <h1 className="text-2xl font-bold">Text to Speech</h1>
+        <button
+          onClick={clearAudioResults}
+          className="p-2 bg-red-100 hover:bg-red-200 rounded-lg transition-colors"
+          title="Clear All Audio"
+        >
+          <Eraser className="w-4 h-4" />
+        </button>
+      </div>
+
+      <div className="mb-4">
+        <label className="block text-sm font-medium text-gray-700 mb-2">
+          Enter text to synthesize:
+        </label>
+        <textarea
+          value={currentText}
+          onChange={(e) => setCurrentText(e.target.value)}
+          onKeyPress={handleKeyPress}
+          placeholder="Enter your text here... (Press Enter to synthesize, Shift+Enter for new line)"
+          className="w-full p-3 border border-gray-300 rounded-lg resize-none focus:outline-hidden focus:ring-2 focus:ring-blue-500 focus:border-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
+          rows={4}
+          disabled={!hasBeenLoaded || isSynthesizing}
+        />
+      </div>
+
+      <div className="mb-4">
+        <div className="flex flex-wrap gap-2 mb-2">
+          <span className="text-sm font-medium text-gray-700">
+            Quick samples:
+          </span>
+          {SAMPLE_TEXTS.map((sampleText, index) => (
+            <button
+              key={index}
+              onClick={() => setCurrentText(sampleText)}
+              disabled={!hasBeenLoaded || isSynthesizing}
+              className="px-2 py-1 bg-gray-100 hover:bg-gray-200 disabled:bg-gray-50 disabled:cursor-not-allowed text-gray-700 text-xs rounded transition-colors"
+            >
+              Sample {index + 1}
+            </button>
+          ))}
+        </div>
+      </div>
+
+      <div className="mb-4">
+        <button
+          onClick={handleSynthesize}
+          disabled={!currentText.trim() || busy || !hasBeenLoaded}
+          className="px-6 py-2 bg-green-500 hover:bg-green-600 disabled:bg-gray-300 disabled:cursor-not-allowed text-white rounded-lg transition-colors flex items-center gap-2"
+        >
+          {isSynthesizing ? (
+            <>
+              <Loader2 className="w-4 h-4 animate-spin" />
+              Synthesizing...
+            </>
+          ) : (
+            <>
+              <Volume2 className="w-4 h-4" />
+              Synthesize Speech
+            </>
+          )}
+        </button>
+      </div>
+
+      <div className="flex-1 overflow-y-auto">
+        <div className="mb-2">
+          <label className="block text-sm font-medium text-gray-700">
+            Generated Audio ({audioResults.length}):
+          </label>
+        </div>
+        {audioResults.length > 0 ? (
+          <div className="space-y-3">
+            {audioResults.map((result, index) => (
+              <AudioPlayer
+                key={index}
+                audio={result.audio}
+                samplingRate={result.sampling_rate}
+                text={result.text}
+                index={index}
+              />
+            ))}
+          </div>
+        ) : (
+          <div className="text-gray-500 italic flex flex-col items-center gap-3 p-8 border border-gray-200 rounded-lg bg-gray-50">
+            {isSynthesizing ? (
+              <>
+                <Loader2 className="w-6 h-6 animate-spin text-blue-500" />
+                <span>Synthesizing speech...</span>
+              </>
+            ) : (
+              <>
+                <Volume2 className="w-8 h-8 text-gray-400" />
+                <span>Generated audio will appear here</span>
+                <span className="text-xs text-gray-400">
+                  Enter text and click "Synthesize Speech" to get started
+                </span>
+              </>
+            )}
+          </div>
+        )}
+      </div>
+
+      {!hasBeenLoaded && (
+        <div className="text-center text-gray-500 text-sm mt-2">
+          Please load a model first to start synthesizing speech
+        </div>
+      )}
+    </div>
+  )
+}
+
+export default TextToSpeech

src/components/pipelines/TextToSpeechConfig.tsx ADDED
@@ -0,0 +1,42 @@
+import React from 'react'
+import { Label } from '@/components/ui/label'
+import { Input } from '@/components/ui/input'
+import { useTextToSpeech } from '../../contexts/TextToSpeechContext'
+
+interface TextToSpeechConfigProps {
+  className?: string
+}
+
+const TextToSpeechConfig: React.FC<TextToSpeechConfigProps> = ({
+  className = ''
+}) => {
+  const { config, setConfig } = useTextToSpeech()
+
+  return (
+    <div className={`space-y-4 ${className}`}>
+      <div className="space-y-2">
+        <Label htmlFor="speakerEmbeddings" className="text-sm font-medium">
+          Speaker Embeddings URL
+        </Label>
+        <Input
+          id="speakerEmbeddings"
+          type="url"
+          value={config.speakerEmbeddings}
+          onChange={(e) =>
+            setConfig((prev) => ({
+              ...prev,
+              speakerEmbeddings: e.target.value
+            }))
+          }
+          placeholder="https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
+          className="text-sm"
+        />
+        <p className="text-xs text-gray-500">
+          URL to speaker embeddings file for voice characteristics
+        </p>
+      </div>
+    </div>
+  )
+}
+
+export default TextToSpeechConfig

src/components/ui/label.tsx ADDED
@@ -0,0 +1,22 @@
+import * as React from "react"
+import * as LabelPrimitive from "@radix-ui/react-label"
+
+import { cn } from "@/lib/utils"
+
+function Label({
+  className,
+  ...props
+}: React.ComponentProps<typeof LabelPrimitive.Root>) {
+  return (
+    <LabelPrimitive.Root
+      data-slot="label"
+      className={cn(
+        "flex items-center gap-2 text-sm leading-none font-medium select-none group-data-[disabled=true]:pointer-events-none group-data-[disabled=true]:opacity-50 peer-disabled:cursor-not-allowed peer-disabled:opacity-50",
+        className
+      )}
+      {...props}
+    />
+  )
+}
+
+export { Label }

src/contexts/TextToSpeechContext.tsx ADDED
@@ -0,0 +1,73 @@
+import { createContext, useContext, useState, ReactNode } from 'react'
+
+export interface TextToSpeechConfigState {
+  speakerEmbeddings: string
+}
+
+export interface AudioResult {
+  audio: Float32Array
+  sampling_rate: number
+  text: string
+}
+
+interface TextToSpeechContextType {
+  config: TextToSpeechConfigState
+  setConfig: React.Dispatch<React.SetStateAction<TextToSpeechConfigState>>
+  audioResults: AudioResult[]
+  setAudioResults: React.Dispatch<React.SetStateAction<AudioResult[]>>
+  currentText: string
+  setCurrentText: React.Dispatch<React.SetStateAction<string>>
+  addAudioResult: (text: string, audio: Omit<AudioResult, 'text'>) => void
+  clearAudioResults: () => void
+}
+
+const TextToSpeechContext = createContext<TextToSpeechContextType | undefined>(
+  undefined
+)
+
+export function TextToSpeechProvider({ children }: { children: ReactNode }) {
+  const [config, setConfig] = useState<TextToSpeechConfigState>({
+    speakerEmbeddings:
+      'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'
+  })
+
+  const [audioResults, setAudioResults] = useState<AudioResult[]>([])
+  const [currentText, setCurrentText] = useState<string>('')
+
+  const addAudioResult = (text: string, audio: Omit<AudioResult, 'text'>) => {
+    const fullAudioResult: AudioResult = { ...audio, text }
+    setAudioResults((prev) => [...prev, fullAudioResult])
+  }
+
+  const clearAudioResults = () => {
+    setAudioResults([])
+    setCurrentText('')
+  }
+
+  const value = {
+    config,
+    setConfig,
+    audioResults,
+    setAudioResults,
+    currentText,
+    setCurrentText,
+    addAudioResult,
+    clearAudioResults
+  }
+
+  return (
+    <TextToSpeechContext.Provider value={value}>
+      {children}
+    </TextToSpeechContext.Provider>
+  )
+}
+
+export function useTextToSpeech() {
+  const context = useContext(TextToSpeechContext)
+  if (context === undefined) {
+    throw new Error(
+      'useTextToSpeech must be used within a TextToSpeechProvider'
+    )
+  }
+  return context
+}
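
The useTextToSpeech hook throws outside a provider, so consumers such as TextToSpeech and TextToSpeechConfig must render under the 'text-to-speech' branch added to PipelineLayout. A minimal sketch of that contract, with a hypothetical consumer component and an assumed sibling-file import path:

import { TextToSpeechProvider, useTextToSpeech } from './TextToSpeechContext'

// Hypothetical consumer, for illustration only
function SpeakerEmbeddingsUrl() {
  const { config } = useTextToSpeech() // would throw if rendered outside the provider
  return <span>{config.speakerEmbeddings}</span>
}

export function Example() {
  return (
    <TextToSpeechProvider>
      <SpeakerEmbeddingsUrl />
    </TextToSpeechProvider>
  )
}
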
src/lib/huggingface.ts CHANGED
@@ -176,6 +176,14 @@ const getModelsByPipeline = async (
           !model.id.includes('MiniLM')
       )
       .slice(0, 30)
+  } else if (pipelineTag === 'text-to-speech') {
+    return uniqueModels
+      .filter(
+        (model: ModelInfoResponse) =>
+          !model.tags.includes('style_text_to_speech_2') &&
+          !model.id.includes('qwen2')
+      )
+      .slice(0, 30)
   }
 
   return uniqueModels.slice(0, 30)

src/lib/workerManager.ts CHANGED
@@ -20,6 +20,9 @@ export const getWorker = (pipeline: string) => {
     case 'image-classification':
      workerUrl = `/workers/image-classification.js`
      break
+    case 'text-to-speech':
+      workerUrl = `/workers/text-to-speech.js`
+      break
    default:
      return null
  }

src/types.ts CHANGED
@@ -80,6 +80,16 @@ export interface FeatureExtractionWorkerInput {
   }
 }
 
+export interface TextToSpeechWorkerInput {
+  type: 'synthesize'
+  text: string
+  model: string
+  dtype: QuantizationType
+  config?: {
+    speakerEmbeddings?: string
+  }
+}
+
 export interface ImageClassificationWorkerInput {
   type: 'classify'
   image: string | ImageData | HTMLImageElement | HTMLCanvasElement