Add StyleTTS2 support with KokoroTTS integration
- public/workers/text-to-speech.js +114 -31
- src/components/AudioPlayer.tsx +11 -2
- src/components/ModelLoader.tsx +4 -3
- src/components/ModelSelector.tsx +22 -11
- src/components/Sidebar.tsx +0 -11
- src/components/pipelines/TextToSpeech.tsx +16 -2
- src/components/pipelines/TextToSpeechConfig.tsx +62 -21
- src/contexts/TextToSpeechContext.tsx +11 -4
- src/lib/huggingface.ts +19 -3
- src/types.ts +6 -1
public/workers/text-to-speech.js (CHANGED)

@@ -1,5 +1,6 @@
 /* eslint-disable no-restricted-globals */
 import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@latest'
+import { KokoroTTS } from 'https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.js'

 class MyTextToSpeechPipeline {
   static task = 'text-to-speech'
@@ -40,10 +41,66 @@ class MyTextToSpeechPipeline {
   }
 }

+class MyKokoroTTSPipeline {
+  static instance = null
+
+  static async getInstance(model, dtype = 'fp32', progress_callback = null) {
+    try {
+      const device = 'webgpu'
+      if (progress_callback) {
+        progress_callback({
+          status: 'loading',
+          message: `Loading Kokoro TTS model with ${device} device`
+        })
+      }
+
+      this.instance = await KokoroTTS.from_pretrained(model, {
+        dtype,
+        device,
+        progress_callback: progress_callback
+          ? (data) => {
+              progress_callback({
+                status: 'loading',
+                ...data
+              })
+            }
+          : null
+      })
+      return this.instance
+    } catch (webgpuError) {
+      // Fallback to WASM if WebGPU fails
+      if (progress_callback) {
+        progress_callback({
+          status: 'fallback',
+          message: 'WebGPU failed, falling back to WASM'
+        })
+      }
+      try {
+        this.instance = await KokoroTTS.from_pretrained(model, {
+          dtype,
+          device: 'wasm',
+          progress_callback: progress_callback
+            ? (data) => {
+                progress_callback({
+                  status: 'loading',
+                  ...data
+                })
+              }
+            : null
+        })
+        return this.instance
+      } catch (wasmError) {
+        throw new Error(
+          `Both WebGPU and WASM failed for Kokoro TTS. WebGPU error: ${webgpuError.message}. WASM error: ${wasmError.message}`
+        )
+      }
+    }
+  }
+}
+
 self.addEventListener('message', async (event) => {
   try {
-    const { type, model, dtype, text, config } = event.data
+    const { type, model, dtype, text, isStyleTTS2, config } = event.data

     if (!model) {
       self.postMessage({
@@ -53,19 +110,31 @@ self.addEventListener('message', async (event) => {
       return
     }

+    let synthesizer
+    if (isStyleTTS2) {
+      // Use Kokoro TTS for StyleTTS2 models
+      synthesizer = await MyKokoroTTSPipeline.getInstance(
+        model,
+        dtype || 'q8',
+        (x) => {
+          self.postMessage({ status: 'loading', output: x })
+        }
+      )
+    } else {
+      // Use standard transformers pipeline
+      synthesizer = await MyTextToSpeechPipeline.getInstance(
+        model,
+        dtype || 'fp32',
+        (x) => {
+          self.postMessage({ status: 'loading', output: x })
+        }
+      )
+    }

     if (type === 'load') {
       self.postMessage({
         status: 'ready',
-        output: `Model ${model}, dtype ${dtype} loaded`
+        output: `Model ${model}${isStyleTTS2 ? ' StyleTTS2' : ''}, dtype ${dtype} loaded`
       })
       return
     }
@@ -79,31 +148,44 @@ self.addEventListener('message', async (event) => {
       return
     }

-            options.speaker_embeddings = new Float32Array(embeddings)
-          }
-        }
-          // Continue without speaker embeddings
-        }
-      }
+    try {
+      let output
+
+      if (isStyleTTS2) {
+        const options = {}
+
+        options.voice = config.voice
+        const audioResult = await synthesizer.generate(text.trim(), options)
+
+        output = {
+          audio: Array.from(audioResult.audio),
+          sampling_rate: audioResult.sampling_rate || 24000 // Default for Kokoro
+        }
+      } else {
+        const options = {}
+
+        if (config?.speakerEmbeddings) {
+          try {
+            const response = await fetch(config.speakerEmbeddings)
+            if (response.ok) {
+              const embeddings = await response.arrayBuffer()
+              options.speaker_embeddings = new Float32Array(embeddings)
+            }
+          } catch (error) {
+            console.warn('Failed to load speaker embeddings:', error)
+          }
+        }
+
+        const result = await synthesizer(text.trim(), options)
+        output = {
+          audio: Array.from(result.audio),
+          sampling_rate: result.sampling_rate
+        }
+      }

     self.postMessage({
       status: 'output',
-      output
-        audio: Array.from(output.audio),
-        sampling_rate: output.sampling_rate
-      }
+      output
     })

     self.postMessage({ status: 'ready' })
@@ -114,7 +196,8 @@ self.addEventListener('message', async (event) => {
   } catch (error) {
     self.postMessage({
       status: 'error',
-      output:
+      output:
+        error.message || 'An error occurred during text-to-speech synthesis'
     })
   }
 })
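For orientation, here is a minimal sketch (not part of this commit) of how the main thread can drive the updated worker, using only the message shapes visible in the diff above. The worker URL, the module worker type, the 'synthesize' type value, and the model/voice ids are assumptions for illustration; the real wiring lives in ModelLoader.tsx and TextToSpeech.tsx below.

// Sketch only: the worker replies with status 'loading', 'ready', 'output', or 'error'.
const ttsWorker = new Worker('/workers/text-to-speech.js', { type: 'module' }) // assumed URL

ttsWorker.addEventListener('message', (event) => {
  const { status, output } = event.data
  if (status === 'output') {
    // output.audio arrives as a plain number[]; TextToSpeech.tsx re-wraps it
    const samples = new Float32Array(output.audio)
    console.log(`Got ${samples.length / output.sampling_rate}s of audio`)
  } else if (status === 'error') {
    console.error(output)
  }
})

// Preload a StyleTTS2/Kokoro model (mirrors the message built in ModelLoader.tsx)
ttsWorker.postMessage({
  type: 'load',
  model: 'onnx-community/Kokoro-82M-ONNX', // example model id
  dtype: 'q8',
  isStyleTTS2: true
})

// Synthesize with a chosen voice (fields mirror TextToSpeech.tsx; the non-'load'
// type value is not shown in this diff, so 'synthesize' here is an assumption)
ttsWorker.postMessage({
  type: 'synthesize',
  model: 'onnx-community/Kokoro-82M-ONNX',
  dtype: 'q8',
  isStyleTTS2: true,
  text: 'Hello, this is a sample text for text-to-speech synthesis.',
  config: { voice: 'af_bella' } // example Kokoro voice id
})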
src/components/AudioPlayer.tsx (CHANGED)

@@ -6,6 +6,7 @@ interface AudioPlayerProps {
   samplingRate: number
   text: string
   index: number
+  voice?: string
 }

 function createWavBuffer(
@@ -235,7 +236,13 @@ function CustomAudioVisualizer({
   )
 }

-function AudioPlayer({
+function AudioPlayer({
+  audio,
+  samplingRate,
+  text,
+  index,
+  voice
+}: AudioPlayerProps) {
   const [isPlaying, setIsPlaying] = useState(false)
   const [currentTime, setCurrentTime] = useState(0)
   const [duration, setDuration] = useState(0)
@@ -373,7 +380,9 @@ function AudioPlayer({ audio, samplingRate, text, index }: AudioPlayerProps) {
   return (
     <div className="border border-gray-200 rounded-lg p-4 bg-gray-50">
       <div className="mb-3">
-        <p className="text-sm text-gray-700 font-medium mb-2">
+        <p className="text-sm text-gray-700 font-medium mb-2">
+          Prompt{voice ? ` (${voice})` : ''}:
+        </p>
         <p className="text-sm text-gray-600 italic bg-white p-2 rounded border">
           "{text}"
         </p>
src/components/ModelLoader.tsx (CHANGED)

@@ -8,7 +8,6 @@ import { Alert, AlertDescription } from './ui/alert'
 const ModelLoader = () => {
   const [showAlert, setShowAlert] = useState(false)
   const [alertMessage, setAlertMessage] = useState<React.ReactNode>('')
-  const [lastModel, setLastModel] = useState<string | null>(null)
   const {
     modelInfo,
     selectedQuantization,
@@ -134,7 +133,9 @@ const ModelLoader = () => {
     const message = {
       type: 'load',
       model: modelInfo.name,
-      dtype: selectedQuantization ?? 'fp32'
+      dtype: selectedQuantization ?? 'fp32',
+      isStyleTTS2:
+        modelInfo.isStyleTTS2 || modelInfo.name.includes('kitten-tts') || false // text-to-speech only
     }
     activeWorker?.postMessage(message)
   }, [modelInfo, selectedQuantization, activeWorker])
@@ -149,7 +150,7 @@ const ModelLoader = () => {

       <div className="flex items-center justify-between space-x-4">
         <div className="flex items-center space-x-2">
-          {modelInfo.supportedQuantizations.length
+          {modelInfo.supportedQuantizations.length >= 1 ? (
             <>
               <span className="text-xs text-gray-600 font-medium">Quant:</span>

src/components/ModelSelector.tsx (CHANGED)

@@ -20,6 +20,7 @@ import {
   X
 } from 'lucide-react'
 import Tooltip from './Tooltip'
+import { ModelInfoResponse } from '@/types'

 type SortOption = 'likes' | 'downloads' | 'createdAt' | 'name'

@@ -80,9 +81,9 @@ function ModelSelector() {

   // Function to fetch detailed model info and set as selected
   const fetchAndSetModelInfo = useCallback(
-    async (
+    async (model: ModelInfoResponse, isCustom: boolean = false) => {
       try {
-        const modelInfoResponse = await getModelInfo(
+        const modelInfoResponse = await getModelInfo(model.id, pipeline)

         let parameters = 0
         if (modelInfoResponse.safetensors) {
@@ -95,9 +96,11 @@ function ModelSelector() {
             0
         }

+        const allTags = [...model.tags, ...modelInfoResponse.tags]
+
         const modelInfo = {
-          id:
-          name: modelInfoResponse.id ||
+          id: model.id,
+          name: modelInfoResponse.id || model.id,
           architecture:
             modelInfoResponse.config?.architectures?.[0] || 'Unknown',
           parameters,
@@ -112,7 +115,9 @@ function ModelSelector() {
           hasChatTemplate: Boolean(
             modelInfoResponse.config?.tokenizer_config?.chat_template
           ),
+          isStyleTTS2: Boolean(allTags.includes('style_text_to_speech_2')),
+          widgetData: modelInfoResponse.widgetData,
+          voices: modelInfoResponse.voices
         }
         setModelInfo(modelInfo)
         setIsCustomModel(isCustom)
@@ -143,12 +148,12 @@ function ModelSelector() {
   useEffect(() => {
     if (models.length > 0 && !isCustomModel && !modelInfo) {
       const firstModel = sortedModels[0]
-      fetchAndSetModelInfo(firstModel
+      fetchAndSetModelInfo(firstModel, false)
     }
   }, [models, sortedModels, fetchAndSetModelInfo, isCustomModel, modelInfo])

-  const handleModelSelect = (
-    fetchAndSetModelInfo(
+  const handleModelSelect = (model: ModelInfoResponse) => {
+    fetchAndSetModelInfo(model, false)
   }

   const handleSortChange = (newSortBy: SortOption) => {
@@ -170,7 +175,13 @@ function ModelSelector() {
     setCustomModelError('')

     try {
-      await fetchAndSetModelInfo(
+      await fetchAndSetModelInfo(
+        {
+          id: customModelName.trim(),
+          tags: []
+        } as unknown as ModelInfoResponse,
+        true
+      )
       setShowCustomInput(false)
       setCustomModelName('')
     } catch (error) {
@@ -186,7 +197,7 @@ function ModelSelector() {
     setIsCustomModel(false)
     // Load the first model from the list
     if (sortedModels.length > 0) {
-      fetchAndSetModelInfo(sortedModels[0]
+      fetchAndSetModelInfo(sortedModels[0], false)
     }
   }

@@ -281,7 +292,7 @@ function ModelSelector() {
         <div className="relative">
           <Listbox
             value={selectedModel}
-            onChange={(model) => handleModelSelect(model
+            onChange={(model) => handleModelSelect(model)}
           >
             <div className="relative">
               <ListboxButton className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-hidden focus:ring-2 focus:ring-blue-500 focus:border-transparent bg-white text-left flex items-center justify-between">
src/components/Sidebar.tsx (CHANGED)

@@ -79,17 +79,6 @@ const Sidebar = ({
         </Tooltip>
       </span>
     )}
-    {pipeline === 'text-to-speech' && (
-      <span className="flex text-xs text-yellow-500 justify-center text-center">
-        Not fully supported{' '}
-        <Tooltip
-          content="Transformers.js has limited support for text-to-speech"
-          className="transform -translate-x-1/3 break-keep max-w-12"
-        >
-          <CircleQuestionMark className="inline w-4 h-4 ml-1" />
-        </Tooltip>
-      </span>
-    )}
   </div>
   <PipelineSelector pipeline={pipeline} setPipeline={setPipeline} />
 </div>
src/components/pipelines/TextToSpeech.tsx (CHANGED)

@@ -7,6 +7,7 @@ import {
   AudioResult
 } from '../../contexts/TextToSpeechContext'
 import AudioPlayer from '../AudioPlayer'
+import { preview } from 'vite'

 const SAMPLE_TEXTS = [
   'Hello, this is a sample text for text-to-speech synthesis.',
@@ -18,6 +19,7 @@ const SAMPLE_TEXTS = [
 function TextToSpeech() {
   const {
     config,
+    setConfig,
     audioResults,
     currentText,
     setCurrentText,
@@ -46,8 +48,10 @@ function TextToSpeech() {
       text: currentText.trim(),
       model: modelInfo.id,
       dtype: selectedQuantization ?? 'fp32',
+      isStyleTTS2: modelInfo.isStyleTTS2 ?? false,
       config: {
-        speakerEmbeddings: config.speakerEmbeddings
+        speakerEmbeddings: config.speakerEmbeddings,
+        voice: config.voice
       }
     }

@@ -72,7 +76,7 @@ function TextToSpeech() {
           audio: new Float32Array(output.audio),
           sampling_rate: output.sampling_rate
         }
-        addAudioResult(currentText, audioResult)
+        addAudioResult(currentText, audioResult, config.voice)
       } else if (status === 'ready' || status === 'error') {
         setIsSynthesizing(false)
       }
@@ -82,6 +86,15 @@ function TextToSpeech() {
     return () => activeWorker.removeEventListener('message', onMessageReceived)
   }, [activeWorker, currentText, addAudioResult])

+  useEffect(() => {
+    if (!modelInfo) return
+    if (modelInfo && modelInfo?.voices.length > 0)
+      setConfig((prev) => ({
+        ...prev,
+        voice: modelInfo.voices[0]
+      }))
+  }, [modelInfo])
+
   const handleKeyPress = (e: React.KeyboardEvent) => {
     if (e.key === 'Enter' && !e.shiftKey) {
       e.preventDefault()
@@ -172,6 +185,7 @@ function TextToSpeech() {
               samplingRate={result.sampling_rate}
               text={result.text}
               index={index}
+              voice={result.voice}
             />
           ))}
         </div>
src/components/pipelines/TextToSpeechConfig.tsx (CHANGED)

@@ -2,6 +2,14 @@ import React from 'react'
 import { Label } from '@/components/ui/label'
 import { Input } from '@/components/ui/input'
 import { useTextToSpeech } from '../../contexts/TextToSpeechContext'
+import { useModel } from '@/contexts/ModelContext'
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue
+} from '../ui/select'

 interface TextToSpeechConfigProps {
   className?: string
@@ -10,31 +18,64 @@ interface TextToSpeechConfigProps {
 const TextToSpeechConfig: React.FC<TextToSpeechConfigProps> = ({
   className = ''
 }) => {
+  const { modelInfo } = useModel()
   const { config, setConfig } = useTextToSpeech()

   return (
     <div className={`space-y-4 ${className}`}>
+      {modelInfo?.isStyleTTS2 ? (
+        <div className="space-y-2 h-1/3">
+          <p className="text-xs text-gray-500">Style TTS2 Model</p>
+          <Label htmlFor="speakerEmbeddings" className="text-sm font-medium">
+            Select Voice
+          </Label>
+          <Select
+            value={config.voice}
+            onValueChange={(value) =>
+              setConfig((prev) => ({
+                ...prev,
+                voice: value
+              }))
+            }
+          >
+            <SelectTrigger className="w-full text-sm xl:text-base">
+              <SelectValue placeholder="Select a voice" />
+            </SelectTrigger>
+            <SelectContent className="max-h-96">
+              {modelInfo.voices.map((voice) => (
+                <SelectItem key={voice} value={voice} className="text-sm">
+                  {voice}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+          <p className="text-xs text-gray-500">
+            Voice to use for text-to-speech synthesis.
+          </p>
+        </div>
+      ) : (
+        <div className="space-y-2">
+          <Label htmlFor="speakerEmbeddings" className="text-sm font-medium">
+            Speaker Embeddings URL
+          </Label>
+          <Input
+            id="speakerEmbeddings"
+            type="url"
+            value={config.speakerEmbeddings}
+            onChange={(e) =>
+              setConfig((prev) => ({
+                ...prev,
+                speakerEmbeddings: e.target.value
+              }))
+            }
+            placeholder="https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
+            className="text-sm"
+          />
+          <p className="text-xs text-gray-500">
+            URL to speaker embeddings file for voice characteristics
+          </p>
+        </div>
+      )}
     </div>
   )
 }
src/contexts/TextToSpeechContext.tsx (CHANGED)

@@ -1,13 +1,15 @@
 import { createContext, useContext, useState, ReactNode } from 'react'

 export interface TextToSpeechConfigState {
-  speakerEmbeddings
+  speakerEmbeddings?: string
+  voice?: string
 }

 export interface AudioResult {
   audio: Float32Array
   sampling_rate: number
   text: string
+  voice?: string
 }

 interface TextToSpeechContextType {
@@ -28,14 +30,19 @@ const TextToSpeechContext = createContext<TextToSpeechContextType | undefined>(
 export function TextToSpeechProvider({ children }: { children: ReactNode }) {
   const [config, setConfig] = useState<TextToSpeechConfigState>({
     speakerEmbeddings:
-      'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'
+      'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin',
+    voice: undefined
   })

   const [audioResults, setAudioResults] = useState<AudioResult[]>([])
   const [currentText, setCurrentText] = useState<string>('')

-  const addAudioResult = (
+  const addAudioResult = (
+    text: string,
+    audio: Omit<AudioResult, 'text'>,
+    voice?: string
+  ) => {
+    const fullAudioResult: AudioResult = { ...audio, text, voice }
     setAudioResults((prev) => [...prev, fullAudioResult])
   }

src/lib/huggingface.ts (CHANGED)

@@ -75,6 +75,20 @@ const getModelInfo = async (
     return getNumericValue(a) - getNumericValue(b)
   })

+  if (
+    uniqueSupportedQuantizations.length === 0 &&
+    siblingFiles.some((file) => file.endsWith('_quantized.onnx'))
+  ) {
+    uniqueSupportedQuantizations.push('q8')
+  }
+
+  const voices: string[] = []
+  siblingFiles
+    .filter((file) => file.startsWith('voices/') && !file.endsWith('af.bin'))
+    .forEach((file) => {
+      voices.push(file.split('/')[1].split('.')[0])
+    })
+
   // Fetch README content
   const fetchReadme = async (modelId: string): Promise<string> => {
     try {
@@ -111,7 +125,8 @@ const getModelInfo = async (
         incompatibilityReason,
         supportedQuantizations:
           uniqueSupportedQuantizations as QuantizationType[],
-        readme
+        readme,
+        voices
       }
     }
   }
@@ -123,7 +138,8 @@ const getModelInfo = async (
     isCompatible,
     incompatibilityReason,
     supportedQuantizations: uniqueSupportedQuantizations as QuantizationType[],
-    readme
+    readme,
+    voices
   }
 }

@@ -180,7 +196,7 @@ const getModelsByPipeline = async (
   return uniqueModels
     .filter(
       (model: ModelInfoResponse) =>
-        !model.tags.includes('style_text_to_speech_2') &&
+        // !model.tags.includes('style_text_to_speech_2') &&
        !model.id.includes('qwen2')
     )
     .slice(0, 30)
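A quick illustration of what the new voice discovery in getModelInfo yields. The sibling paths here are assumptions, and the map call is equivalent to the filter-plus-forEach in the diff above:

// Assumed repo sibling paths: only 'voices/*.bin' files (except the bare 'af.bin')
// contribute a voice id, exactly as the filter above does.
const siblingFiles = [
  'onnx/model.onnx',
  'voices/af.bin',
  'voices/af_bella.bin',
  'voices/am_adam.bin'
]

const voices = siblingFiles
  .filter((file) => file.startsWith('voices/') && !file.endsWith('af.bin'))
  .map((file) => file.split('/')[1].split('.')[0])

console.log(voices) // ['af_bella', 'am_adam']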
src/types.ts (CHANGED)

@@ -85,8 +85,10 @@ export interface TextToSpeechWorkerInput {
   text: string
   model: string
   dtype: QuantizationType
+  isStyleTTS2: boolean
   config?: {
     speakerEmbeddings?: string
+    voice?: string
   }
 }

@@ -157,8 +159,10 @@ export interface ModelInfo {
   supportedQuantizations: QuantizationType[]
   baseId?: string
   readme?: string
-  hasChatTemplate: boolean
+  hasChatTemplate: boolean // text-generation only
+  isStyleTTS2: boolean // text-to-speech only
   widgetData?: any
+  voices: string[] // text-to-speech only
 }

 export interface ModelInfoResponse {
@@ -202,4 +206,5 @@ export interface ModelInfoResponse {
   likes: number
   downloads: number
   readme?: string
+  voices: string[] // text-to-speech only
 }