Spaces:
Runtime error
Runtime error
""" | |
Copyright (c) 2025 Xposed73 | |
All rights reserved. | |
This file is part of the Manim Voiceover project. | |
""" | |
import hashlib | |
import json | |
import numpy as np | |
from pathlib import Path | |
from manim_voiceover.services.base import SpeechService | |
from kokoro_onnx import Kokoro | |
from manim_voiceover.helper import remove_bookmarks, wav2mp3 | |
from scipy.io.wavfile import write as write_wav | |
from src.config.config import Config | |
class KokoroService(SpeechService):
    """Speech service class for kokoro_self (using text_to_speech via Kokoro ONNX).

    Audio is synthesized to a temporary ``.wav`` file by ``self.engine``
    (``text_to_speech`` unless a custom callable is supplied), converted to
    ``.mp3`` with ``wav2mp3`` and described by a dict returned from
    ``generate_from_text``.
    """

    def __init__(
        self,
        engine=None,
        model_path: str = Config.KOKORO_MODEL_PATH,
        voices_path: str = Config.KOKORO_VOICES_PATH,
        voice: str = Config.KOKORO_DEFAULT_VOICE,
        speed: float = Config.KOKORO_DEFAULT_SPEED,
        lang: str = Config.KOKORO_DEFAULT_LANG,
        **kwargs,
    ):
        """Load the Kokoro model and store the synthesis settings.

        Parameters:
            engine: Optional callable invoked as
                ``engine(text=..., output_file=..., voice_name=..., speed=..., lang=...)``.
                Defaults to ``self.text_to_speech``.
            model_path (str): Path handed to ``Kokoro`` as the model file.
            voices_path (str): Path handed to ``Kokoro`` as the voices file.
            voice (str): Voice name passed to the engine.
            speed (float): Speed value passed to the engine.
            lang (str): Language value passed to the engine.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
        """
        self.kokoro = Kokoro(model_path, voices_path)
        self.voice = voice
        self.speed = speed
        self.lang = lang
        if engine is None:
            engine = self.text_to_speech  # Default to local function
        self.engine = engine
        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """Return a SHA-256 hex digest identifying ``input_data``.

        Parameters:
            input_data (dict): JSON-serializable input description
                (e.g., text, voice, etc.).

        Returns:
            str: Hex digest of the key-sorted JSON encoding of ``input_data``.
        """
        # Keys are sorted so that equal dicts always produce the same digest.
        data_str = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(data_str.encode("utf-8")).hexdigest()

    def text_to_speech(self, text, output_file, voice_name, speed, lang):
        """Generate speech with Kokoro ONNX and save it as a 16-bit PCM ``.wav``.

        The samples are peak-normalized to the range [-1, 1] before the
        conversion to ``int16`` so that the audio is audible.

        Parameters:
            text: Text to synthesize.
            output_file: Destination path of the ``.wav`` file.
            voice_name: Voice passed to ``Kokoro.create``.
            speed: Speed passed to ``Kokoro.create``.
            lang: Language passed to ``Kokoro.create``.

        Returns:
            The ``output_file`` argument, unchanged.
        """
        samples, sample_rate = self.kokoro.create(
            text, voice=voice_name, speed=speed, lang=lang
        )
        samples = np.asarray(samples)

        # np.max raises on an empty array, so treat "no samples" like silence.
        peak = np.max(np.abs(samples)) if samples.size else 0
        if peak > 0:
            samples = samples / peak

        # Convert to 16-bit integer PCM format
        samples = (samples * 32767).astype("int16")

        write_wav(output_file, sample_rate, samples)
        print(f"Saved at {output_file}")
        return output_file

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """Synthesize ``text`` to an ``.mp3`` file, reusing a cached result if any.

        Parameters:
            text (str): Text to speak; may contain ``<bookmark .../>`` tags,
                which are stripped before synthesis.
            cache_dir (str): Directory for the audio files. Defaults to
                ``self.cache_dir``.
            path (str): Optional audio file name relative to ``cache_dir``.
                Defaults to ``<hash of the input data>.mp3``.
            **kwargs: Accepted for compatibility with callers that forward
                extra keyword arguments; not used here.

        Returns:
            dict: The cached entry when one matches, otherwise a dict with the
            keys ``input_text``, ``input_data`` and ``original_audio``.
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # Bookmark tags are markup, not speech: keep them out of the engine input.
        spoken_text = remove_bookmarks(text)

        # Everything that influences the rendered audio belongs in the cache
        # key; without "speed" a changed speed would return stale audio.
        input_data = {
            "input_text": spoken_text,
            "service": "kokoro_self",
            "voice": self.voice,
            "lang": self.lang,
            "speed": self.speed,
        }
        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_data_hash(input_data) + ".mp3"
        else:
            audio_path = path

        mp3_audio_path = Path(cache_dir) / audio_path
        # Swap only the final suffix; str.replace would also rewrite ".mp3"
        # occurring elsewhere in the path and leaves suffix-less names as is.
        wav_audio_path = mp3_audio_path.with_suffix(".wav")

        # Generate .wav file using the configured engine
        self.engine(
            text=spoken_text,
            output_file=str(wav_audio_path),
            voice_name=self.voice,
            speed=self.speed,
            lang=self.lang,
        )

        # Convert .wav to .mp3
        wav2mp3(str(wav_audio_path), str(mp3_audio_path))

        # Remove the intermediate .wav file. NOTE(review): wav2mp3 may already
        # delete it depending on the helper's defaults; missing_ok covers both.
        if wav_audio_path != mp3_audio_path:
            wav_audio_path.unlink(missing_ok=True)

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }