import streamlit as st
import torch
import librosa
import soundfile as sf
from TTS.api import TTS
import wget
import os
from io import BytesIO
import tempfile
class VoiceConverter:
    def __init__(self):
        # Run inference on GPU when available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()
    def load_models(self):
        # Download pre-trained model weights on first run
        models_dir = "pretrained_models"
        os.makedirs(models_dir, exist_ok=True)
        # Load the Coqui YourTTS model (multilingual, multi-speaker)
        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
        # Fetch the VITS generator checkpoint if it is not cached locally
        vits_path = os.path.join(models_dir, "vits_female.pth")
        if not os.path.exists(vits_path):
            wget.download(
                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
                vits_path
            )
        # NOTE: torch.load on this file yields a raw checkpoint (a dict of
        # tensors), not a model object; the .eval() here and the
        # voice_conversion() call in convert_voice assume the checkpoint has
        # been wrapped in a matching VITS generator (see the sketch below).
        self.vits_model = torch.load(vits_path, map_location=self.device)
        self.vits_model.eval()
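    # A hedged sketch of what restoring this checkpoint actually involves:
    # G_953000.pth is a state-dict checkpoint, so a complete implementation
    # would rebuild the matching VITS generator first. SynthesizerTrn and the
    # "model" key are assumptions based on typical VITS exports, not code
    # shipped with this app:
    #
    #   net_g = SynthesizerTrn(...)                 # hypothetical generator class
    #   ckpt = torch.load(vits_path, map_location=self.device)
    #   net_g.load_state_dict(ckpt["model"])
    #   net_g.eval()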
    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
        # Load audio at its native sample rate, then resample to the
        # 22.05 kHz rate the VITS checkpoint expects
        wav, sr = librosa.load(audio_path, sr=None)
        if sr != 22050:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
            sr = 22050
        # Shape (1, samples) tensor on the inference device
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)
        # Convert the speaker identity with VITS (assumes the checkpoint has
        # been wrapped in a generator exposing voice_conversion; see above)
        with torch.no_grad():
            converted = self.vits_model.voice_conversion(
                wav_tensor,
                speaker_id=speaker_id
            )
        # Re-synthesize with Coqui TTS to impose the requested emotion.
        # NOTE: in Coqui's published API, tts_with_vc expects text as its
        # first argument and most releases accept no emotion kwarg, and its
        # output rate follows the underlying VC model rather than sr; the
        # call is kept from the original and should be treated as an
        # assumption.
        wav_path = "temp.wav"
        # Drop the batch dim so soundfile sees (samples,), not (1, samples)
        sf.write(wav_path, converted.squeeze(0).cpu().numpy(), sr)
        emotional_wav = self.tts.tts_with_vc(
            wav_path,
            speaker_wav=wav_path,
            emotion=emotion
        )
        return emotional_wav, sr
def save_audio(audio_data, sr):
    # Serialize to an in-memory WAV and rewind so Streamlit can read it
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format='WAV')
    buffer.seek(0)
    return buffer
# Streamlit Interface
st.title("AI Voice Converter - Female Voice Transformation")
# Model selection
model_type = st.selectbox(
"Select Voice Model",
["VITS Female", "YourTTS Female", "Mixed Model"]
)
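# NOTE: model_type is displayed but not yet routed into VoiceConverter;
# only the VITS + YourTTS pipeline below runs regardless of this choice.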
# Voice character selection
voice_character = st.selectbox(
"Select Voice Character",
["Anime Female", "Natural Female", "Young Female", "Mature Female"]
)
# Emotion selection
emotion = st.selectbox(
"Select Emotion",
["Happy", "Sad", "Angry", "Neutral", "Excited"]
)
# Additional parameters
with st.expander("Advanced Settings"):
    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)
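# The advanced sliders are surfaced in the UI but not yet applied in
# convert_voice. A minimal sketch of honoring the pitch slider, assuming the
# converted audio is available as a NumPy array `wav` at sample rate `sr`:
#
#   wav = librosa.effects.pitch_shift(wav, sr=sr, n_steps=pitch_adjust)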
# File upload
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
if uploaded_file is not None:
    # Initialize converter (reloads models on every rerun; see note below)
    converter = VoiceConverter()
    # Save the uploaded file to disk so librosa can read it
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name
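    # Note: constructing VoiceConverter on every upload re-downloads and
    # reloads both models. A sketch using Streamlit's resource cache
    # (st.cache_resource, available in Streamlit >= 1.18) to load them once:
    #
    #   @st.cache_resource
    #   def get_converter():
    #       return VoiceConverter()
    #
    #   converter = get_converter()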
    if st.button("Convert Voice"):
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                # Map the selected voice character to a VITS speaker ID
                speaker_id = {
                    "Anime Female": 0,
                    "Natural Female": 1,
                    "Young Female": 2,
                    "Mature Female": 3
                }[voice_character]
                # Convert voice
                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=speaker_id,
                    emotion=emotion
                )
                # Serialize to an in-memory WAV buffer
                audio_buffer = save_audio(converted_audio, sr)
                # Display audio player
                st.audio(audio_buffer, format='audio/wav')
                # Download button (pass raw bytes so the buffer's read
                # position after st.audio does not matter)
                st.download_button(
                    label="Download Converted Audio",
                    data=audio_buffer.getvalue(),
                    file_name="ai_converted_voice.wav",
                    mime="audio/wav"
                )
        except Exception as e:
            st.error(f"Error during conversion: {str(e)}")
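        finally:
            # Best-effort cleanup sketch: each Streamlit rerun rewrites
            # tmp_path before it is read, so the temporary upload can be
            # removed once conversion ends
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)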
# Add information about the models
st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality
### Voice Characters:
- **Anime Female**: High-pitched, animated style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice
### Tips for Best Results:
- Use clear audio input with minimal background noise
- Short audio clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Adjust advanced settings for fine-tuning
""")
# Requirements (requirements.txt)
"""
TTS
fairseq
torch
torchaudio
streamlit
librosa
soundfile
numpy
wget
huggingface_hub
"""