Spaces:

Mohaddz
/

speach

Runtime error

App Files Community

Mohaddz committed on Mar 26

Commit

a9aa3c7

verified ·

1 Parent(s): 580bcb7

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -198

app.py CHANGED Viewed

@@ -3,102 +3,66 @@ from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-# Removed snapshot_download as from_pretrained handles caching
 from dotenv import load_dotenv
-import gc # Import garbage collector for memory management
 load_dotenv()
-# --- Global Variables ---
-current_model = None
-current_tokenizer = None
-current_model_name = None
-model_choices = ["Mohaddz/orpheus-3b-0.1-ft-ar", "Mohaddz/orpheus-arabic-exp"]
-default_model_name = "Mohaddz/orpheus-3b-0.1-ft-ar" # Or your preferred default
-# --- End Global Variables ---
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-dtype = torch.bfloat16 if device == "cuda" else torch.float32 # Use float32 on CPU
 print("Loading SNAC model...")
-try:
-    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-    snac_model = snac_model.to(device)
-    print("SNAC model loaded.")
-except Exception as e:
-    print(f"Error loading SNAC model: {e}")
-    snac_model = None # Handle case where SNAC fails
-# --- Model Loading Function ---
-def load_model_and_tokenizer(model_name_to_load, progress=gr.Progress(track_tqdm=True)):
-    global current_model, current_tokenizer, current_model_name, device, dtype
-    if model_name_to_load == current_model_name and current_model is not None:
-        print(f"Model {model_name_to_load} is already loaded.")
-        gr.Info(f"Model {model_name_to_load} is already loaded.")
-        return f"Model {model_name_to_load} already loaded." # Return status message
-    print(f"Unloading previous model if exists...")
-    # Explicitly delete previous model and clear cache to free VRAM
-    if current_model is not None:
-        del current_model
-        current_model = None
-    if current_tokenizer is not None:
-        del current_tokenizer
-        current_tokenizer = None
-    gc.collect() # Run garbage collection
-    if device == "cuda":
-        torch.cuda.empty_cache() # Clear CUDA cache
-    print(f"Loading Orpheus model: {model_name_to_load}...")
-    try:
-        # Use from_pretrained which handles download and caching
-        new_model = AutoModelForCausalLM.from_pretrained(model_name_to_load, torch_dtype=dtype)
-        new_model.to(device)
-        new_tokenizer = AutoTokenizer.from_pretrained(model_name_to_load)
-        # Update global variables
-        current_model = new_model
-        current_tokenizer = new_tokenizer
-        current_model_name = model_name_to_load
-        print(f"Orpheus model {current_model_name} loaded successfully to {device}")
-        gr.Info(f"Model {current_model_name} loaded.")
-        return f"Model {current_model_name} loaded." # Return status message
-    except Exception as e:
-        print(f"Error loading model {model_name_to_load}: {e}")
-        # Reset globals if loading fails
-        current_model = None
-        current_tokenizer = None
-        current_model_name = None
-        gr.Warning(f"Failed to load model {model_name_to_load}. Please try again or select another model.")
-        return f"Error loading {model_name_to_load}." # Return status message
-# --- End Model Loading Function ---
-# Process text prompt (Uses global tokenizer now)
-def process_prompt(prompt, voice, device):
-    if current_tokenizer is None:
-        raise ValueError("Tokenizer not loaded.")
     prompt = f"{voice}: {prompt}"
-    input_ids = current_tokenizer(prompt, return_tensors="pt").input_ids
     start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
-# Parse output tokens to audio (no change needed)
 def parse_output(generated_ids):
     token_to_find = 128257
     token_to_remove = 128258
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
@@ -117,23 +81,19 @@ def parse_output(generated_ids):
         row_length = row.size(0)
         new_length = (row_length // 7) * 7
         trimmed_row = row[:new_length]
-        trimmed_row = [t - 128266 for t in trimmed_row] # Adjust based on actual token IDs if needed
         code_lists.append(trimmed_row)
-    return code_lists[0] if code_lists else [] # Handle empty case
-# Redistribute codes for audio generation (no change needed)
-def redistribute_codes(code_list, snac_model_instance):
-    if not snac_model_instance or not code_list:
-         print("SNAC model not loaded or code list empty.")
-         return None
-    snac_device = next(snac_model_instance.parameters()).device
     layer_1 = []
     layer_2 = []
     layer_3 = []
-    num_frames = len(code_list) // 7 # Use integer division
-    for i in range(num_frames):
         layer_1.append(code_list[7*i])
         layer_2.append(code_list[7*i+1]-4096)
         layer_3.append(code_list[7*i+2]-(2*4096))
@@ -141,190 +101,137 @@ def redistribute_codes(code_list, snac_model_instance):
         layer_2.append(code_list[7*i+4]-(4*4096))
         layer_3.append(code_list[7*i+5]-(5*4096))
         layer_3.append(code_list[7*i+6]-(6*4096))
-    if not layer_1: # Check if any codes were processed
-        print("No valid frames found in code list.")
-        return None
     codes = [
-        torch.tensor(layer_1, device=snac_device).unsqueeze(0),
-        torch.tensor(layer_2, device=snac_device).unsqueeze(0),
-        torch.tensor(layer_3, device=snac_device).unsqueeze(0)
     ]
-    with torch.no_grad():
-        audio_hat = snac_model_instance.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()
-# Main generation function (Uses global model now)
 @spaces.GPU()
-def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
-    global current_model, device # Access globals
-    if current_model is None or current_tokenizer is None:
-        gr.Warning("Orpheus model not loaded. Please select a model and wait for it to load.")
-        return None
-    if snac_model is None:
-        gr.Warning("SNAC vocoder model failed to load. Cannot generate audio.")
-        return None
     if not text.strip():
-        gr.Info("Please enter some text.")
         return None
     try:
         progress(0.1, "Processing text...")
-        input_ids, attention_mask = process_prompt(text, voice, device)
         progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
-            # Make sure generation parameters are appropriate
-            generated_ids = current_model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
-                temperature=max(temperature, 0.01), # Ensure temp is not zero
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
-                eos_token_id=128258, # Make sure this is correct for the models
-                pad_token_id=current_tokenizer.pad_token_id if current_tokenizer.pad_token_id is not None else current_tokenizer.eos_token_id # Use tokenizer's pad/eos token
             )
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
-        if audio_samples is None:
-            gr.Warning("Failed to generate audio samples.")
-            return None
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
-        import traceback
-        traceback.print_exc() # Print full traceback for debugging
-        gr.Error(f"An error occurred during generation: {e}")
         return None
-# --- Load Default Model at Startup ---
-# Moved initial loading to happen *before* launching the UI
-# This ensures a model is ready when the interface appears.
-print("Loading default model...")
-initial_status = load_model_and_tokenizer(default_model_name)
-print(initial_status)
-# --- End Load Default Model ---
 # Examples for the UI
 examples = [
-    # Examples might need adjusting if voices/behavior differ between models
-    ["السلام عليكم كيف حالكم اليوم؟", "tara", 0.6, 0.95, 1.1, 1200],
-    ["أنا نموذج لتحويل النص إلى كلام يمكنه التحدث باللغة العربية.", "dan", 0.7, 0.95, 1.1, 1200],
-    # ["I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.", "emma", 0.6, 0.9, 1.2, 1200] # Keep or remove English examples
 ]
-# Available voices (Might need updating based on your fine-tuned models)
-# You might need different voice lists per model, or just use 'tara'/'dan' if they exist in both
-VOICES = ["tara", "dan", "josh", "emma"] # Adjust as needed
 # Create Gradio interface
 with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     gr.Markdown("""
-    # 🎵 Orpheus Text-to-Speech (Arabic Fine-tuned)
-    Enter your text below and hear it converted to natural-sounding speech.
-    Select the desired fine-tuned model below.
-    """)
-    with gr.Row():
-        # Model Selection Dropdown
-        model_selector = gr.Dropdown(
-            choices=model_choices,
-            value=current_model_name, # Default to the loaded model
-            label="Select Fine-Tuned Model",
-            interactive=True
-        )
-        # Status Textbox (Optional)
-        status_display = gr.Textbox(label="Model Status", value=initial_status, interactive=False)
     with gr.Row():
         with gr.Column(scale=3):
             text_input = gr.Textbox(
-                label="Text to speak (النص)",
-                placeholder="أدخل النص هنا...",
-                lines=5,
-                text_align="right" # Align text right for Arabic
             )
             voice = gr.Dropdown(
-                choices=VOICES,
-                value="tara", # Default voice
-                label="Voice (الصوت)"
             )
-            with gr.Accordion("Advanced Settings (إعدادات متقدمة)", open=False):
                 temperature = gr.Slider(
                     minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                    label="Temperature (درجة الحرارة)",
                     info="Higher values (0.7-1.0) create more expressive but less stable speech"
                 )
                 top_p = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                    label="Top P",
                     info="Nucleus sampling threshold"
                 )
                 repetition_penalty = gr.Slider(
                     minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                    label="Repetition Penalty (عقوبة التكرار)",
                     info="Higher values discourage repetitive patterns"
                 )
                 max_new_tokens = gr.Slider(
                     minimum=100, maximum=2000, value=1200, step=100,
-                    label="Max Length (الطول الأقصى)",
                     info="Maximum length of generated audio (in tokens)"
                 )
             with gr.Row():
-                submit_btn = gr.Button("Generate Speech (توليد الكلام)", variant="primary")
-                clear_btn = gr.Button("Clear (مسح)")
         with gr.Column(scale=2):
-            audio_output = gr.Audio(label="Generated Speech (الكلام المولّد)", type="numpy")
     # Set up examples
     gr.Examples(
         examples=examples,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output,
-        fn=generate_speech, # Function to call for examples
-        cache_examples=False, # Disable caching if models change behavior
-    )
-    # --- Event Handlers ---
-    # Trigger model loading when dropdown changes
-    model_selector.change(
-        fn=load_model_and_tokenizer,
-        inputs=[model_selector],
-        outputs=[status_display] # Update status display
     )
-    # Generate speech button click
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
-    # Clear button click
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
-    # --- End Event Handlers ---
 # Launch the app
 if __name__ == "__main__":
-    demo.queue().launch(share=False) # Removed ssr_mode=False, queue is usually enough

 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
+snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+snac_model = snac_model.to(device)
+model_name = "Mohaddz/orpheus-arabic-exp"
+# Download only model config and safetensors
+snapshot_download(
+    repo_id=model_name,
+    allow_patterns=[
+        "config.json",
+        "*.safetensors",
+        "model.safetensors.index.json",
+    ],
+    ignore_patterns=[
+        "optimizer.pt",
+        "pytorch_model.bin",
+        "training_args.bin",
+        "scheduler.pt",
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "special_tokens_map.json",
+        "vocab.json",
+        "merges.txt",
+        "tokenizer.*"
+    ]
+)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+model.to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+print(f"Orpheus model loaded to {device}")
+# Process text prompt
+def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
     start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
+    # No padding needed for single input
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
+# Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
     token_to_remove = 128258
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
         row_length = row.size(0)
         new_length = (row_length // 7) * 7
         trimmed_row = row[:new_length]
+        trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
+    return code_lists[0]  # Return just the first one for single sample
+# Redistribute codes for audio generation
+def redistribute_codes(code_list, snac_model):
+    device = next(snac_model.parameters()).device  # Get the device of SNAC model
     layer_1 = []
     layer_2 = []
     layer_3 = []
+    for i in range((len(code_list)+1)//7):
         layer_1.append(code_list[7*i])
         layer_2.append(code_list[7*i+1]-4096)
         layer_3.append(code_list[7*i+2]-(2*4096))
         layer_2.append(code_list[7*i+4]-(4*4096))
         layer_3.append(code_list[7*i+5]-(5*4096))
         layer_3.append(code_list[7*i+6]-(6*4096))
+    # Move tensors to the same device as the SNAC model
     codes = [
+        torch.tensor(layer_1, device=device).unsqueeze(0),
+        torch.tensor(layer_2, device=device).unsqueeze(0),
+        torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
+    audio_hat = snac_model.decode(codes)
+    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
+# Main generation function
 @spaces.GPU()
+def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
+        input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
+            generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
+                temperature=temperature,
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
+                eos_token_id=128258,
             )
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
 # Examples for the UI
 examples = [
+    ["Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.", "tara", 0.6, 0.95, 1.1, 1200],
+    ["I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!", "dan", 0.7, 0.95, 1.1, 1200],
+    ["I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.", "emma", 0.6, 0.9, 1.2, 1200]
 ]
+# Available voices
+VOICES = ["tara", "dan", "josh", "emma"]
 # Create Gradio interface
 with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     gr.Markdown("""
+    # 🎵 [Orpheus Text-to-Speech](https://github.com/canopyai/Orpheus-TTS)
+    Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.
+    ## Tips for better prompts:
+    - Add paralinguistic elements like `<chuckle>`, `<sigh>`, or `uhm` for more human-like speech.
+    - Longer text prompts generally work better than very short phrases
+    - Adjust the temperature slider for more varied (higher) or consistent (lower) speech patterns
+    """)
     with gr.Row():
         with gr.Column(scale=3):
             text_input = gr.Textbox(
+                label="Text to speak",
+                placeholder="Enter your text here...",
+                lines=5
             )
             voice = gr.Dropdown(
+                choices=VOICES,
+                value="tara",
+                label="Voice"
             )
+            with gr.Accordion("Advanced Settings", open=False):
                 temperature = gr.Slider(
                     minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                    label="Temperature",
                     info="Higher values (0.7-1.0) create more expressive but less stable speech"
                 )
                 top_p = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                    label="Top P",
                     info="Nucleus sampling threshold"
                 )
                 repetition_penalty = gr.Slider(
                     minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                    label="Repetition Penalty",
                     info="Higher values discourage repetitive patterns"
                 )
                 max_new_tokens = gr.Slider(
                     minimum=100, maximum=2000, value=1200, step=100,
+                    label="Max Length",
                     info="Maximum length of generated audio (in tokens)"
                 )
             with gr.Row():
+                submit_btn = gr.Button("Generate Speech", variant="primary")
+                clear_btn = gr.Button("Clear")
         with gr.Column(scale=2):
+            audio_output = gr.Audio(label="Generated Speech", type="numpy")
     # Set up examples
     gr.Examples(
         examples=examples,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output,
+        fn=generate_speech,
+        cache_examples=True,
     )
+    # Set up event handlers
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
+    demo.queue().launch(share=False, ssr_mode=False)