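"""Gradio demo: streaming code generation with apple/DiffuCoder-7B-cpGRPO.

DiffuCoder is a masked-diffusion code LLM; its custom diffusion_generate()
method (provided via trust_remote_code) is called one token at a time below
so partial output can be streamed to the UI.
"""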
import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
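# `spaces` provides the @spaces.GPU decorator used on Hugging Face ZeroGPU
# Spaces to request a GPU per call; outside a Space it should behave as a no-op.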

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format prompt using chat template
    prompt = f"""<|im_start|>system
You are a helpful coding assistant.<|im_end|>
<|im_start|>user
{query.strip()}<|im_end|>
<|im_start|>assistant
"""
    
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    # Generate one token per diffusion_generate call so partial output
    # can be streamed to the UI after every step
    TOKEN_PER_STEP = 1
    steps = max_new_tokens // TOKEN_PER_STEP
    
    full_output = ""
    for _ in range(steps):
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,  # one diffusion step per emitted token
            temperature=temperature,
            top_p=top_p,
            alg="entropy",  # commit the most confident (lowest-entropy) tokens first
            alg_temp=0.,  # no extra randomness in the commit order
        )
        
        # Decode new tokens
        new_tokens = tokenizer.decode(
            output.sequences[0, -TOKEN_PER_STEP:].tolist(),
            skip_special_tokens=True
        )
        
        # Update input for next step
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask, 
            torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
        ], dim=1)
        
        # Append to the running output and stream the text before the pad marker
        full_output += new_tokens
        yield full_output.split('<|dlm_pad|>')[0].strip()
        if '<|dlm_pad|>' in full_output:
            break  # stop once the model starts emitting diffusion padding
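
# Hypothetical helper (not part of the original app): drain the streaming
# generator outside the UI, e.g. for a quick local smoke test.
def run_once(query: str) -> str:
    result = ""
    for partial in generate_code(query):
        result = partial  # keep only the latest streamed snapshot
    return result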

# Create Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3,
                   placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model",
    examples=[
        # Each example supplies all four input values:
        # (query, temperature, top_p, max_new_tokens)
        ["Write a Python function to calculate factorial", 0.4, 0.95, 256],
        ["Create a function to merge two sorted lists", 0.4, 0.95, 256],
        ["How to reverse a string in JavaScript?", 0.4, 0.95, 256]
    ]
)

# Run the demo
if __name__ == "__main__":
    demo.queue().launch()
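
# Assumed local usage (dependencies: gradio, spaces, torch, transformers;
# a CUDA GPU is strongly recommended for a 7B model), if saved as app.py:
#   python app.py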