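"""Gradio demo: streaming code generation with apple/DiffuCoder-7B-cpGRPO.

DiffuCoder is a masked-diffusion code LLM; its custom diffusion_generate()
method (provided via trust_remote_code) is called one token at a time below
so partial output can be streamed to the UI.
"""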
import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
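# `spaces` provides the @spaces.GPU decorator used on Hugging Face ZeroGPU
# Spaces to request a GPU per call; outside a Space it should behave as a no-op.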

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format prompt using chat template
    prompt = f"""<|im_start|>system
You are a helpful coding assistant.<|im_end|>
<|im_start|>user
{query.strip()}<|im_end|>
<|im_start|>assistant
"""
    
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    # Generate one token per diffusion_generate call so partial output
    # can be streamed to the UI after every step
    TOKEN_PER_STEP = 1
    steps = max_new_tokens // TOKEN_PER_STEP
    
    full_output = ""
    for _ in range(steps):
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,  # one diffusion step per emitted token
            temperature=temperature,
            top_p=top_p,
            alg="entropy",  # commit the most confident (lowest-entropy) tokens first
            alg_temp=0.,  # no extra randomness in the commit order
        )
        
        # Decode new tokens
        new_tokens = tokenizer.decode(
            output.sequences[0, -TOKEN_PER_STEP:].tolist(),
            skip_special_tokens=True
        )
        
        # Update input for next step
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask, 
            torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
        ], dim=1)
        
        # Append to the running output and stream the text before the pad marker
        full_output += new_tokens
        yield full_output.split('<|dlm_pad|>')[0].strip()
        if '<|dlm_pad|>' in full_output:
            break  # stop once the model starts emitting diffusion padding
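
# Hypothetical helper (not part of the original app): drain the streaming
# generator outside the UI, e.g. for a quick local smoke test.
def run_once(query: str) -> str:
    result = ""
    for partial in generate_code(query):
        result = partial  # keep only the latest streamed snapshot
    return result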

# Create Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3,
                   placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model",
    examples=[
        # Each example supplies all four input values:
        # (query, temperature, top_p, max_new_tokens)
        ["Write a Python function to calculate factorial", 0.4, 0.95, 256],
        ["Create a function to merge two sorted lists", 0.4, 0.95, 256],
        ["How to reverse a string in JavaScript?", 0.4, 0.95, 256]
    ]
)

# Run the demo
if __name__ == "__main__":
    demo.queue().launch()
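
# Assumed local usage (dependencies: gradio, spaces, torch, transformers;
# a CUDA GPU is strongly recommended for a 7B model), if saved as app.py:
#   python app.py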