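# Hugging Face Space: streaming code generation with Apple's
# DiffuCoder-7B-cpGRPO, a masked-diffusion coding LLM. The model's custom
# diffusion_generate() method is loaded via trust_remote_code, and output is
# streamed token by token through a Gradio generator function.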
import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"  # use the ChatML end-of-turn marker as EOS
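

# The generator below streams output by running one diffusion step per new
# token: each diffusion_generate call commits a single token, and the
# cumulative decoded text is yielded so Gradio updates the textbox live.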
@spaces.GPU  # request ZeroGPU hardware per call (assumes a ZeroGPU Space)
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format prompt using the ChatML template. add_generation_prompt=True
    # already appends the opening assistant tag, so no empty assistant
    # message is needed (including one would emit a spurious empty turn).
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
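
    # With the model's ChatML-style template the rendered prompt looks
    # roughly like this (illustrative, not copied from the model card):
    #   <|im_start|>system
    #   You are a helpful coding assistant.<|im_end|>
    #   <|im_start|>user
    #   <query><|im_end|>
    #   <|im_start|>assistant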

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Length of the prompt, used to slice out newly generated tokens
    initial_prompt_len = input_ids.shape[1]

    # Generate with token streaming: one diffusion step per new token
    TOKEN_PER_STEP = 1
    steps = max_new_tokens // TOKEN_PER_STEP
    for _ in range(steps):
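        # Single-step denoising call that commits one new token. The alg and
        # alg_temp arguments follow the Dream-style sampling options used by
        # DiffuCoder (an assumption from its model card): "entropy" unmasks
        # the most confident position first, and alg_temp=0. keeps that
        # ordering deterministic.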
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )
        # All tokens generated so far (everything after the initial prompt)
        new_tokens = output.sequences[0, initial_prompt_len:]

        # Truncate at the first EOS token, if one has appeared
        eos_detected = False
        if tokenizer.eos_token_id in new_tokens:
            eos_index = (new_tokens == tokenizer.eos_token_id).nonzero(as_tuple=True)[0]
            if eos_index.numel() > 0:
                new_tokens = new_tokens[:eos_index[0]]
                eos_detected = True

        # Decode the cumulative completion
        new_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

        # Feed the full sequence back in and grow the mask by the step size
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, TOKEN_PER_STEP, dtype=attention_mask.dtype, device=device)
        ], dim=1)

        # Yield the text so far, dropping any trailing diffusion pad tokens
        yield new_text.split('<|dlm_pad|>')[0].strip()
        if eos_detected:
            break


# Create Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3,
                   placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model",
    examples=[
        ["Write a Python function to calculate factorial"],
        ["Create a function to merge two sorted lists"],
        ["How to reverse a string in JavaScript?"]
    ]
)
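
# A minimal client-side sketch for calling the Space programmatically
# (assumes gradio_client is installed and the demo is reachable at this URL;
# "/predict" is Gradio's default api_name for a single-function Interface):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   print(client.predict(
#       "Write a Python function to calculate factorial",
#       0.4,   # temperature
#       0.95,  # top_p
#       256,   # max_new_tokens
#       api_name="/predict",
#   ))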

# Run the demo; queue() is needed so the generator can stream partial output
if __name__ == "__main__":
    demo.queue().launch()