import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"  # DiffuCoder uses the ChatML end-of-turn token as EOS


@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format the prompt with the ChatML template; add_generation_prompt=True
    # already appends the assistant header, so no empty assistant turn is needed.
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Everything past this index is newly generated text
    initial_prompt_len = input_ids.shape[1]
    eos_detected = False

    # Stream token by token: run one diffusion step per new token and feed the
    # extended sequence back in as the next prompt.
    TOKEN_PER_STEP = 1
    steps = max_new_tokens // TOKEN_PER_STEP
    for _ in range(steps):
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )

        # All tokens generated so far (everything after the initial prompt)
        new_tokens = output.sequences[0, initial_prompt_len:]

        # Truncate at the first EOS token, if one has appeared
        eos_positions = (new_tokens == tokenizer.eos_token_id).nonzero(as_tuple=True)[0]
        if eos_positions.numel() > 0:
            new_tokens = new_tokens[:eos_positions[0]]
            eos_detected = True

        new_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

        # Feed the extended sequence back in and grow the mask by one token
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
        ], dim=1)

        # Strip the diffusion pad marker and yield the accumulated text
        yield new_text.split('<|dlm_pad|>')[0].strip()
        if eos_detected:
            break


# Create Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3, placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model",
    examples=[
        ["Write a Python function to calculate factorial"],
        ["Create a function to merge two sorted lists"],
        ["How to reverse a string in JavaScript?"]
    ]
)

# Run the demo; queue() is required so generator (streaming) outputs work
if __name__ == "__main__":
    demo.queue().launch()
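
# A minimal sketch of exercising generate_code directly, outside the Gradio UI —
# an assumption for local smoke-testing, not part of the Space itself. Because
# generate_code is a generator, iterating it prints each streamed snapshot of
# the accumulated output; the prompt string is only illustrative.
#
#   for partial in generate_code("Write a Python function to calculate factorial"):
#       print(partial)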