Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -14,27 +14,42 @@ model = AutoModel.from_pretrained(
 ).to(device).eval()
 
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+tokenizer.eos_token = "<|im_end|>" # Set EOS token
 
 @spaces.GPU
 def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
-    # Format prompt using
-
-You are a helpful coding assistant.<|im_end|>
-
-{
-
-
+    # Format prompt using ChatML template
+    messages = [
+        {"role": "system", "content": "You are a helpful coding assistant."},
+        {"role": "user", "content": query.strip()},
+        {"role": "assistant", "content": ""} # Start of assistant response
+    ]
+
+    # Apply chat template
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
 
     inputs = tokenizer(prompt, return_tensors="pt")
     input_ids = inputs.input_ids.to(device)
     attention_mask = inputs.attention_mask.to(device)
 
+    # Calculate initial prompt length
+    initial_prompt_len = input_ids.shape[1]
+
+    # Track EOS status
+    eos_detected = False
+
     # Generate with token streaming
     TOKEN_PER_STEP = 1
     steps = max_new_tokens // TOKEN_PER_STEP
 
-
-
+    for i in range(steps):
+        if eos_detected:
+            break
+
         output = model.diffusion_generate(
             input_ids,
             attention_mask=attention_mask,
@@ -48,9 +63,19 @@ You are a helpful coding assistant.<|im_end|>
             alg_temp=0.,
         )
 
+        # Get all new tokens (after initial prompt)
+        new_tokens = output.sequences[0, initial_prompt_len:]
+
+        # Check for EOS token
+        if tokenizer.eos_token_id in new_tokens:
+            eos_index = (new_tokens == tokenizer.eos_token_id).nonzero(as_tuple=True)[0]
+            if eos_index.numel() > 0:
+                new_tokens = new_tokens[:eos_index[0]]
+                eos_detected = True
+
         # Decode new tokens
-
-
+        new_text = tokenizer.decode(
+            new_tokens,
             skip_special_tokens=True
         )
 
@@ -61,9 +86,11 @@ You are a helpful coding assistant.<|im_end|>
             torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
         ], dim=1)
 
-        #
-
-
+        # Yield current output
+        yield new_text.split('<|dlm_pad|>')[0].strip()
+
+        if eos_detected:
+            break
 
 # Create Gradio interface
 demo = gr.Interface(
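The hunks end right at demo = gr.Interface(, so this commit does not show how the streaming generator is wired into the UI. The sketch below is one plausible way a generator like generate_code is typically hooked up in a Gradio app; the component choices, labels, and default values are assumptions rather than code from this Space. Gradio treats a generator passed as fn as a streaming function, so each yield replaces the current output.

# Illustrative sketch only; the interface layout and defaults are assumed, not taken from app.py.
import gradio as gr

demo = gr.Interface(
    fn=generate_code,  # generator function: every yield streams a partial completion to the UI
    inputs=[
        gr.Textbox(label="Coding request", lines=4),
        gr.Slider(0.0, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.0, 1.0, value=0.95, label="Top-p"),
        gr.Slider(16, 1024, value=256, step=16, label="Max new tokens"),
    ],
    outputs=gr.Code(label="Generated code"),
)

if __name__ == "__main__":
    demo.launch()

Because generate_code yields inside the per-step loop, the output component updates after every diffusion step; returning a single string instead would make the UI wait for all max_new_tokens steps to finish.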