Spaces: Running on Zero

Update app.py

app.py CHANGED
```diff
@@ -14,29 +14,29 @@ model = AutoModel.from_pretrained(
 ).to(device).eval()
 
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-tokenizer.eos_token = "<|im_end|>"
+tokenizer.eos_token = "<|im_end|>"
 
 @spaces.GPU
 def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
-    # Format prompt using
+    # Format prompt using chat template
     messages = [
         {"role": "system", "content": "You are a helpful coding assistant."},
-        {"role": "user", "content": query.strip()},
-        {"role": "assistant", "content": ""} # Start of assistant response
+        {"role": "user", "content": query.strip()}
     ]
 
-    # Apply chat template
+    # Apply chat template - this creates the prompt but doesn't include assistant response
     prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
 
+    # Tokenize only the prompt (without any assistant response)
     inputs = tokenizer(prompt, return_tensors="pt")
     input_ids = inputs.input_ids.to(device)
     attention_mask = inputs.attention_mask.to(device)
 
-    # Calculate initial prompt length
+    # Calculate initial prompt length - this is where the assistant response will start
     initial_prompt_len = input_ids.shape[1]
 
     # Track EOS status
```
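The first hunk stops injecting an empty assistant turn into `messages`: with `add_generation_prompt=True`, `apply_chat_template` already terminates the prompt with the assistant header, so the manual turn was redundant. A minimal stand-in for that call, assuming a ChatML-style template (which the `<|im_end|>` eos override suggests; the real template comes from the model's own tokenizer):

```python
messages = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "write a hello world in Python"},
]

def chatml_prompt(msgs, add_generation_prompt=True):
    # Hand-rolled stand-in for tokenizer.apply_chat_template(..., tokenize=False)
    text = "".join(
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in msgs
    )
    if add_generation_prompt:
        # The assistant header is appended here, which is why the old
        # explicit {"role": "assistant", "content": ""} turn was redundant.
        text += "<|im_start|>assistant\n"
    return text

print(chatml_prompt(messages))
```

Everything the model generates after that header belongs to the assistant response, which is why `initial_prompt_len` can mark where it starts.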
```diff
@@ -44,7 +44,10 @@ def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
 
     # Generate with token streaming
     TOKEN_PER_STEP = 1
-    steps = max_new_tokens // TOKEN_PER_STEP
+    steps = min(max_new_tokens // TOKEN_PER_STEP, 512) # Limit to max 512 steps
+
+    # This will accumulate only the assistant's response
+    assistant_response = ""
 
     for i in range(steps):
         if eos_detected:
```
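The second hunk bounds the generation loop and introduces the accumulator the later hunks append to. Since `TOKEN_PER_STEP` stays 1, the cap simply clamps the step count at 512; a quick check of the arithmetic:

```python
TOKEN_PER_STEP = 1  # as in the app
for max_new_tokens in (256, 2048):
    steps = min(max_new_tokens // TOKEN_PER_STEP, 512)
    print(max_new_tokens, "->", steps)  # 256 -> 256, 2048 -> 512
```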
```diff
@@ -63,20 +66,24 @@ def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
             alg_temp=0.,
         )
 
-        # Get
-
+        # Get only the new tokens generated in this step
+        new_token_ids = output.sequences[0, -TOKEN_PER_STEP:]
 
-        # Check for EOS token
-        if tokenizer.eos_token_id in
-
-
-
-
+        # Check for EOS token in the new tokens
+        if tokenizer.eos_token_id in new_token_ids:
+            # If EOS is found, stop after this token
+            eos_detected = True
+            # Remove EOS token from output
+            new_token_ids = new_token_ids[new_token_ids != tokenizer.eos_token_id]
+            if new_token_ids.numel() == 0:
+                # Only EOS was generated, nothing to add
+                break
 
-        # Decode new tokens
+        # Decode only the new tokens
         new_text = tokenizer.decode(
-
-            skip_special_tokens=True
+            new_token_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
         )
 
         # Update input for next step
```
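The third hunk makes the EOS handling explicit: take the last `TOKEN_PER_STEP` ids from `output.sequences`, detect the eos id, and mask it out before decoding. A self-contained toy run of that filtering (the token ids here are made up):

```python
import torch

eos_token_id = 2                          # made-up id; the app uses tokenizer.eos_token_id
new_token_ids = torch.tensor([5301, 2])   # pretend this step produced "...<eos>"
eos_detected = False

if eos_token_id in new_token_ids:         # torch tensors support `in`
    eos_detected = True
    # Boolean mask drops the eos id, exactly as in the hunk above
    new_token_ids = new_token_ids[new_token_ids != eos_token_id]

print(eos_detected, new_token_ids, new_token_ids.numel())
# True tensor([5301]) 1  -> one real token left to decode
```

When `numel()` comes back 0, the step produced only the eos token and the loop breaks without decoding anything.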
```diff
@@ -86,8 +93,11 @@ def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
             torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
         ], dim=1)
 
-        #
-
+        # Append to assistant response and yield
+        assistant_response += new_text
+        # Remove any trailing special tokens
+        clean_response = assistant_response.replace('<|dlm_pad|>', '').strip()
+        yield clean_response
 
         if eos_detected:
             break
```
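The context lines of the last hunk show the feed-back step: the freshly generated token is appended to `input_ids`, and the attention mask grows by a single 1 before the next iteration. A toy version with made-up ids:

```python
import torch

device = "cpu"                            # stand-in for the app's GPU device
input_ids = torch.tensor([[11, 42, 7]])   # made-up prompt ids
attention_mask = torch.ones_like(input_ids)

next_token = torch.tensor([[99]])         # made-up token sampled this step
input_ids = torch.cat([input_ids, next_token], dim=1)
attention_mask = torch.cat([
    attention_mask,
    torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
], dim=1)

print(input_ids.shape, attention_mask.shape)  # torch.Size([1, 4]) twice
```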
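The added `yield` turns `generate_code` into a generator that re-emits the whole cleaned response on every step, which is the contract streaming UIs (typically a Gradio app, for a Space) expect; only `<|dlm_pad|>` padding is stripped before yielding. A runnable sketch of that pattern with canned step outputs:

```python
def stream_demo():
    # Mirrors the hunk: accumulate, strip <|dlm_pad|>, yield the full text so far
    assistant_response = ""
    for new_text in ["def add", "(a, b):", "<|dlm_pad|>", "\n    return a + b"]:
        assistant_response += new_text
        clean_response = assistant_response.replace('<|dlm_pad|>', '').strip()
        yield clean_response

for partial in stream_demo():
    print(repr(partial))
```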