Spaces:

Priyanka6
/

fine-tuning-inference

Runtime error

Priyanka6 committed on Feb 28

Commit

7d51297

1 Parent(s): e89dcde

Update space

Files changed (1) hide show

app.py CHANGED Viewed

@@ -31,6 +31,9 @@ def respond(message, history, max_tokens, temperature, top_p):
     # Tokenize and generate response
     inputs = tokenizer.apply_chat_template(messages, tokenize=False)
     input_tokens = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
     output_tokens = model.generate(
         **input_tokens,
@@ -40,8 +43,16 @@ def respond(message, history, max_tokens, temperature, top_p):
         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
     )
-    response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
     return response
 # Define Gradio Chat Interface

     # Tokenize and generate response
     inputs = tokenizer.apply_chat_template(messages, tokenize=False)
     input_tokens = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+    # Get the length of input tokens to separate new response
+    input_length = input_tokens.input_ids.shape[1]
     output_tokens = model.generate(
         **input_tokens,
         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
     )
+    # Extract only the new tokens (the model's response)
+    new_tokens = output_tokens[0][input_length:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    # Trim surrounding whitespace and drop a leading "assistant:" role prefix, if present
+    response = response.strip()
+    if response.startswith("assistant:"):
+        response = response[len("assistant:"):].strip()
     return response
 # Define Gradio Chat Interface