import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re

# Model configuration
MAX_REASONING_TOKENS = 4096
MAX_RESPONSE_TOKENS = 2048
MODEL_OPTIONS = ["beyoru/ThinkAgain1.3", "beyoru/ThinkAgain1.4", "beyoru/ThinkAgain1.5"]

# Global state: model, tokenizer, and conversation history
model = None
tokenizer = None
messages = []

# Function to extract text between <think> and </think> tags
def extract_think_content(text):
    match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    return match.group(1).strip() if match else text
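
# Illustrative example (not executed): extract_think_content("<think>plan</think>")
# returns "plan"; input without <think> tags is passed through unchanged.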

# Function to load model and tokenizer
def load_model(model_name):
    global model, tokenizer, messages
    messages = []  # Reset conversation history
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return f"Loaded model: {model_name}"

def generate_response(user_input, history=None):
    global messages
    if history is None:
        history = []
    # Append user message to history
    messages.append({"role": "user", "content": user_input})
    history.append((user_input, None))  # User message in chatbot
    # First pass: generate the reasoning trace
    reasoning_template = tokenizer.apply_chat_template(messages, tokenize=False, add_reasoning_prompt=True)
    reasoning_inputs = tokenizer(reasoning_template, return_tensors="pt").to(model.device)
    reasoning_ids = model.generate(**reasoning_inputs, max_new_tokens=MAX_REASONING_TOKENS)
    # Decode only the newly generated tokens, skipping the prompt
    reasoning_output = tokenizer.decode(reasoning_ids[0, reasoning_inputs.input_ids.shape[1]:], skip_special_tokens=True)
    # Extract content from <think> tags
    extracted_reasoning = extract_think_content(reasoning_output)
    messages.append({"role": "reasoning", "content": extracted_reasoning})
    # Second pass: generate the assistant response
    response_template = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    response_inputs = tokenizer(response_template, return_tensors="pt").to(model.device)
    response_ids = model.generate(**response_inputs, max_new_tokens=MAX_RESPONSE_TOKENS)
    response_output = tokenizer.decode(response_ids[0, response_inputs.input_ids.shape[1]:], skip_special_tokens=True)
    messages.append({"role": "assistant", "content": response_output})
    # Combine reasoning and response in a single chatbot message;
    # the <details> block makes the reasoning toggleable
    combined_response = (
        f"<details><summary>Reasoning</summary>{extracted_reasoning}</details>\n\n"
        f"{response_output}"
    )
    history.append((None, combined_response))  # Assistant message with toggleable reasoning
    return history, ""
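
# Note: the collapsible reasoning section assumes gr.Chatbot renders the raw
# <details> HTML in messages; if Gradio sanitizes the tag, the reasoning will
# appear as plain text instead.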

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# ThinkAgain Chatbot")

    # Model selection
    model_dropdown = gr.Dropdown(choices=MODEL_OPTIONS, label="Select Model", value=MODEL_OPTIONS[0])
    model_load_status = gr.Textbox(label="Model Status", interactive=False)

    # Reload the model (and reset history) when the dropdown changes
    model_dropdown.change(
        fn=load_model,
        inputs=model_dropdown,
        outputs=model_load_status
    )

    chatbot = gr.Chatbot(label="Conversation")
    user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    submit = gr.Button("Send")

    # Clear the input and update the chatbot after submission
    submit.click(
        fn=generate_response,
        inputs=[user_input, chatbot],
        outputs=[chatbot, user_input]
    )
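
    # Optional addition (not in the original): also send the message when the
    # user presses Enter in the textbox, mirroring the Send button.
    user_input.submit(
        fn=generate_response,
        inputs=[user_input, chatbot],
        outputs=[chatbot, user_input]
    )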

# Load default model at startup
load_model(MODEL_OPTIONS[0])

# Launch the app
demo.launch(debug=True, show_api=False)