import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re

# Model configuration
MAX_REASONING_TOKENS = 4096
MAX_RESPONSE_TOKENS = 2048
MODEL_OPTIONS = ["beyoru/ThinkAgain1.3", "beyoru/ThinkAgain1.4", "beyoru/ThinkAgain1.5"]

# Global variables for model and tokenizer
model = None
tokenizer = None
messages = []

# Function to extract text between <think> and </think> tags
def extract_think_content(text):
    match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    return match.group(1).strip() if match else text

# Function to load model and tokenizer
def load_model(model_name):
    global model, tokenizer, messages
    messages = []  # Reset conversation history
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return f"Loaded model: {model_name}"

def generate_response(user_input, history=None, model_name=None):
    global messages
    if history is None:
        history = []

    # Append user message to history
    messages.append({"role": "user", "content": user_input})
    history.append((user_input, None))  # User message in chatbot

    # Generate reasoning
    reasoning_template = tokenizer.apply_chat_template(messages, tokenize=False, add_reasoning_prompt=True)
    reasoning_inputs = tokenizer(reasoning_template, return_tensors="pt").to(model.device)
    reasoning_ids = model.generate(**reasoning_inputs, max_new_tokens=MAX_REASONING_TOKENS)
    reasoning_output = tokenizer.decode(reasoning_ids[0, reasoning_inputs.input_ids.shape[1]:], skip_special_tokens=True)

    # Extract content from <think> tags
    extracted_reasoning = extract_think_content(reasoning_output)
    messages.append({"role": "reasoning", "content": extracted_reasoning})

    # Generate assistant response
    response_template = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    response_inputs = tokenizer(response_template, return_tensors="pt").to(model.device)
    response_ids = model.generate(**response_inputs, max_new_tokens=MAX_RESPONSE_TOKENS)
    response_output = tokenizer.decode(response_ids[0, response_inputs.input_ids.shape[1]:], skip_special_tokens=True)

    messages.append({"role": "assistant", "content": response_output})

    # Combine reasoning and assistant response in a single chatbot message with toggleable reasoning
    combined_response = (
        f"<details><summary>Reasoning</summary>{extracted_reasoning}</details>\n\n"
        f"{response_output}"
    )

    history.append((None, combined_response))  # Assistant message with toggleable reasoning
    return history, ""

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# ThinkAgain Chatbot")

    # Model selection
    model_dropdown = gr.Dropdown(choices=MODEL_OPTIONS, label="Select Model", value=MODEL_OPTIONS[0])
    model_load_status = gr.Textbox(label="Model Status", interactive=False)

    # Load model when dropdown changes
    model_dropdown.change(
        fn=load_model,
        inputs=model_dropdown,
        outputs=model_load_status
    )

    chatbot = gr.Chatbot(label="Conversation")
    user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    submit = gr.Button("Send")

    # Clear input and update chatbot after submission
    submit.click(
        fn=generate_response,
        inputs=[user_input, chatbot, model_dropdown],
        outputs=[chatbot, user_input]
    )

# Load default model
load_model(MODEL_OPTIONS[0])

# Launch the app
demo.launch(debug=True, show_api=False)