import logging
import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# —————————————————————————————————————————
# 1) Inline definition of SmolVLM2ChatHandler (passed directly to Llama below,
#    so no separate registration is needed)
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )


# —————————————————————————————————————————
# 2) Model & CLIP files — download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"


def ensure_models():
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")

    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")


ensure_models()


# —————————————————————————————————————————
# 3) Load the model once at startup
def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm


llm = load_llm()
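# Optional sanity check that the model and chat template respond before wiring
# up the UI (a minimal sketch, kept commented out; the prompt text is
# illustrative only and not part of the app):
#
#   resp = llm.create_chat_completion(
#       messages=[{"role": "user", "content": [{"type": "text", "text": "Say hello."}]}],
#       max_tokens=16,
#   )
#   print(resp["choices"][0]["message"]["content"])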
# —————————————————————————————————————————
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # make a writable copy
    frame = frame.copy()
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # Gradio streams RGB frames, but cv2.imwrite expects BGR, so convert first
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # save frame to a temporary file so it can be passed as a file:// URI;
    # everything that reads the file must stay inside this block, because the
    # file is deleted when the context manager exits
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        success = cv2.imwrite(f.name, frame)
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
            return ""
        logging.debug(f"Frame written to temp file: {f.name}")

        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # build a single prompt string
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # stateless completion call: recreate the chat handler and clear the
        # KV cache so state from the previous frame cannot leak into this one
        # (note: llm._ctx.kv_cache_clear() is a private llama-cpp-python API)
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values

        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],  # SmolVLM end-of-turn token; an empty stop string would halt generation immediately
        )
        logging.debug(f"LLM raw response: {resp}")

    # extract caption
    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption


# —————————————————————————————————————————
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()
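# Quick smoke test of caption_frame without the webcam UI (a minimal sketch,
# kept commented out; "test.jpg" is a hypothetical local image path, so
# substitute your own):
#
#   frame_bgr = cv2.imread("test.jpg")
#   frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)  # match Gradio's RGB frames
#   print(caption_frame(frame_rgb))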