|
import os |
|
import requests |
|
from huggingface_hub import login, hf_hub_url |
|
from datasets import load_dataset |
|
from PIL import Image |
|
from io import BytesIO |
|
import gradio as gr |
|
from transformers import pipeline |
|
|
|
|
|
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment
# variable (os.environ[...] raises KeyError at startup if it is unset).
login(token=os.environ["HF_TOKEN"])
|
|
|
|
|
def resolve_image_url(path):
    """Build the Hub download URL for a file inside the Jize1/GTA dataset repo.

    Args:
        path: Repo-relative file path (e.g. as found in an example's "files").

    Returns:
        str: Fully-qualified URL suitable for an authenticated HTTP GET.
    """
    dataset_repo = "Jize1/GTA"
    return hf_hub_url(repo_type="dataset", repo_id=dataset_repo, filename=path)
|
|
|
|
|
def download_image(url):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Fully-qualified image URL (e.g. from resolve_image_url).

    Returns:
        PIL.Image.Image: the decoded image, converted to RGB mode.

    Raises:
        requests.HTTPError: if the server responds with a non-2xx status.
        requests.Timeout: if the request exceeds the timeout.
    """
    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
    # Bound the request so a stalled download cannot hang the app, and fail
    # fast on auth/404 errors instead of handing PIL a non-image error body.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")
|
|
|
|
|
print("Loading GTA dataset...")
# `token=True` reuses the credentials stored by login() above; it replaces
# the deprecated `use_auth_token` parameter (removed in datasets 3.x).
gta_data = load_dataset("Jize1/GTA", split="train", token=True)
|
|
|
|
|
print("Loading vision models...")
# BLIP captioner backs the simulated "ImageDescription" tool in evaluate_model.
image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# NOTE(review): this pipeline is loaded but never used below, and
# "image-classification" is not an OCR task — confirm the intended model/task
# or drop the load to save startup time and memory.
ocr_pipeline = pipeline("image-classification", model="microsoft/dit-base-finetuned-iiit5k")
|
|
|
def evaluate_model(model_name):
    """Simulate tool use over a small GTA subset and return placeholder metrics.

    Walks up to the first 10 examples of the module-level `gta_data` split,
    downloads each example's first image, and simulates the tool calls found
    in the dialog (only "ImageDescription" runs a real model — the BLIP
    captioner; "OCR" and "CountGivenObject" emit dummy text).

    Args:
        model_name: Hugging Face model identifier. NOTE(review): currently
            unused — no model is actually invoked; the metrics below are
            placeholders, not a real evaluation of this model.

    Returns:
        dict: {"InstAcc", "ToolAcc", "SummAcc"} as percentages rounded to
        two decimals (all zeros if the dataset slice is empty).
    """
    total = 0
    inst_acc = 0
    tool_acc = 0
    summ_acc = 0

    # Cap at 10 examples but don't assume the split has that many rows
    # (select(range(10)) raises IndexError on a shorter split).
    sample_size = min(10, len(gta_data))

    for example in gta_data.select(range(sample_size)):
        dialogs = example["dialogs"]
        gt_answer = example["gt_answer"]

        user_query = dialogs[0]["content"]
        files = example["files"]
        # Keep only dialog turns that actually carry tool calls.
        tool_calls = [d for d in dialogs if d.get("tool_calls")]

        # Only the first referenced file is used as the working image.
        image_path = files[0]["path"]
        image_url = resolve_image_url(image_path)
        image = download_image(image_url)

        result = ""
        for tool_call in tool_calls:
            tool = tool_call["tool_calls"][0]["function"]["name"]
            if tool == "ImageDescription":
                caption = image_captioner(image)[0]["generated_text"]
                result += f"[Caption] {caption}\n"
            elif tool == "OCR":
                result += f"[OCR] dummy OCR result for {image_path}\n"
            elif tool == "CountGivenObject":
                result += "[Count] dummy count result\n"

        # NOTE(review): `result` and `user_query` are built but never scored —
        # the tallies below are placeholders. TODO: compare the simulated tool
        # outputs (and the named model's answers) against `gt_answer`.
        inst_acc += 1
        tool_acc += 1 if tool_calls else 0
        summ_acc += 1 if gt_answer["whitelist"] else 0
        total += 1

    # Guard the empty slice so we never divide by zero.
    if total == 0:
        return {"InstAcc": 0.0, "ToolAcc": 0.0, "SummAcc": 0.0}

    return {
        "InstAcc": round(inst_acc / total * 100, 2),
        "ToolAcc": round(tool_acc / total * 100, 2),
        "SummAcc": round(summ_acc / total * 100, 2),
    }
|
|
|
|
|
def run_evaluation(model_name):
    """Run the simulated GTA evaluation and format the metrics for display.

    Args:
        model_name: Hugging Face model identifier typed into the UI.

    Returns:
        str: multi-line report, one "metric: value%" line per metric.
    """
    metrics = evaluate_model(model_name)
    lines = [f"{metric}: {value}%" for metric, value in metrics.items()]
    return f"Results for {model_name}:\n" + "\n".join(lines)
|
|
|
|
|
# Gradio UI: one textbox in (model name), one textbox out (metrics report).
demo = gr.Interface(

    fn=run_evaluation,

    inputs=gr.Textbox(label="Hugging Face Model Name", placeholder="e.g. Qwen/Qwen2.5-3B"),

    outputs=gr.Textbox(label="GTA Evaluation Metrics"),

    title="GTA LLM Evaluation",

    description="Enter a model name from Hugging Face to simulate tool use and get GTA-style metrics.",

    # NOTE(review): `allow_flagging` is deprecated in Gradio 4.x in favor of
    # `flagging_mode` — confirm the installed Gradio version.
    allow_flagging="never"

)



demo.launch()