# benchmark.py (Updated for Named Datasets)
#

import os
import json
import math
import statistics
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple

from geo_bot import GeoBot
from config import get_data_paths, MODELS_CONFIG, SUCCESS_THRESHOLD_KM, get_model_class
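
# Shape of MODELS_CONFIG assumed by run_benchmark below (a hypothetical sketch inferred
# from the lookups in this file -- config.py is the source of truth, and the entry name
# and values here are placeholders only):
#
# MODELS_CONFIG = {
#     "gpt-4o": {"class": "ChatOpenAI", "model_name": "gpt-4o"},
# }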


class MapGuesserBenchmark:
    """Runs geolocation benchmarks for the configured models on a named dataset."""

    def __init__(self, dataset_name: str = "default", headless: bool = False):
        self.dataset_name = dataset_name
        self.data_paths = get_data_paths(dataset_name)
        self.headless = headless
        self.golden_labels = self.load_golden_labels()
        print(
            f"📊 Loaded {len(self.golden_labels)} samples from dataset '{dataset_name}'"
        )

    def load_golden_labels(self) -> List[Dict]:
        try:
            with open(self.data_paths["golden_labels"], "r") as f:
                return json.load(f).get("samples", [])
        except Exception:
            return []
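
    # Assumed golden-labels file layout (a hypothetical example inferred from the keys
    # this class reads -- "samples", "id", "lat", "lng", and "address"):
    #
    # {
    #   "samples": [
    #     {"id": "sample_001", "lat": 48.8584, "lng": 2.2945, "address": "Paris, France"}
    #   ]
    # }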

    def calculate_distance(
        self, true_coords: Dict, predicted_coords: Optional[Tuple[float, float]]
    ) -> Optional[float]:
        """Great-circle (haversine) distance in kilometres, or None if inputs are unusable."""
        if not predicted_coords or "lat" not in true_coords or "lng" not in true_coords:
            return None
        try:
            true_lat, true_lng = true_coords["lat"], true_coords["lng"]
            pred_lat, pred_lng = predicted_coords
            R = 6371  # mean Earth radius in km
            lat1, lon1, lat2, lon2 = map(
                math.radians, [true_lat, true_lng, pred_lat, pred_lng]
            )
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = (
                math.sin(dlat / 2) ** 2
                + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
            )
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
            return R * c
        except Exception:
            return None
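
    # Sanity check for the haversine above (illustrative, values rounded): one degree of
    # longitude on the equator spans R * pi/180 ~= 111.2 km, so
    # calculate_distance({"lat": 0.0, "lng": 0.0}, (0.0, 1.0)) should return ~111.19.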

    def run_benchmark(
        self,
        models: Optional[List[str]] = None,
        max_samples: Optional[int] = None,
        temperature: float = 0.0,
        **kwargs,
    ) -> Dict:
        """Run every requested model over the dataset samples and return a per-model summary."""
        if not self.golden_labels:
            raise ValueError(
                f"No golden labels available in dataset '{self.dataset_name}'."
            )
        models_to_test = models or list(MODELS_CONFIG.keys())
        num_to_test = (
            min(max_samples, len(self.golden_labels))
            if max_samples is not None
            else len(self.golden_labels)
        )
        test_samples = self.golden_labels[:num_to_test]

        print(f"🚀 Starting benchmark on dataset '{self.dataset_name}':")
        print(f"   Models: {models_to_test}")
        print(f"   Samples: {len(test_samples)}")
        print(f"   Temperature: {temperature}")

        all_results = []
        for model_name in models_to_test:
            print(f"\n🤖 Testing model: {model_name}")
            model_config = MODELS_CONFIG[model_name]
            model_class = get_model_class(model_config["class"])
            model_class_name = model_config["model_name"]
            try:
                with GeoBot(
                    model=model_class,
                    model_name=model_class_name,
                    use_selenium=True,
                    headless=self.headless,
                    temperature=temperature,
                ) as bot:
                    for i, sample in enumerate(test_samples):
                        print(
                            "########################################################"
                        )
                        print(f"📍 Sample {i + 1}/{len(test_samples)}")
                        try:
                            result = self.run_single_test_with_bot(bot, sample)
                            all_results.append(result)
                            status = (
                                "✅ Success" if result.get("success") else "❌ Failed"
                            )
                            distance = result.get("distance_km")
                            dist_str = (
                                f"{distance:.1f} km" if distance is not None else "N/A"
                            )
                            print(f"{status} (Distance: {dist_str})")
                        except KeyboardInterrupt:
                            raise
                        except Exception as e:
                            print(f"  ❌ Test failed with unhandled exception: {e}")
                            all_results.append(
                                {
                                    "model": model_name,
                                    "sample_id": sample["id"],
                                    "success": False,
                                    "error": str(e),
                                }
                            )
            except KeyboardInterrupt:
                print("\n⏹️ Benchmark interrupted by user.")
                break

        self.save_results(all_results)
        return self.generate_summary(all_results)

    def run_single_test_with_bot(self, bot: GeoBot, location_data: Dict) -> Dict:
        """Load one golden-label location into the bot, get a prediction, and score it."""
        start_time = time.time()
        assert bot.controller is not None
        if not bot.controller.load_location_from_data(location_data):
            return {
                "success": False,
                "error": "Failed to load location",
                "model": bot.model_name,
                "sample_id": location_data["id"],
            }
        bot.controller.setup_clean_environment()
        screenshot = bot.take_screenshot()
        if not screenshot:
            return {
                "success": False,
                "error": "Failed to take screenshot",
                "model": bot.model_name,
                "sample_id": location_data["id"],
            }
        predicted_lat_lon = bot.analyze_image(screenshot)
        inference_time = time.time() - start_time

        true_coords = {"lat": location_data.get("lat"), "lng": location_data.get("lng")}
        true_location = location_data["address"]
        print(f"📍 True location: {true_location}")
        print(f"📍 True coords: {true_coords}")
        print(f"🎯 Predicted coords: {predicted_lat_lon}")

        distance_km = self.calculate_distance(true_coords, predicted_lat_lon)
        is_success = distance_km is not None and distance_km <= SUCCESS_THRESHOLD_KM
        return {
            "sample_id": location_data["id"],
            "model": bot.model_name,
            "true_coordinates": true_coords,
            "predicted_coordinates": predicted_lat_lon,
            "distance_km": distance_km,
            "inference_time": inference_time,
            "success": is_success,
        }

    def save_results(self, results: List[Dict]):
        if not results:
            return
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            results_dir = Path(self.data_paths["results"])
            results_dir.mkdir(parents=True, exist_ok=True)
            results_file = results_dir / f"benchmark_results_{timestamp}.json"
            output_data = {
                "metadata": {
                    "dataset_name": self.dataset_name,
                    "timestamp": datetime.now().isoformat(),
                },
                "results": results,
            }
            with open(results_file, "w") as f:
                json.dump(output_data, f, indent=2, default=str)
            print(f"💾 Results saved to {results_file}")
        except Exception as e:
            print(f"❌ Error saving results: {e}")

    def generate_summary(self, results: List[Dict]) -> Dict:
        """Aggregate per-sample results into per-model success-rate and distance statistics."""
        summary = {}
        by_model: Dict[str, List[Dict]] = {}
        for r in results:
            by_model.setdefault(r.get("model", "unknown"), []).append(r)
        for model, model_results in by_model.items():
            successful_runs = [r for r in model_results if r.get("success")]
            distances = [
                r["distance_km"]
                for r in model_results
                if r.get("distance_km") is not None
            ]
            summary[model] = {
                "success_rate": len(successful_runs) / len(model_results),
                "average_distance_km": sum(distances) / len(distances)
                if distances
                else None,
                "median_distance_km": statistics.median(distances)
                if distances
                else None,
                "min_distance_km": min(distances) if distances else None,
                "max_distance_km": max(distances) if distances else None,
            }
        return summary
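

if __name__ == "__main__":
    # Minimal usage sketch (an illustrative addition, not a CLI the module is known to
    # ship with). The dataset name, sample count, and headless setting are assumptions;
    # adjust them to whatever config.py actually defines.
    benchmark = MapGuesserBenchmark(dataset_name="default", headless=True)
    summary = benchmark.run_benchmark(max_samples=5, temperature=0.0)
    print(json.dumps(summary, indent=2, default=str))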