import re

from langchain_openai import ChatOpenAI

from .agent import BaseAgent


SYSTEM_PROMPT = "You are an expert evaluator. Your task is to assess how well a Web Agent’s generated checklist aligns with the reference checklist for a given user instruction."

USER_PROMPT = """# Task Description
Use the provided task description, evaluation criteria, and both checklists to assign a score from 1 to 5. Justify your rating with a brief explanation that considers both content overlap and logical structure.

## Score Criteria
- 5: Checklist covers all subgoals, is correct and clearly expressed
- 4: Minor omissions or phrasing issues but mostly accurate and complete
- 3: Partially matches, but with noticeable gaps or errors
- 2: Incomplete or includes incorrect steps
- 1: Mostly irrelevant, incorrect, or missing the task goal

## User Instruction:
{intent}

## Reference Checklist:
{gt_checklist}

## Agent’s Generated Checklist:
{generated_checklist}

# Output Format
Your response should be in the following format:
REASON: [Write 2–4 sentences explaining how well the generated checklist matches the reference. Mention specific matches, omissions, errors, or strengths.]
SCORE: [1–5]
"""


class ChecklistEvalAgent(BaseAgent):
    def __init__(self, agent_config: dict):
        super().__init__(agent_config)
        self._setup()

    def prepare_message(self, model_input: dict, prompt_type=None):
        # prompt_type is unused by this agent; it defaults to None so
        # generate_response can call prepare_message without it.
        message = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": USER_PROMPT.format(
                    intent=model_input["intent"],
                    gt_checklist=model_input["gt_checklist"],
                    generated_checklist=model_input["generated_checklist"]
                )
            }
        ]
        return message

    def generate_response(self, model_input: dict):
        total_cost = 0
        response_list = []

        message = self.prepare_message(model_input)

        # Sample multiple judgments so the parsed scores can be averaged later.
        for _ in range(self.num_generate):
            response, cost = self.generate_with_retry(message, ["SCORE"])
            response_list.append(response)
            total_cost += cost

        return response_list, total_cost


def parsing_score(response: str):
    # Take the text after the last "SCORE:" marker and extract the first integer.
    score = response.split("SCORE:")[-1].split("\n")[0].strip()
    match = re.search(r'\d+', score)

    if match:
        return int(match.group())
    else:
        return None


def average_score(scores: list[int]):
    if len(scores) == 0:
        return 0
    return sum(scores) / len(scores)


def get_score(results: list[dict]):
    score_list = []
    for result in results:
        tmp_scores = [parsing_score(response) for response in result["response"]]
        scores = [score for score in tmp_scores if score is not None]
        result["score_list"] = scores
        final_score = average_score(scores)
        result["score"] = final_score
        score_list.append(final_score)

    return results, score_list
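

# Minimal usage sketch (not part of the module): the agent_config keys
# ("model", "num_generate") and the model_input values below are assumptions
# for illustration only; the real config depends on what BaseAgent expects.
if __name__ == "__main__":
    agent = ChecklistEvalAgent({"model": "gpt-4o", "num_generate": 3})

    model_input = {
        "intent": "Find the cheapest direct flight from SFO to JFK next Friday.",
        "gt_checklist": (
            "1. Search flights from SFO to JFK for next Friday\n"
            "2. Filter to direct flights only\n"
            "3. Sort by price and select the cheapest option"
        ),
        "generated_checklist": (
            "1. Open the flight search page\n"
            "2. Search SFO to JFK\n"
            "3. Choose the cheapest flight"
        ),
    }

    responses, cost = agent.generate_response(model_input)
    results, score_list = get_score([{"response": responses}])
    print(f"scores={score_list}, total cost={cost}")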