Spaces:

hyungjoochae
/

Web-Shepherd-Demo

Running

Web-Shepherd-Demo / agent /reward_postprocessor.py

update (#2)

1650939 verified about 1 month ago

1.02 kB

	import numpy as np
	import re


	def extract_judge_hash(response):
	    """Encode the judge's per-checklist verdicts as a compact string.

	    The text between the first ``<answer>`` tag and the following
	    ``</answer>`` tag of ``response['response']`` is lower-cased, stripped
	    of spaces, and scanned for the verdict markers ``:yes``,
	    ``:inprogress`` and ``:no``. Each marker becomes one character
	    (``y``, ``i`` or ``n``), in order of appearance.

	    Args:
	        response: Mapping with a ``'response'`` key holding the judge's
	            raw text output.

	    Returns:
	        A string such as ``'yin'`` (empty when no verdict marker is
	        found), or ``None`` when the answer section cannot be extracted;
	        in that case the traceback is printed.

	    Raises:
	        KeyError: If ``response`` has no ``'response'`` key.
	    """
	    content = response['response']

	    try:
	        # Spaces are removed so that ": In Progress" normalizes to ":inprogress".
	        normalized = content.lower().replace(' ', '')
	        judge_content = normalized.split('<answer>')[1].split('</answer>')[0]
	    except (AttributeError, IndexError, TypeError):
	        # Non-string content or a missing <answer> tag: keep the original
	        # best-effort behavior (report and return None) without swallowing
	        # KeyboardInterrupt / SystemExit like a bare ``except`` would.
	        import traceback
	        traceback.print_exc()
	        return None

	    # The pipes must stay unescaped: "\|" matches a literal "|" character,
	    # which would prevent any individual verdict from ever being found.
	    pattern = r":yes|:inprogress|:no"
	    verdict_codes = {':yes': 'y', ':inprogress': 'i', ':no': 'n'}
	    return ''.join(verdict_codes[match] for match in re.findall(pattern, judge_content))

	def average_logits(response):
	    """Turn the sampled judge scores into a single scalar reward.

	    For every entry of ``response['judge_probs']`` the ``'yes'`` and
	    ``'in'`` values are read; any other key is ignored. The reward is the
	    mean ``'yes'`` value plus half of the mean ``'in'`` value.

	    Args:
	        response: Mapping with a ``'judge_probs'`` key holding an iterable
	            of mappings that each contain ``'yes'`` and ``'in'`` numbers
	            (presumably per-sample probabilities -- confirm with the
	            caller).

	    Returns:
	        ``mean(yes) + 0.5 * mean(in)`` as computed by ``np.mean``.
	    """
	    samples = response['judge_probs']

	    def mean_of(label):
	        # Average one verdict label across all judge samples.
	        return np.mean([sample[label] for sample in samples])

	    yes_mean = mean_of('yes')
	    in_progress_mean = mean_of('in')

	    # An "in progress" verdict earns half the credit of a "yes".
	    return yes_mean + 0.5 * in_progress_mean


	# Registry of reward post-processors, keyed by processor name.
	REWARD_PROCESSORS = {
	    'avg_logits': average_logits,
	}

	# Sample count associated with each reward processor name
	# (presumably the number of judge samples to draw -- confirm with callers).
	REWARD_PROCESSOR_N_SAMPLES = {
	    'avg_logits': 5,
	}