|
import re
import traceback

import numpy as np
|
|
|
|
|
def extract_judge_hash(response):
    """Encode the per-checklist verdicts of a judge response as a short string.

    The text of ``response['response']`` is lower-cased and stripped of ASCII
    spaces (so ``": In Progress"`` becomes ``":inprogress"``). The part after
    the first ``<answer>`` tag, up to the next ``</answer>`` tag (or the end
    of the text when no closing tag follows), is then scanned for the markers
    ``:yes``, ``:inprogress`` and ``:no``. Each marker found contributes one
    character, in order of appearance: ``'y'``, ``'i'`` or ``'n'``.

    Args:
        response: Mapping whose ``'response'`` entry holds the judge's raw
            text output.

    Returns:
        The verdict string (empty when the answer section contains no
        marker), or ``None`` when the text has no ``<answer>`` tag or the
        entry is not a string.

    Raises:
        KeyError: If ``response`` is a dict without a ``'response'`` key;
            that lookup happens outside the guarded section.
    """
    content = response['response']

    try:
        normalized = content.lower().replace(' ', '')
        judge_content = normalized.split('<answer>')[1].split('</answer>')[0]
    except (AttributeError, IndexError, TypeError):
        # IndexError: no '<answer>' tag in the text.
        # AttributeError / TypeError: content is not a str (e.g. None, bytes).
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        traceback.print_exc()
        return None

    verdict_codes = {':yes': 'y', ':inprogress': 'i', ':no': 'n'}
    pattern = r":yes|:inprogress|:no"
    return ''.join(verdict_codes[marker] for marker in re.findall(pattern, judge_content))
|
|
|
def average_logits(response, *, in_weight=0.5):
    """Turn the judge's per-sample verdict values into a single scalar reward.

    The ``'yes'`` and ``'in'`` values of every entry in
    ``response['judge_probs']`` are averaged separately with ``numpy.mean``
    and combined as ``mean(yes) + in_weight * mean(in)``. Any other key of an
    entry (for example ``'no'``) is not read.

    Args:
        response: Mapping whose ``'judge_probs'`` entry is an iterable of
            mappings, each providing numeric ``'yes'`` and ``'in'`` values.
        in_weight: Partial credit given to the averaged ``'in'`` value.
            Defaults to ``0.5``, the weight this function has always used.

    Returns:
        The reward as returned by NumPy arithmetic (a NumPy floating scalar
        for plain numeric inputs). With an empty ``judge_probs`` list
        ``numpy.mean`` yields ``nan`` and emits a ``RuntimeWarning``.
    """
    judge_probs = response['judge_probs']

    mean_yes = np.mean([sample['yes'] for sample in judge_probs])
    mean_in = np.mean([sample['in'] for sample in judge_probs])

    return mean_yes + in_weight * mean_in
|
|
|
|
|
# Registry of reward processors: maps a processor name to the callable that
# computes the reward for one judged response.
REWARD_PROCESSORS = {
    "avg_logits": average_logits,
}
|
|
|
# Integer setting per reward processor, keyed by the same names as
# REWARD_PROCESSORS.
# NOTE(review): presumably the number of judge samples drawn per response for
# that processor -- confirm against the caller.
REWARD_PROCESSOR_N_SAMPLES = {
    "avg_logits": 5,
}