Spaces:
Sleeping
Sleeping
tangxuemei
commited on
src/backend/__pycache__/evaluate_model.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/evaluate_model.cpython-310.pyc and b/src/backend/__pycache__/evaluate_model.cpython-310.pyc differ
|
|
src/backend/__pycache__/model_operations.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/model_operations.cpython-310.pyc and b/src/backend/__pycache__/model_operations.cpython-310.pyc differ
|
|
src/backend/evaluate_model.py
CHANGED
@@ -86,7 +86,7 @@ class Evaluator:
|
|
86 |
# avg_summary_len = self.summary_generator.avg_length
|
87 |
# answer_rate = self.summary_generator.answer_rate
|
88 |
'''开始评估模型的结果'''
|
89 |
-
self.humanlike = self.eval_model.evaluate_humanlike(self.generated_summaries_df, envs.HUMAN_DATA)
|
90 |
'''原始指标'''
|
91 |
# self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
|
92 |
# self.generated_summaries_df)
|
|
|
86 |
# avg_summary_len = self.summary_generator.avg_length
|
87 |
# answer_rate = self.summary_generator.answer_rate
|
88 |
'''开始评估模型的结果'''
|
89 |
+
self.humanlike = self.eval_model.evaluate_humanlike(self.generated_summaries_df, envs.HUMAN_DATA, f"generation_results/{self.model}.csv")
|
90 |
'''原始指标'''
|
91 |
# self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
|
92 |
# self.generated_summaries_df)
|
src/backend/model_operations.py
CHANGED
@@ -33,7 +33,7 @@ logging.basicConfig(level=logging.INFO,
|
|
33 |
|
34 |
# Load spacy model for word tokenization
|
35 |
nlp = spacy.load("en_core_web_sm")
|
36 |
-
|
37 |
os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
|
38 |
os.environ["OPENAI_API_KEY"] = "sk-None-tanhMyavhUtpX2G1kmPuT3BlbkFJGEhM5jmyGyhrTd3LdHDI"
|
39 |
|
@@ -46,7 +46,8 @@ def load_evaluation_model(model_path):
|
|
46 |
Returns:
|
47 |
CrossEncoder: The evaluation model
|
48 |
"""
|
49 |
-
model = CrossEncoder(model_path)
|
|
|
50 |
return model
|
51 |
|
52 |
|
@@ -121,10 +122,13 @@ class SummaryGenerator:
|
|
121 |
print(f"Total: {len(sheet_names)}")
|
122 |
print(sheet_names)
|
123 |
|
124 |
-
|
125 |
|
126 |
-
for i, sheet_name in enumerate(sheet_names
|
127 |
# 读取每个工作表
|
|
|
|
|
|
|
128 |
df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
|
129 |
|
130 |
# 假设第一列是'Prompt0',但这里我们使用列名来避免硬编码
|
@@ -132,18 +136,37 @@ class SummaryGenerator:
|
|
132 |
prompt_column = df_sheet['Prompt0']
|
133 |
else:
|
134 |
# 如果'Prompt0'列不存在,则跳过该工作表或进行其他处理
|
135 |
-
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
# 遍历Prompt0列的值
|
138 |
-
for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=
|
139 |
ID = 'E' + str(i)
|
140 |
-
q_ID = ID + '_' + str(j)
|
141 |
|
142 |
-
# print(ID, q_ID, prompt_value)
|
143 |
-
|
144 |
-
|
|
|
145 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
146 |
-
_user_prompt = prompt_value
|
147 |
while True:
|
148 |
try:
|
149 |
'''调用'''
|
@@ -171,19 +194,58 @@ class SummaryGenerator:
|
|
171 |
_response = ""
|
172 |
exceptions.append(i)
|
173 |
break
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
# exit()
|
181 |
|
182 |
# Sleep to prevent hitting rate limits too frequently
|
183 |
time.sleep(1)
|
184 |
|
185 |
-
self.summaries_df = pd.DataFrame(list(zip(
|
186 |
-
columns=["Experiment", "Question_ID", "User_prompt", "Response"])
|
187 |
|
188 |
if save_path is not None:
|
189 |
print(f'Save summaries to {save_path}')
|
@@ -419,85 +481,486 @@ class EvaluationModel:
|
|
419 |
def code_results(self, summaries_df):
|
420 |
'''code results from LLM's response'''
|
421 |
output = []
|
422 |
-
'''
|
423 |
-
|
424 |
-
|
425 |
-
''
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
|
431 |
male_keyword = ["he", "his", "himself"]
|
432 |
female_keyword = ["she", "her", "herself"]
|
433 |
-
|
|
|
434 |
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
if summaries_df["Experiment"][i] == "E1":
|
436 |
-
|
|
|
|
|
437 |
# vote_1_1 += 1
|
438 |
output.append("Round")
|
439 |
-
elif
|
440 |
-
output.append("
|
441 |
else:
|
442 |
-
output.append("
|
443 |
-
|
444 |
|
445 |
-
|
446 |
-
# vote_2_1, vote_2_2, vote_2_3 = 0, 0, 0
|
447 |
|
448 |
-
|
449 |
-
rs = summaries_df["Response"][i].strip()
|
450 |
rs = rs.split(' ')
|
|
|
451 |
male, female = 0, 0
|
452 |
for word in rs:
|
453 |
-
if word in female_keyword and male
|
454 |
female = 1
|
455 |
output.append("Female")
|
456 |
break
|
457 |
-
if word in male_keyword and female
|
458 |
male = 1
|
459 |
output.append("Male")
|
460 |
break
|
461 |
if male == 0 and female == 0 :
|
462 |
-
output.append("
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
rs = summaries_df["Response"][i].strip()
|
467 |
-
|
468 |
-
if
|
469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
470 |
|
471 |
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
rs
|
479 |
-
if rs == "No":
|
480 |
output.append("0")
|
481 |
-
elif rs == "
|
482 |
output.append("1")
|
483 |
else:
|
484 |
-
output.append("
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
489 |
output.append("1")
|
490 |
else:
|
491 |
output.append("0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
492 |
|
493 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
494 |
|
495 |
-
|
496 |
-
|
|
|
|
|
|
|
|
|
497 |
male, female = 0, 0
|
498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
499 |
if "because" in rs:
|
500 |
-
rs = rs.split("because")[1]
|
501 |
else:
|
502 |
rs = rs
|
503 |
condition = summaries_df["Factor 2"][i].strip()
|
@@ -507,9 +970,11 @@ class EvaluationModel:
|
|
507 |
male = 1
|
508 |
break
|
509 |
if w in female_keyword and male != 1:
|
|
|
510 |
break
|
|
|
511 |
if male == 0 and female == 0:
|
512 |
-
output.append('
|
513 |
else:
|
514 |
if male == 1 and female==0:
|
515 |
if condition == "MF":
|
@@ -517,36 +982,70 @@ class EvaluationModel:
|
|
517 |
elif condition == "FM":
|
518 |
output.append("Object")
|
519 |
else:
|
520 |
-
output.append("
|
521 |
elif female == 1 and male ==0:
|
522 |
if condition == "MF":
|
523 |
output.append("Object")
|
524 |
elif condition == "FM":
|
525 |
output.append("Subject")
|
526 |
else:
|
527 |
-
output.append("
|
528 |
|
529 |
-
|
530 |
-
|
531 |
-
rs = summaries_df["Response"][i].strip()
|
532 |
-
|
|
|
533 |
output.append("1")
|
534 |
else:
|
535 |
-
output.append("0")
|
536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
537 |
|
538 |
|
539 |
|
540 |
|
541 |
'''是不是有不同的问题,如何计算'''
|
542 |
-
def evaluate_humanlike(self, summaries_df, human_data_path):
|
543 |
'''
|
544 |
evaluate humanlike score
|
545 |
1. code the result
|
546 |
2. comput the similaritirs between human and model
|
547 |
process model responses'''
|
548 |
-
|
549 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
550 |
return 9.00
|
551 |
|
552 |
|
|
|
33 |
|
34 |
# Load spacy model for word tokenization
|
35 |
nlp = spacy.load("en_core_web_sm")
|
36 |
+
nlp1 = spacy.load("en_core_web_trf")
|
37 |
os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
|
38 |
os.environ["OPENAI_API_KEY"] = "sk-None-tanhMyavhUtpX2G1kmPuT3BlbkFJGEhM5jmyGyhrTd3LdHDI"
|
39 |
|
|
|
46 |
Returns:
|
47 |
CrossEncoder: The evaluation model
|
48 |
"""
|
49 |
+
# model = CrossEncoder(model_path)
|
50 |
+
model = ""
|
51 |
return model
|
52 |
|
53 |
|
|
|
122 |
print(f"Total: {len(sheet_names)}")
|
123 |
print(sheet_names)
|
124 |
|
125 |
+
Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1 = [], [], [], [], [] ,[], [], []
|
126 |
|
127 |
+
for i, sheet_name in enumerate(sheet_names, start=1):
|
128 |
# 读取每个工作表
|
129 |
+
# if i > 2 and i ==1:
|
130 |
+
# continue
|
131 |
+
print(i, sheet_name)
|
132 |
df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
|
133 |
|
134 |
# 假设第一列是'Prompt0',但这里我们使用列名来避免硬编码
|
|
|
136 |
prompt_column = df_sheet['Prompt0']
|
137 |
else:
|
138 |
# 如果'Prompt0'列不存在,则跳过该工作表或进行其他处理
|
139 |
+
continue
|
140 |
+
if i == 3 :
|
141 |
+
word1_list = df_sheet['Stimuli-2']
|
142 |
+
word2_list = df_sheet['Stimuli-3']
|
143 |
+
V2_column = []
|
144 |
+
for jj in range(len(word1_list)):
|
145 |
+
V2_column.append(word1_list[jj] + '_' + word2_list[jj])
|
146 |
+
# print(V2_column)
|
147 |
+
elif i == 9:
|
148 |
+
V2_column = df_sheet['V2'] #SL, LS
|
149 |
+
elif i == 4 or i == 6 :
|
150 |
+
V2_column = df_sheet['Stimuli-2'] #Stimuli-2
|
151 |
+
else:
|
152 |
+
V2_column = [""] * len(prompt_column)
|
153 |
+
q_column = df_sheet["ID"]
|
154 |
+
Item_column = df_sheet["Item"]
|
155 |
+
Condition_column = df_sheet["Condition"]
|
156 |
+
Stimuli_1_column = df_sheet["Stimuli-1"]
|
157 |
+
if 'Stimuli-2' in df_sheet.columns:
|
158 |
+
Stimuli_2_column = df_sheet["Stimuli-2"]
|
159 |
|
160 |
# 遍历Prompt0列的值
|
161 |
+
for j, prompt_value in enumerate(tqdm(prompt_column[0:2], desc=f"Processing {sheet_name}"), start=0):
|
162 |
ID = 'E' + str(i)
|
163 |
+
# q_ID = ID + '_' + str(j)
|
164 |
|
165 |
+
# print(ID, q_ID, prompt_value)
|
166 |
+
system_prompt = envs.SYSTEM_PROMPT
|
167 |
+
_user_prompt = prompt_value
|
168 |
+
for ii in range(2):
|
169 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
|
|
170 |
while True:
|
171 |
try:
|
172 |
'''调用'''
|
|
|
194 |
_response = ""
|
195 |
exceptions.append(i)
|
196 |
break
|
197 |
+
if i == 5:
|
198 |
+
print(_response)
|
199 |
+
|
200 |
+
_response1, _response2 = _response.split('\n\n')
|
201 |
+
Experiment_ID.append(ID)
|
202 |
+
Questions_ID.append(q_column[j])
|
203 |
+
User_prompt.append(_user_prompt)
|
204 |
+
|
205 |
+
Response.append(_response2)
|
206 |
+
|
207 |
+
Factor_2.append(V2_column[j])
|
208 |
+
Stimuli_1.append(Stimuli_2_column[j])
|
209 |
+
Item_ID.append(Item_column[j])
|
210 |
+
Condition.append(Condition_column[j])
|
211 |
+
|
212 |
+
# the first sentence in the response is saved as E51
|
213 |
+
Experiment_ID.append(ID + '1')
|
214 |
+
Questions_ID.append(str(q_column[j]) + '1')
|
215 |
+
User_prompt.append(_user_prompt)
|
216 |
+
Response.append(_response1)
|
217 |
+
|
218 |
+
|
219 |
+
|
220 |
+
Factor_2.append(V2_column[j])
|
221 |
+
Stimuli_1.append(Stimuli_1_column[j])
|
222 |
+
Item_ID.append(Item_column[j])
|
223 |
+
Condition.append(Condition_column[j])
|
224 |
+
|
225 |
+
else:
|
226 |
+
Experiment_ID.append(ID)
|
227 |
+
Questions_ID.append(q_column[j])
|
228 |
+
User_prompt.append(_user_prompt)
|
229 |
+
|
230 |
+
Response.append(_response)
|
231 |
+
if i == 6:
|
232 |
+
Factor_2.append(Condition_column[j])
|
233 |
+
Stimuli_1.append(V2_column[j])
|
234 |
+
else:
|
235 |
+
Factor_2.append(V2_column[j])
|
236 |
+
Stimuli_1.append(Stimuli_1_column[j])
|
237 |
+
Item_ID.append(Item_column[j])
|
238 |
+
Condition.append(Condition_column[j])
|
239 |
+
print(_response)
|
240 |
+
|
241 |
+
|
242 |
# exit()
|
243 |
|
244 |
# Sleep to prevent hitting rate limits too frequently
|
245 |
time.sleep(1)
|
246 |
|
247 |
+
self.summaries_df = pd.DataFrame(list(zip(Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1)),
|
248 |
+
columns=["Experiment", "Question_ID", "Item", "Condition", "User_prompt", "Response","Factor 2","Stimuli 1"])
|
249 |
|
250 |
if save_path is not None:
|
251 |
print(f'Save summaries to {save_path}')
|
|
|
481 |
def code_results(self, summaries_df):
|
482 |
'''code results from LLM's response'''
|
483 |
output = []
|
484 |
+
'''database for Exp4'''
|
485 |
+
item4 = pd.read_csv(envs.ITEM_4_DATA)
|
486 |
+
wordpair2code = {}
|
487 |
+
for j in range(len(item4['Coding'])):
|
488 |
+
wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
|
489 |
+
'''verb for Exp5'''
|
490 |
+
item5 = pd.read_csv(envs.ITEM_5_DATA)
|
491 |
+
# item corresponding to verb, same item id corresponding to verb pair
|
492 |
+
item2verb2 = {}
|
493 |
+
item2verb1 = {}
|
494 |
+
|
495 |
+
Stimuli1, Stimuli2 = {}, {}
|
496 |
+
for j in range(len(item5['Item'])):
|
497 |
+
item2verb1[item5['Item'][j]] = item5['Verb1'][j]
|
498 |
+
item2verb2[item5['Item'][j]] = item5['Verb2'][j]
|
499 |
+
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
500 |
+
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
501 |
+
|
502 |
|
503 |
male_keyword = ["he", "his", "himself"]
|
504 |
female_keyword = ["she", "her", "herself"]
|
505 |
+
print(len(summaries_df["Experiment"]))
|
506 |
+
for i in range(len(summaries_df["Experiment"])):
|
507 |
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
508 |
+
# print()
|
509 |
+
if pd.isna(summaries_df["Response"][i]):
|
510 |
+
output.append("Other")
|
511 |
+
continue
|
512 |
+
rs = summaries_df["Response"][i].strip().lower()
|
513 |
+
'''Exp1'''
|
514 |
if summaries_df["Experiment"][i] == "E1":
|
515 |
+
print("E1", rs)
|
516 |
+
rs = rs.replace('"','')
|
517 |
+
if rs == "round":
|
518 |
# vote_1_1 += 1
|
519 |
output.append("Round")
|
520 |
+
elif rs == "spiky":
|
521 |
+
output.append("Spiky")
|
522 |
else:
|
523 |
+
output.append("Other")
|
524 |
+
|
525 |
|
526 |
+
'''Exp2'''
|
|
|
527 |
|
528 |
+
elif summaries_df["Experiment"][i] == "E2":
|
529 |
+
# rs = summaries_df["Response"][i].strip()
|
530 |
rs = rs.split(' ')
|
531 |
+
print("E2", rs)
|
532 |
male, female = 0, 0
|
533 |
for word in rs:
|
534 |
+
if word in female_keyword and male == 0:
|
535 |
female = 1
|
536 |
output.append("Female")
|
537 |
break
|
538 |
+
if word in male_keyword and female == 0:
|
539 |
male = 1
|
540 |
output.append("Male")
|
541 |
break
|
542 |
if male == 0 and female == 0 :
|
543 |
+
output.append("Other")
|
544 |
+
|
545 |
+
'''Exp3'''
|
546 |
+
elif summaries_df["Experiment"][i] == "E3":
|
547 |
+
# rs = summaries_df["Response"][i].strip()
|
548 |
+
print("E3", rs)
|
549 |
+
if pd.isna(summaries_df["Factor 2"][i]):
|
550 |
+
output.append("Other")
|
551 |
+
else:
|
552 |
+
if summaries_df["Factor 2"][i].strip() == "LS":
|
553 |
+
if "2" in rs:
|
554 |
+
output.append("Long")
|
555 |
+
elif "3" in rs:
|
556 |
+
output.append("Short")
|
557 |
+
else:
|
558 |
+
output.append("Other")
|
559 |
+
if summaries_df["Factor 2"][i].strip() == "SL":
|
560 |
+
if "2" in rs:
|
561 |
+
output.append("Short")
|
562 |
+
elif "3" in rs:
|
563 |
+
output.append("Long")
|
564 |
+
else:
|
565 |
+
output.append("Other")
|
566 |
+
'''Exp4'''
|
567 |
+
|
568 |
+
elif summaries_df["Experiment"][i] == "E4":
|
569 |
+
# rs = summaries_df["Response"][i].strip()
|
570 |
+
target = summaries_df["Factor 2"][i].strip().lower()
|
571 |
+
pair = target + "_" + rs
|
572 |
+
print("E4:", pair)
|
573 |
+
if pair in wordpair2code.keys():
|
574 |
+
output.append(wordpair2code[pair])
|
575 |
+
else:
|
576 |
+
output.append("Other")
|
577 |
+
|
578 |
+
'''Exp5'''
|
579 |
+
elif summaries_df["Experiment"][i] == "E5" or summaries_df["Experiment"][i] == "E51":
|
580 |
+
# sentence = summaries_df["Response"][i].strip()
|
581 |
+
item_id = summaries_df["Item"][i]
|
582 |
+
question_id = summaries_df["Question_ID"][i]
|
583 |
+
|
584 |
+
sti1, sti2 = "", ""
|
585 |
+
|
586 |
+
if summaries_df["Experiment"][i] == "E51":
|
587 |
+
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
588 |
+
sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
589 |
+
verb = item2verb1[item_id].lower()
|
590 |
+
|
591 |
+
sentence = sti1 + " " + rs.replace(sti1, "")
|
592 |
+
print("E5", verb, sentence)
|
593 |
+
if summaries_df["Experiment"][i] == "E5":
|
594 |
+
sti1 = Stimuli1[question_id].lower().replace("...", "")
|
595 |
+
# print(sti1)
|
596 |
+
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
597 |
+
|
598 |
+
verb = item2verb2[item_id].lower()
|
599 |
+
sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
|
600 |
+
print("E5", verb, sentence)
|
601 |
+
|
602 |
+
|
603 |
+
doc = nlp1(sentence.replace(" "," "))
|
604 |
+
# print(doc)
|
605 |
+
# print()
|
606 |
+
verb_token = None
|
607 |
+
for token in doc:
|
608 |
+
# print(token.lemma_)
|
609 |
+
if token.lemma_ == verb:
|
610 |
+
verb_token = token
|
611 |
+
break
|
612 |
+
# exit()
|
613 |
+
if verb_token is None:
|
614 |
+
output.append("Other")
|
615 |
+
print("E5 The target verb is missing from the sentence.")
|
616 |
+
else:
|
617 |
+
pobj, dative = None, None
|
618 |
+
# print(verb_token.children)
|
619 |
+
# exit()
|
620 |
+
for child in verb_token.children:
|
621 |
+
print(child)
|
622 |
+
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
623 |
+
pobj = child.text
|
624 |
+
if child.dep_ == 'dative':
|
625 |
+
dative = child.text
|
626 |
+
print("E5", pobj, dative)
|
627 |
+
# exit()
|
628 |
+
|
629 |
+
if pobj:
|
630 |
+
output.append("PO")
|
631 |
+
elif dative:
|
632 |
+
output.append("DO")
|
633 |
+
else:
|
634 |
+
print("Other", sentence, pobj, dative)
|
635 |
+
# exit()
|
636 |
+
output.append("Other")
|
637 |
+
|
638 |
+
'''Exp6'''
|
639 |
+
|
640 |
+
elif summaries_df["Experiment"][i] == "E6":
|
641 |
+
sentence = summaries_df["Stimuli 1"][i].strip().lower()
|
642 |
+
print("E6", sentence)
|
643 |
+
doc = nlp1(sentence)
|
644 |
+
subject = "None"
|
645 |
+
obj = "None"
|
646 |
+
# 遍历依存关系,寻找主语和宾语
|
647 |
+
for token in doc:
|
648 |
+
if token.dep_ == "nsubj":
|
649 |
+
subject = token.text
|
650 |
+
elif token.dep_ == "dobj":
|
651 |
+
obj = token.text
|
652 |
+
print("E6", subject, obj)
|
653 |
+
if subject in rs and obj in rs:
|
654 |
+
print(rs, subject, obj, "Other")
|
655 |
+
output.append("Other")
|
656 |
+
elif subject in rs:
|
657 |
+
print(rs, subject, obj, "VP")
|
658 |
+
output.append("VP")
|
659 |
+
elif obj in rs:
|
660 |
+
print(rs, subject, obj, "NP")
|
661 |
+
output.append("NP")
|
662 |
+
else:
|
663 |
+
print(rs, subject, obj, "Other")
|
664 |
+
output.append("Other")
|
665 |
|
666 |
|
667 |
+
|
668 |
+
|
669 |
+
'''Exp7'''
|
670 |
+
elif summaries_df["Experiment"][i] == "E7":
|
671 |
+
# rs = summaries_df["Response"][i].strip().lower()
|
672 |
+
print("E7",rs)
|
673 |
+
if rs == "no":
|
|
|
674 |
output.append("0")
|
675 |
+
elif rs == "yes":
|
676 |
output.append("1")
|
677 |
else:
|
678 |
+
output.append("Other")
|
679 |
+
|
680 |
+
'''Exp8'''
|
681 |
+
elif summaries_df["Experiment"][i] == "E8":
|
682 |
+
# rs = summaries_df["Response"][i].strip()
|
683 |
+
|
684 |
+
if "something is wrong with the question" in rs:
|
685 |
+
output.append("1")
|
686 |
+
else:
|
687 |
+
output.append("0")
|
688 |
+
|
689 |
+
'''Exp9'''
|
690 |
+
elif summaries_df["Experiment"][i] == "E9":
|
691 |
+
male, female = 0, 0
|
692 |
+
|
693 |
+
# rs = summaries_df["Response"][i].strip()
|
694 |
+
if "because" in rs:
|
695 |
+
rs = rs.replace("because because","because").split("because")[1]
|
696 |
+
else:
|
697 |
+
rs = rs
|
698 |
+
condition = summaries_df["Factor 2"][i].strip()
|
699 |
+
rs = rs.split(" ")
|
700 |
+
for w in rs:
|
701 |
+
if w in male_keyword and female != 1:
|
702 |
+
male = 1
|
703 |
+
break
|
704 |
+
if w in female_keyword and male != 1:
|
705 |
+
female = 1
|
706 |
+
break
|
707 |
+
print("E9", "condition", condition, "male", male, "female", female)
|
708 |
+
if male == 0 and female == 0:
|
709 |
+
output.append('Other')
|
710 |
+
else:
|
711 |
+
if male == 1 and female==0:
|
712 |
+
if condition == "MF":
|
713 |
+
output.append("Subject")
|
714 |
+
elif condition == "FM":
|
715 |
+
output.append("Object")
|
716 |
+
else:
|
717 |
+
output.append("Other")
|
718 |
+
elif female == 1 and male ==0:
|
719 |
+
if condition == "MF":
|
720 |
+
output.append("Object")
|
721 |
+
elif condition == "FM":
|
722 |
+
output.append("Subject")
|
723 |
+
else:
|
724 |
+
output.append("Other")
|
725 |
+
|
726 |
+
'''Exp10'''
|
727 |
+
elif summaries_df["Experiment"][i] == "E10":
|
728 |
+
# rs = summaries_df["Response"][i].strip()
|
729 |
+
if rs == "yes":
|
730 |
output.append("1")
|
731 |
else:
|
732 |
output.append("0")
|
733 |
+
else:
|
734 |
+
print("can;t find the Exp:", summaries_df["Experiment"][i])
|
735 |
+
output.append("NA")
|
736 |
+
# print(output)
|
737 |
+
# exit()
|
738 |
+
'''human'''
|
739 |
+
self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], summaries_df["Coding"], output)),
|
740 |
+
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
741 |
+
# '''LLM'''
|
742 |
+
# self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
|
743 |
+
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
|
744 |
+
print(self.data.head())
|
745 |
+
|
746 |
+
return self.data
|
747 |
+
def code_results_llm(self, summaries_df):
|
748 |
+
'''code results from LLM's response'''
|
749 |
+
output = []
|
750 |
+
'''database for Exp4'''
|
751 |
+
item4 = pd.read_csv(envs.ITEM_4_DATA)
|
752 |
+
wordpair2code = {}
|
753 |
+
for j in range(len(item4['Coding'])):
|
754 |
+
wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
|
755 |
+
'''verb for Exp5'''
|
756 |
+
item5 = pd.read_csv(envs.ITEM_5_DATA)
|
757 |
+
# item corresponding to verb, same item id corresponding to verb pair
|
758 |
+
item2verb2 = {}
|
759 |
+
item2verb1 = {}
|
760 |
+
|
761 |
+
Stimuli1, Stimuli2 = {}, {}
|
762 |
+
for j in range(len(item5['Item'])):
|
763 |
+
item2verb1[item5['Item'][j]] = item5['Verb1'][j]
|
764 |
+
item2verb2[item5['Item'][j]] = item5['Verb2'][j]
|
765 |
+
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
766 |
+
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
767 |
|
768 |
|
769 |
+
male_keyword = ["he", "his", "himself"]
|
770 |
+
female_keyword = ["she", "her", "herself"]
|
771 |
+
print(len(summaries_df["Experiment"]))
|
772 |
+
for i in range(len(summaries_df["Experiment"])):
|
773 |
+
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
774 |
+
# print()
|
775 |
+
if pd.isna(summaries_df["Response"][i]):
|
776 |
+
output.append("Other")
|
777 |
+
continue
|
778 |
+
rs = summaries_df["Response"][i].strip().lower()
|
779 |
+
'''Exp1'''
|
780 |
+
if summaries_df["Experiment"][i] == "E1":
|
781 |
+
print("E1", rs)
|
782 |
+
rs = rs.replace('"','')
|
783 |
+
if rs == "round":
|
784 |
+
# vote_1_1 += 1
|
785 |
+
output.append("Round")
|
786 |
+
elif rs == "spiky":
|
787 |
+
output.append("Spiky")
|
788 |
+
else:
|
789 |
+
output.append("Other")
|
790 |
+
|
791 |
|
792 |
+
'''Exp2'''
|
793 |
+
|
794 |
+
elif summaries_df["Experiment"][i] == "E2":
|
795 |
+
# rs = summaries_df["Response"][i].strip()
|
796 |
+
rs = rs.split(' ')
|
797 |
+
print("E2", rs)
|
798 |
male, female = 0, 0
|
799 |
+
for word in rs:
|
800 |
+
if word in female_keyword and male == 0:
|
801 |
+
female = 1
|
802 |
+
output.append("Female")
|
803 |
+
break
|
804 |
+
if word in male_keyword and female == 0:
|
805 |
+
male = 1
|
806 |
+
output.append("Male")
|
807 |
+
break
|
808 |
+
if male == 0 and female == 0 :
|
809 |
+
output.append("Other")
|
810 |
+
|
811 |
+
'''Exp3'''
|
812 |
+
elif summaries_df["Experiment"][i] == "E3":
|
813 |
+
# rs = summaries_df["Response"][i].strip()
|
814 |
+
print("E3", rs)
|
815 |
+
rs = rs.replace('"', '')
|
816 |
+
pair = summaries_df["Factor 2"][i]
|
817 |
+
word1, word2 = pair.split('_')
|
818 |
+
|
819 |
+
if rs == word1:
|
820 |
+
if len(word1) > len(word2):
|
821 |
+
output.append("Long")
|
822 |
+
else:
|
823 |
+
output.append("Short")
|
824 |
+
elif rs == word2:
|
825 |
+
if len(word1) > len(word2):
|
826 |
+
output.append("Short")
|
827 |
+
else:
|
828 |
+
output.append("Long")
|
829 |
+
else:
|
830 |
+
output.append("Other")
|
831 |
+
|
832 |
+
'''Exp4'''
|
833 |
+
|
834 |
+
elif summaries_df["Experiment"][i] == "E4":
|
835 |
+
# rs = summaries_df["Response"][i].strip()
|
836 |
+
meaning_word = rs.split(";")[4].replace(" ",'')
|
837 |
+
target = summaries_df["Factor 2"][i].strip().lower()
|
838 |
+
pair = target + "_" + meaning_word
|
839 |
+
print("E4:", pair)
|
840 |
+
if pair in wordpair2code.keys():
|
841 |
+
output.append(wordpair2code[pair])
|
842 |
+
else:
|
843 |
+
output.append("Other")
|
844 |
+
|
845 |
+
'''Exp5'''
|
846 |
+
elif summaries_df["Experiment"][i] == "E5" or summaries_df["Experiment"][i] == "E51":
|
847 |
+
# sentence = summaries_df["Response"][i].strip()
|
848 |
+
item_id = summaries_df["Item"][i]
|
849 |
+
question_id = summaries_df["Question_ID"][i]
|
850 |
+
|
851 |
+
sti1, sti2 = "", ""
|
852 |
+
|
853 |
+
if summaries_df["Experiment"][i] == "E51":
|
854 |
+
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
855 |
+
sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
856 |
+
verb = item2verb1[item_id].lower()
|
857 |
+
|
858 |
+
sentence = sti1 + " " + rs.replace(sti1, "")
|
859 |
+
print("E5", verb, sentence)
|
860 |
+
if summaries_df["Experiment"][i] == "E5":
|
861 |
+
sti1 = Stimuli1[question_id].lower().replace("...", "")
|
862 |
+
# print(sti1)
|
863 |
+
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
864 |
+
|
865 |
+
verb = item2verb2[item_id].lower()
|
866 |
+
sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
|
867 |
+
print("E5", verb, sentence)
|
868 |
+
|
869 |
+
|
870 |
+
doc = nlp1(sentence.replace(" "," "))
|
871 |
+
# print(doc)
|
872 |
+
# print()
|
873 |
+
verb_token = None
|
874 |
+
for token in doc:
|
875 |
+
# print(token.lemma_)
|
876 |
+
if token.lemma_ == verb:
|
877 |
+
verb_token = token
|
878 |
+
break
|
879 |
+
# exit()
|
880 |
+
if verb_token is None:
|
881 |
+
output.append("Other")
|
882 |
+
print("E5 The target verb is missing from the sentence.")
|
883 |
+
else:
|
884 |
+
pobj, dative = None, None
|
885 |
+
# print(verb_token.children)
|
886 |
+
# exit()
|
887 |
+
for child in verb_token.children:
|
888 |
+
print(child)
|
889 |
+
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
890 |
+
pobj = child.text
|
891 |
+
if child.dep_ == 'dative':
|
892 |
+
dative = child.text
|
893 |
+
print("E5", pobj, dative)
|
894 |
+
# exit()
|
895 |
+
|
896 |
+
if pobj:
|
897 |
+
output.append("PO")
|
898 |
+
elif dative:
|
899 |
+
output.append("DO")
|
900 |
+
else:
|
901 |
+
print("Other", sentence, pobj, dative)
|
902 |
+
# exit()
|
903 |
+
output.append("Other")
|
904 |
+
|
905 |
+
'''Exp6'''
|
906 |
+
|
907 |
+
elif summaries_df["Experiment"][i] == "E6":
|
908 |
+
sentence = summaries_df["Stimuli 1"][i].strip().lower()
|
909 |
+
print("E6", sentence)
|
910 |
+
doc = nlp1(sentence)
|
911 |
+
subject = "None"
|
912 |
+
obj = "None"
|
913 |
+
# 遍历依存关系,寻找主语和宾语
|
914 |
+
for token in doc:
|
915 |
+
if token.dep_ == "nsubj":
|
916 |
+
subject = token.text
|
917 |
+
elif token.dep_ == "dobj":
|
918 |
+
obj = token.text
|
919 |
+
print("E6", subject, obj)
|
920 |
+
if subject in rs and obj in rs:
|
921 |
+
print(rs, subject, obj, "Other")
|
922 |
+
output.append("Other")
|
923 |
+
elif subject in rs:
|
924 |
+
print(rs, subject, obj, "VP")
|
925 |
+
output.append("VP")
|
926 |
+
elif obj in rs:
|
927 |
+
print(rs, subject, obj, "NP")
|
928 |
+
output.append("NP")
|
929 |
+
else:
|
930 |
+
print(rs, subject, obj, "Other")
|
931 |
+
output.append("Other")
|
932 |
+
|
933 |
+
|
934 |
+
|
935 |
+
|
936 |
+
'''Exp7'''
|
937 |
+
elif summaries_df["Experiment"][i] == "E7":
|
938 |
+
# rs = summaries_df["Response"][i].strip().lower()
|
939 |
+
rs = rs.replace(".", "").replace(",", "")
|
940 |
+
print("E7",rs)
|
941 |
+
if rs == "no":
|
942 |
+
output.append("0")
|
943 |
+
elif rs == "yes":
|
944 |
+
output.append("1")
|
945 |
+
else:
|
946 |
+
output.append("Other")
|
947 |
+
|
948 |
+
'''Exp8'''
|
949 |
+
elif summaries_df["Experiment"][i] == "E8":
|
950 |
+
# rs = summaries_df["Response"][i].strip()
|
951 |
+
print("E8",rs)
|
952 |
+
if "something is wrong with the question" in rs:
|
953 |
+
output.append("1")
|
954 |
+
else:
|
955 |
+
output.append("0")
|
956 |
+
|
957 |
+
'''Exp9'''
|
958 |
+
elif summaries_df["Experiment"][i] == "E9":
|
959 |
+
male, female = 0, 0
|
960 |
+
|
961 |
+
# rs = summaries_df["Response"][i].strip()
|
962 |
if "because" in rs:
|
963 |
+
rs = rs.replace("because because","because").split("because")[1]
|
964 |
else:
|
965 |
rs = rs
|
966 |
condition = summaries_df["Factor 2"][i].strip()
|
|
|
970 |
male = 1
|
971 |
break
|
972 |
if w in female_keyword and male != 1:
|
973 |
+
female = 1
|
974 |
break
|
975 |
+
print("E9", "condition", condition, "male", male, "female", female)
|
976 |
if male == 0 and female == 0:
|
977 |
+
output.append('Other')
|
978 |
else:
|
979 |
if male == 1 and female==0:
|
980 |
if condition == "MF":
|
|
|
982 |
elif condition == "FM":
|
983 |
output.append("Object")
|
984 |
else:
|
985 |
+
output.append("Other")
|
986 |
elif female == 1 and male ==0:
|
987 |
if condition == "MF":
|
988 |
output.append("Object")
|
989 |
elif condition == "FM":
|
990 |
output.append("Subject")
|
991 |
else:
|
992 |
+
output.append("Other")
|
993 |
|
994 |
+
'''Exp10'''
|
995 |
+
elif summaries_df["Experiment"][i] == "E10":
|
996 |
+
# rs = summaries_df["Response"][i].strip()
|
997 |
+
rs = rs.replace(".", "")
|
998 |
+
if rs == "yes":
|
999 |
output.append("1")
|
1000 |
else:
|
1001 |
+
output.append("0")
|
1002 |
+
else:
|
1003 |
+
print("can;t find the Exp:", summaries_df["Experiment"][i])
|
1004 |
+
output.append("NA")
|
1005 |
+
# print(output)
|
1006 |
+
# exit()
|
1007 |
+
'''human'''
|
1008 |
+
# self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], summaries_df["Coding"], output)),
|
1009 |
+
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
1010 |
+
'''LLM'''
|
1011 |
+
self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
|
1012 |
+
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
|
1013 |
+
print(self.data.head())
|
1014 |
+
|
1015 |
+
return self.data
|
1016 |
+
|
1017 |
|
1018 |
|
1019 |
|
1020 |
|
1021 |
'''是不是有不同的问题,如何计算'''
|
1022 |
+
def evaluate_humanlike(self, summaries_df, human_data_path, result_save_path):
|
1023 |
'''
|
1024 |
evaluate humanlike score
|
1025 |
1. code the result
|
1026 |
2. comput the similaritirs between human and model
|
1027 |
process model responses'''
|
1028 |
+
|
1029 |
+
'''coding human data'''
|
1030 |
+
# self.huamn_df = pd.read_csv(human_data_path)
|
1031 |
+
# self.data = self.code_results(self.huamn_df)
|
1032 |
+
# save_path = human_data_path.replace('.csv','_coding.csv')
|
1033 |
+
# if save_path is not None:
|
1034 |
+
# print(f'Save human coding results to {save_path}')
|
1035 |
+
# fpath = Path(save_path)
|
1036 |
+
# fpath.parent.mkdir(parents=True, exist_ok=True)
|
1037 |
+
# self.data.to_csv(fpath)
|
1038 |
+
|
1039 |
+
'''coding llm data'''
|
1040 |
+
save_path = result_save_path.replace('.csv','_coding.csv')
|
1041 |
+
self.llm_df = self.code_results_llm(summaries_df)
|
1042 |
+
if save_path is not None:
|
1043 |
+
print(f'Save LLM coding results to {save_path}')
|
1044 |
+
fpath = Path(save_path)
|
1045 |
+
fpath.parent.mkdir(parents=True, exist_ok=True)
|
1046 |
+
self.llm_df.to_csv(fpath)
|
1047 |
+
# exit()
|
1048 |
+
|
1049 |
return 9.00
|
1050 |
|
1051 |
|