Spaces:
Sleeping
Sleeping
Update mmlu_eval.py
Browse files
- mmlu_eval.py +2 -2
mmlu_eval.py
CHANGED
|
@@ -45,7 +45,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
|
|
| 45 |
incorrect_examples = []
|
| 46 |
|
| 47 |
for task_name in mmlu_dataset.keys():
|
| 48 |
-
|
| 49 |
dataset = mmlu_dataset[task_name]
|
| 50 |
sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))
|
| 51 |
|
|
@@ -53,7 +53,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
|
|
| 53 |
references = []
|
| 54 |
|
| 55 |
for sample in sampled_questions:
|
| 56 |
-
|
| 57 |
question = sample["question"]
|
| 58 |
correct_answer = str(sample["answer"]).strip().lower()
|
| 59 |
model_output = generate_answer(model, tokenizer, question).strip().lower()
|
|
|
|
| 45 |
incorrect_examples = []
|
| 46 |
|
| 47 |
for task_name in mmlu_dataset.keys():
|
| 48 |
+
print ("TASK NAME: ", task_name)
|
| 49 |
dataset = mmlu_dataset[task_name]
|
| 50 |
sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))
|
| 51 |
|
|
|
|
| 53 |
references = []
|
| 54 |
|
| 55 |
for sample in sampled_questions:
|
| 56 |
+
print ("SAMPLE", sample)
|
| 57 |
question = sample["question"]
|
| 58 |
correct_answer = str(sample["answer"]).strip().lower()
|
| 59 |
model_output = generate_answer(model, tokenizer, question).strip().lower()
|