from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One leaderboard column backed by a single metric of one benchmark."""

    benchmark: str  # task_key in the json file
    metric: str  # metric_key in the json file
    col_name: str  # name to display in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Tasks shown on the text-only leaderboard."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")


class TasksMultimodal(Enum):
    """Tasks shown on the multimodal leaderboard (the text tasks plus three more)."""

    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
    task4 = Task("vqa", "acc", "VQA")
    task5 = Task("winoground", "acc", "Winoground")
    task6 = Task("devbench", "acc", "DevBench")


@dataclass
class TaskMIB_Subgraph:
    """Description of one MIB subgraph task and the models reported for it."""

    benchmark: str  # task name in json (ioi/arithmetic)
    models: list[str]  # list of models to show as sub-columns
    col_name: str  # display name in leaderboard
    metrics: list[str]  # metrics to store (edge_counts, faithfulness)


class TasksMib_Subgraph(Enum):
    """MIB subgraph tasks; each member's value is a ``TaskMIB_Subgraph``."""

    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

    @classmethod
    def get_all_tasks(cls) -> list[str]:
        """Returns a list of all task benchmarks, in member definition order."""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls) -> list[str]:
        """Returns a sorted list of all unique models across all tasks."""
        # The set removes models shared by several tasks; sorted() accepts
        # any iterable, so no intermediate list is needed.
        return sorted({model for task in cls for model in task.value.models})


@dataclass
class TaskMIB_Causalgraph:
    """Description of one MIB causal-graph task."""

    benchmark: str  # e.g. "MCQA"
    models: list[str]  # list of all models
    layers: dict[str, list[str]]  # Different layers for each model
    col_name: str  # display name in leaderboard
    interventions: list[str]  # e.g. output_token, output_location
    counterfactuals: list[str]  # e.g. randomLetter_counterfactual
    metrics: list[str]  # e.g. score


# class TasksMib_Causalgraph(Enum):
#     task0 = TaskMIB_Causalgraph(
#         "MCQA",
#         ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],
#         {
#             "Qwen2ForCausalLM":
[str(i) for i in range(24)], # 0-23 # "Gemma2ForCausalLM": [str(i) for i in range(26)], # 0-25 # "LlamaForCausalLM": [str(i) for i in range(32)] # 0-31 # }, # "mcqa", # ["output_token", "output_location"], # ["randomLetter_counterfactual", "answerPosition_counterfactual", # "answerPosition_randomLetter_counterfactual"], # ["score"] # ) class TasksMib_Causalgraph(Enum): task0 = TaskMIB_Causalgraph( "MCQA", ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"], # Use lowercase names to match actual columns { "qwen2forcausallm": [str(i) for i in range(24)], # 0-23 "gemma2forcausallm": [str(i) for i in range(26)], # 0-25 "llamaforcausallm": [str(i) for i in range(32)] # 0-31 }, "mcqa", ["output_token", "output_location"], ["randomLetter_counterfactual", "answerPosition_counterfactual", "answerPosition_randomLetter_counterfactual"], ["score"] ) NUM_FEWSHOT = 0 # Change with your few shot # --------------------------------------------------- # Your leaderboard name TITLE = """