Spaces:

loveblairsky
/

llm-eval-dashboard

Sleeping

App Files Community

Blair Yang committed on Mar 4, 2024

Commit

5264831

1 Parent(s): e159d95

done api

Browse files

Files changed (8) hide show

Config.py +24 -0
__pycache__/Config.cpython-311.pyc +0 -0
app.py +87 -4
data/.DS_Store +0 -0
data/mmlu/response_rec.csv +13 -0
plot.py +15 -0
requirements.txt +4 -0
util.py +14 -0

Config.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# Names of the evaluation datasets that are currently enabled.
+DATASETS = [
+    'mmlu',
+    # 'Anthropic_safety_eval'
+]
+# Maps a dataset name to the list of sub-topics available for it.
+# An entry may exist here for a dataset that is commented out of DATASETS.
+TOPICS = {
+    'mmlu':
+    [
+        # 'high_school_biology',
+        'high_school_physics'
+    ],
+    'Anthropic_safety_eval':
+    [
+        'myopia'
+    ]
+}
+# Model identifiers.
+# NOTE(review): presumably these match the `model_name` values in the response CSVs -- confirm.
+MODELS = ['Llama-2-70b-chat-hf',
+          'Llama-2-13b-chat-hf',
+          'Mixtral-8x7B-Instruct-v0.1',
+          'Mistral-7B-Instruct-v0.2'
+          ]
+# Fixed seed value; NOTE(review): no consumer is visible in this commit -- confirm where it is used.
+RANDOM_SEED = 42

__pycache__/Config.cpython-311.pyc ADDED Viewed

Binary file (472 Bytes). View file

app.py CHANGED Viewed

@@ -1,7 +1,90 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+import plotly.express as px
+import plotly.graph_objs as go
+from collections import defaultdict
+import json, math, gdown
+import numpy as np
+import pandas as pd
+from Config import *
+pd.options.display.float_format = '{:.2f}'.format
+battles = np.linspace(0, 100, 100)
+meta_topics = ['mmlu']
+def generate_plot(meta_index, topic_index):
+    """
+        Bar plot of a specific dataset
+    """
+    # battles = np.linspace(0, 100, 100)
+    meta_topic = meta_topics[meta_index]
+    print(meta_topic)
+    topic = TOPICS[meta_topic][topic_index]
+    data = pd.read_csv(f"data/{meta_topic}/response_rec.csv", sep=",")
+    topic_data = data[data['sub_topic'] == topic]
+    # Compute human and llm accuracy
+    topic_data['human_acc'] = topic_data['no_correct_human'] / topic_data['no_responses_human'].replace(0, np.nan)
+    topic_data['llm_acc'] = topic_data['no_correct_llm'] / topic_data['no_responses_llm'].replace(0, np.nan)
+    # Calculate mean and standard deviation for the sample data
+    mean_data = topic_data.groupby('model_name').mean().reset_index()
+    std_deviation = topic_data.groupby('model_name').std().reset_index()
+    # Prepare the plot data
+    plot_data = []
+    # Define a consistent color scheme
+    colors = ['#FFA07A', '#20B2AA', '#778899']  # Light Salmon, Light Sea Green, Light Slate Gray
+    opacities = [0.7, 0.7, 0.7]  # Opacity for average bars
+    # Add bars with error bars for the averages
+    for acc_type, color, opacity in zip(['oracle_acc', 'human_acc', 'llm_acc'], colors, opacities):
+        plot_data.append(go.Bar(
+            x=mean_data['model_name'],
+            y=mean_data[acc_type],
+            error_y=dict(
+                type='data',
+                array=std_deviation[acc_type],
+                visible=True
+            ),
+            name=acc_type.split('_')[0].capitalize(),
+            marker=dict(color=color, opacity=opacity)
+        ))
+    # Layout
+    layout = go.Layout(
+        title=f"Accuracy for {meta_topic} ({topic})",
+        xaxis=dict(title='Model Name'),
+        yaxis=dict(title='Accuracy'),
+        showlegend=True,
+        legend=dict(title='Accuracy Type'),
+        barmode='group'
+    )
+    fig = go.Figure(data=plot_data, layout=layout)
+    return fig
+# Gradio interface with grid layout
+# Three rows of two plots each. Every gr.Plot is constructed with the figure
+# returned by generate_plot(0, 0), so all six cells currently show the same chart
+# and the CSV is read once per cell.
+# NOTE(review): the commented-out `update` lines presumably sketch the
+# (meta_index, topic_index) pair each cell is meant to show -- confirm.
+with gr.Blocks() as interface:
+    with gr.Row():  # Row 1
+        plot1 = gr.Plot(generate_plot(0, 0))
+        # plot1.update(inputs=[0, 0])
+        plot2 = gr.Plot(generate_plot(0, 0))
+        # plot2.update(inputs=[0, 1])
+    with gr.Row():  # Row 2
+        plot3 = gr.Plot(generate_plot(0, 0))
+        # plot3.update(inputs=[1, 0])
+        plot4 = gr.Plot(generate_plot(0, 0))
+        # plot4.update(inputs=[1, 1])
+    with gr.Row():  # Row 3
+        plot5 = gr.Plot(generate_plot(0, 0))
+        # plot5.update(inputs=[2, 0])
+        plot6 = gr.Plot(generate_plot(0, 0))
+        # plot6.update(inputs=[2, 1])
+# Runs at module level, so the app is launched as soon as this file is executed.
+interface.launch()

data/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

data/mmlu/response_rec.csv CHANGED Viewed

	@@ -0,0 +1,13 @@

+sub_topic,model_name,card_idx,no_responses_human,no_correct_human,no_responses_llm,no_correct_llm,oracle_acc
+high_school_physics,Mixtral-8x7B-Instruct-v0.1,-1,10,8,10,7,0.68
+high_school_physics,Mixtral-8x7B-Instruct-v0.1,0,6,4,6,3,0.66
+high_school_physics,Mixtral-8x7B-Instruct-v0.1,1,4,4,4,4,0.7
+high_school_physics,Mistral-7B-Instruct-v0.2,-1,0,0,0,0,0
+high_school_physics,Mistral-7B-Instruct-v0.2,0,0,0,0,0,0
+high_school_physics,Mistral-7B-Instruct-v0.2,1,0,0,0,0,0
+high_school_biology,Mixtral-8x7B-Instruct-v0.1,-1,10,8,10,7,0.68
+high_school_biology,Mixtral-8x7B-Instruct-v0.1,0,6,4,6,3,0.66
+high_school_biology,Mixtral-8x7B-Instruct-v0.1,1,4,4,4,4,0.7
+high_school_biology,Mistral-7B-Instruct-v0.2,-1,0,0,0,0,0
+high_school_biology,Mistral-7B-Instruct-v0.2,0,0,0,0,0,0
+high_school_biology,Mistral-7B-Instruct-v0.2,1,0,0,0,0,0

plot.py CHANGED Viewed

	@@ -0,0 +1,15 @@

+from collections import defaultdict
+import json, math, gdown
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from tqdm import tqdm
+pd.options.display.float_format = '{:.2f}'.format
+battles = np.linspace(0, 100, 100)
+fig = px.bar(battles,
+             title="Counts of Battle Outcomes", text_auto=True, height=400)
+fig.update_layout(xaxis_title="Battle Outcome", yaxis_title="Count",
+                  showlegend=False)
+fig.show()

requirements.txt CHANGED Viewed

	@@ -0,0 +1,4 @@

+plotly
+numpy
+pandas
+tqdm

util.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import pandas as pd
+def read_data(file_path):
+    """
+        Read data from a csv file
+    """
+    return pd.read_csv(file_path, sep=",")
+if __name__ == "__main__":
+    file_path = "data/mmlu/response_rec.csv"
+    data = read_data(file_path)
+    high_school_physics = data[data['sub_topic'] == 'high_school_physics']
+    print(high_school_physics.head(5))