In [1]:
import os
import re
from pathlib import Path

import pandas as pd

In [2]:
# Directory holding the parquet exports read below. Set the OLAS_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# location is used, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get(
    'OLAS_DATA_DIR',
    '/Users/arshath/play/openautonomy/olas-prediction-live-dashboard/data',
))

# NOTE(review): `fpmms` is loaded but not referenced by any later cell shown
# here; kept because other cells may rely on it.
fpmms = pd.read_parquet(DATA_DIR / 'fpmms.parquet')
tools = pd.read_parquet(DATA_DIR / 'tools.parquet')
trades = pd.read_parquet(DATA_DIR / 'all_trades_profitability.parquet')

In [3]:
def extract_question(text):
    """Return the first double-quoted question contained in ``text``.

    A question is a run of characters without double quotes that ends in
    ``?`` and is enclosed in double quotes, e.g. ``"Will X happen?"``.

    Parameters
    ----------
    text : str or any
        Text to search. Values that are not strings (``None``, NaN, ...)
        are returned unchanged instead of raising ``TypeError``, so the
        function can be applied to a column that has missing entries.

    Returns
    -------
    The question without its surrounding quotes, or ``text`` itself when no
    quoted question is found.
    """
    # re.search only accepts str/bytes; pass anything else straight through.
    if not isinstance(text, str):
        return text
    match = re.search(r'"([^"]+\?)"', text)
    return match.group(1) if match else text

In [4]:
def get_current_answer(q):
    """Return the distinct ``current_answer`` values of trades titled ``q``.

    Rows are selected from the module-level ``trades`` frame by exact
    equality on its ``title`` column; the result is whatever
    ``Series.unique()`` yields for the matching ``current_answer`` values.
    """
    title_matches = trades['title'] == q
    return trades.loc[title_matches, 'current_answer'].unique()

In [5]:
# Keep only the trades whose creation timestamp falls in May 2024.
trades['creation_timestamp'] = pd.to_datetime(trades['creation_timestamp'])
may_2024_mask = (
    trades['creation_timestamp'].dt.month.eq(5)
    & trades['creation_timestamp'].dt.year.eq(2024)
)
trades = trades[may_2024_mask]

# Flag every tool call whose vote equals the market's currentAnswer, then
# drop the calls made by the 'resolve-market-reasoning-gpt-4' tool.
tools['winning_vote'] = tools['vote'].eq(tools['currentAnswer'])
tools = tools[tools['tool'].ne('resolve-market-reasoning-gpt-4')].reset_index(drop=True)

In [6]:
# Replace each prompt with the quoted question it contains (prompts without a
# quoted question are left as they are).
tools['prompt_request'] = tools['prompt_request'].map(extract_question)

In [7]:
# Count trades per (title, winning_trade) pair, then pivot winning_trade into
# columns so each title is one row; pairs that never occur count as 0.
trade_counts = trades.groupby(['title', 'winning_trade']).size()
trades_grouped = trade_counts.unstack('winning_trade').fillna(0)

In [8]:
# Share of winning trades per title = winning count / total count.
trades_per_title = trades_grouped.sum(axis=1)
winning_trades_percentage = (trades_grouped[True] / trades_per_title).reset_index()
winning_trades_percentage.columns = ['title', 'winning_trade_percentage']
winning_trades_percentage['num_trades'] = trades_per_title.values

# Rank titles from highest to lowest winning share. The top 50 are the first
# 50 rows of that ranking and the bottom 50 are its last 50 rows, so within
# the bottom-50 table the lowest share sits at the end, not at row 0.
ranked_by_win_share = winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False)
winning_trades_percentage_bottom_50 = ranked_by_win_share.tail(50).reset_index(drop=True)
winning_trades_percentage_top_50 = ranked_by_win_share.head(50).reset_index(drop=True)

In [13]:
# winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False).reset_index(drop=True).to_csv('winning_trades_percentage.csv', index=False)

In [18]:
# Titles of the 50 markets with the highest share of winning trades.
list(winning_trades_percentage_top_50['title'])



['Will Kylian Mbappe leave Paris St-Germain at the end of the season by 16 May 2024?',
 'Will BlizzCon be reinstated on or by 1 May 2024 after its cancellation in 2024?',
 'Will Joe Biden approve more weapons for Ukraine by 4 May 2024?',
 "Will FiiO's new custom in-ear monitors become the top-selling wireless earbuds by 9 May 2024?",
 'Will Mohamed Salah leave Liverpool on 7 May 2024?',
 "Will Ryan Gosling accept a 'dark' role in a film by 14 May 2024?",
 'Will the Philadelphia 76ers win the NBA play-offs on 7 May 2024?',
 'Will the Panamanian presidential election result in a clear victor by 12 May 2024?',
 'Will the Museum of Old and New Art in Tasmania be allowed to keep its exhibit women-only by 14 May 2024?',
 "Will Diego Maradona's 'Stolen' Golden Ball be auctioned off on 14 May 2024?",
 'Will the Mercedes G-Wagen release an electric version on 1 May 2024?',
 'Will the Israeli government lift the broadcast ban on Al Jazeera on or before 13 May 2024?',
 'Will Intel release its Cor

In [17]:
# Titles of the 50 markets with the lowest share of winning trades.
list(winning_trades_percentage_bottom_50['title'])

["Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?",
 'Will Fiona Harvey officially file a lawsuit against Netflix and Richard Gadd by 17 May 2024?',
 'Will the final report on the Baltimore bridge collapse be released by 20 May 2024?',
 'Will the Autonomous Racing League successfully hold their second race by May 3, 2024?',
 'Will Trent Staggs win the Senatorial race to replace Sen. Mitt Romney (R-UT) on 5 May 2024?',
 'Will the Houston area experience flooding conditions on 11 May 2024?',
 "Will 'Wednesday' season 2 be released on Netflix by 1 May 2024?",
 'Will Arsenal win against Bournemouth in the Premier League match on 12 May 2024?',
 'Will Qualcomm release its Snapdragon X Plus laptop chip by 1 May 2024?',
 "Will Feyenoord's Arne Slot become the new manager of Liverpool by 1 May 2024?",
 'Will the FCC receive additional funding for replacing Huawei gear by 10 May 2024?',
 'Will there be any major cyber attack on an organization using AI before 2

In [62]:
def losing_percentage(q):
    """Per-tool share of losing votes for the question ``q``.

    Selects the rows of the module-level ``tools`` frame whose
    ``prompt_request`` contains ``q`` as a literal substring, counts winning
    and losing votes per tool (``winning_vote`` column) and returns the
    losing share of each tool.

    Parameters
    ----------
    q : str
        Question text to look for. It is matched literally, not as a regular
        expression, so characters such as ``?``, ``(``, ``)`` and ``.`` in a
        market title match themselves.

    Returns
    -------
    pandas.DataFrame
        Columns ``tool``, ``losing_percentage`` (fraction between 0 and 1)
        and ``num_calls``, sorted by ``losing_percentage`` in descending
        order. Empty (same columns) when no prompt contains ``q``.
    """
    print(f"Losing percentage for: {q}")

    # regex=False: titles end in '?' and may contain parentheses or dots,
    # which a regex would interpret as operators and silently mis-match.
    # na=False: prompts that are missing simply do not match.
    matches = tools['prompt_request'].str.contains(q, regex=False, na=False)
    if not matches.any():
        return pd.DataFrame(columns=['tool', 'losing_percentage', 'num_calls'])

    q_losing = (
        tools[matches]
        .groupby(['tool', 'winning_vote'])
        .size()
        .unstack()
        .fillna(0)
        # Guarantee both outcome columns exist even when every matching vote
        # won (or every one lost); otherwise the lookups below raise KeyError.
        .reindex(columns=[False, True], fill_value=0)
    )
    q_losing_perc = q_losing[False] / (q_losing[False] + q_losing[True])
    q_losing_perc = q_losing_perc.reset_index()
    q_losing_perc.columns = ['tool', 'losing_percentage']
    q_losing_perc['num_calls'] = list(q_losing.sum(axis=1).values)
    q_losing_perc = q_losing_perc.sort_values(by='losing_percentage', ascending=False)
    return q_losing_perc

In [63]:
# The market resolution for this question has been confirmed as correct.
losing_percentage(winning_trades_percentage_bottom_50.at[0, 'title'])

Losing percentage for: Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,prediction-offline,1.0,40.0
4,prediction-request-rag-claude,1.0,17.0
7,prediction-url-cot-claude,1.0,2.0
2,prediction-online-sme,0.656716,67.0
6,prediction-request-reasoning-claude,0.571429,7.0
5,prediction-request-reasoning,0.538462,52.0
3,prediction-request-rag,0.25,4.0
1,prediction-online,0.185185,27.0


In [64]:
# currentAnswer for this question has been confirmed (row 0 of the bottom-50 table).
losing_percentage(winning_trades_percentage_bottom_50.at[0, 'title'])

Losing percentage for: Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,prediction-offline,1.0,40.0
4,prediction-request-rag-claude,1.0,17.0
7,prediction-url-cot-claude,1.0,2.0
2,prediction-online-sme,0.656716,67.0
6,prediction-request-reasoning-claude,0.571429,7.0
5,prediction-request-reasoning,0.538462,52.0
3,prediction-request-rag,0.25,4.0
1,prediction-online,0.185185,27.0


In [65]:
# currentAnswer for this question has been confirmed (row 1 of the bottom-50 table).
losing_percentage(winning_trades_percentage_bottom_50.at[1, 'title'])

Losing percentage for: Will Fiona Harvey officially file a lawsuit against Netflix and Richard Gadd by 17 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
7,prediction-url-cot-claude,1.0,1.0
2,prediction-online-sme,0.977273,44.0
1,prediction-online,0.975,40.0
0,prediction-offline,0.677419,31.0
5,prediction-request-reasoning,0.534483,58.0
4,prediction-request-rag-claude,0.223881,67.0
6,prediction-request-reasoning-claude,0.2,5.0
3,prediction-request-rag,0.0,8.0


In [66]:
# currentAnswer for this question has been confirmed (row 2 of the bottom-50 table).
losing_percentage(winning_trades_percentage_bottom_50.at[2, 'title'])

Losing percentage for: Will the final report on the Baltimore bridge collapse be released by 20 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,claude-prediction-offline,1.0,5.0
1,claude-prediction-online,1.0,1.0
2,prediction-offline,1.0,87.0
6,prediction-request-rag-claude,1.0,25.0
9,prediction-url-cot-claude,1.0,1.0
3,prediction-online,0.95122,41.0
8,prediction-request-reasoning-claude,0.833333,6.0
5,prediction-request-rag,0.714286,7.0
7,prediction-request-reasoning,0.4375,48.0
4,prediction-online-sme,0.394366,71.0


In [67]:
# currentAnswer for this question has been confirmed (row 3 of the bottom-50 table).
losing_percentage(winning_trades_percentage_bottom_50.at[3, 'title'])

Losing percentage for: Will the Autonomous Racing League successfully hold their second race by May 3, 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,claude-prediction-offline,1.0,2.0
1,prediction-offline,1.0,23.0
2,prediction-online,1.0,14.0
3,prediction-online-sme,1.0,18.0
4,prediction-request-rag,1.0,5.0
5,prediction-request-rag-claude,1.0,8.0
8,prediction-url-cot-claude,1.0,6.0
6,prediction-request-reasoning,0.0,18.0
7,prediction-request-reasoning-claude,0.0,3.0


In [72]:
# Per-tool losing share for the question at row 5 of the bottom-50 table.
losing_percentage(winning_trades_percentage_bottom_50.at[5, 'title'])

Losing percentage for: Will the Houston area experience flooding conditions on 11 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,claude-prediction-offline,1.0,2.0
1,claude-prediction-online,1.0,6.0
2,prediction-offline,1.0,58.0
4,prediction-online-sme,1.0,39.0
5,prediction-request-rag,1.0,4.0
8,prediction-request-reasoning-claude,1.0,8.0
9,prediction-url-cot-claude,1.0,5.0
6,prediction-request-rag-claude,0.754717,53.0
7,prediction-request-reasoning,0.369048,84.0
3,prediction-online,0.166667,72.0


In [73]:
# Per-tool losing share for the question at row 6 of the bottom-50 table.
losing_percentage(winning_trades_percentage_bottom_50.at[6, 'title'])

Losing percentage for: Will 'Wednesday' season 2 be released on Netflix by 1 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
1,prediction-online-sme,0.75,4.0
5,prediction-request-reasoning-claude,0.75,4.0
2,prediction-request-rag,0.666667,6.0
3,prediction-request-rag-claude,0.5,2.0
4,prediction-request-reasoning,0.4,5.0
0,claude-prediction-online,0.0,1.0


In [74]:
# Per-tool losing share for the question at row 7 of the bottom-50 table.
losing_percentage(winning_trades_percentage_bottom_50.at[7, 'title'])

Losing percentage for: Will Arsenal win against Bournemouth in the Premier League match on 12 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,prediction-offline,1.0,11.0
1,prediction-online,1.0,17.0
2,prediction-online-sme,1.0,30.0
4,prediction-request-rag-claude,1.0,45.0
5,prediction-request-reasoning,0.874016,127.0
3,prediction-request-rag,0.25,4.0
6,prediction-request-reasoning-claude,0.0,2.0


In [75]:
# Per-tool losing share for the question at row 8 of the bottom-50 table.
losing_percentage(winning_trades_percentage_bottom_50.at[8, 'title'])

Losing percentage for: Will Qualcomm release its Snapdragon X Plus laptop chip by 1 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,claude-prediction-offline,1.0,7.0
1,prediction-offline,1.0,1.0
3,prediction-online-sme,1.0,19.0
5,prediction-request-rag-claude,1.0,15.0
4,prediction-request-rag,0.941176,17.0
2,prediction-online,0.8,5.0
7,prediction-request-reasoning-claude,0.666667,15.0
6,prediction-request-reasoning,0.652174,23.0
8,prediction-url-cot-claude,0.333333,3.0


In [76]:
# Per-tool losing share for the question at row 9 of the bottom-50 table.
losing_percentage(winning_trades_percentage_bottom_50.at[9, 'title'])

Losing percentage for: Will Feyenoord's Arne Slot become the new manager of Liverpool by 1 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,claude-prediction-offline,1.0,4.0
1,prediction-offline,1.0,2.0
8,prediction-url-cot-claude,1.0,2.0
6,prediction-request-reasoning,0.916667,12.0
7,prediction-request-reasoning-claude,0.9,10.0
4,prediction-request-rag,0.714286,14.0
3,prediction-online-sme,0.666667,9.0
2,prediction-online,0.5,2.0
5,prediction-request-rag-claude,0.454545,11.0


In [77]:
# Per-tool losing share for the question at row 10 of the bottom-50 table.
losing_percentage(winning_trades_percentage_bottom_50.at[10, 'title'])

Losing percentage for: Will the FCC receive additional funding for replacing Huawei gear by 10 May 2024?


Unnamed: 0,tool,losing_percentage,num_calls
0,claude-prediction-offline,1.0,6.0
1,claude-prediction-online,1.0,3.0
2,prediction-offline,1.0,36.0
6,prediction-request-rag-claude,1.0,50.0
4,prediction-online-sme,0.986486,74.0
5,prediction-request-rag,0.947368,19.0
3,prediction-online,0.910714,56.0
9,prediction-url-cot-claude,0.777778,9.0
7,prediction-request-reasoning,0.465753,73.0
8,prediction-request-reasoning-claude,0.071429,14.0


In [98]:
# Tool calls whose prompt is exactly one of the bottom-50 market titles.
all_q = winning_trades_percentage_bottom_50['title'].drop_duplicates().tolist()
q_losing = tools.loc[tools['prompt_request'].isin(all_q)]

# Winning / losing vote counts per tool, one row per tool.
q_losing = (
    q_losing
    .groupby(['tool'])['winning_vote']
    .value_counts()
    .unstack()
    .fillna(0)
)

# Losing share per tool plus the number of calls it is based on, with the
# worst-performing tools first.
lost_votes = q_losing[False]
q_losing_perc = (lost_votes / (lost_votes + q_losing[True])).reset_index()
q_losing_perc.columns = ['tool', 'losing_percentage']
q_losing_perc['num_calls'] = q_losing.sum(axis=1).values
q_losing_perc = q_losing_perc.sort_values(by='losing_percentage', ascending=False)

In [99]:
# Display the per-tool losing share computed over the bottom-50 market titles.
q_losing_perc

Unnamed: 0,tool,losing_percentage,num_calls
3,prediction-offline-sme,1.0,2.0
7,prediction-request-rag-claude,0.913007,1184.0
2,prediction-offline,0.893281,1012.0
6,prediction-request-rag,0.889881,336.0
5,prediction-online-sme,0.857143,1722.0
4,prediction-online,0.853553,1154.0
8,prediction-request-reasoning,0.847451,2727.0
10,prediction-url-cot-claude,0.846154,130.0
1,claude-prediction-online,0.735849,53.0
9,prediction-request-reasoning-claude,0.659664,238.0


In [103]:
# Tool calls whose prompt is exactly one of the bottom-50 market titles.
all_q = winning_trades_percentage_bottom_50['title'].drop_duplicates().tolist()
q_losing = tools.loc[tools['prompt_request'].isin(all_q)]

# How often each tool reported each confidence value (tools as rows,
# confidence values as columns, 0 where a combination never occurs).
(
    q_losing
    .groupby(['tool'])['confidence']
    .value_counts()
    .unstack()
    .fillna(0)
)

confidence,0.00,0.10,0.20,0.30,0.40,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,0.99,1.00
tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
claude-prediction-offline,0.0,0.0,5.0,46.0,4.0,0.0,0.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
claude-prediction-online,0.0,0.0,2.0,10.0,7.0,3.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
prediction-offline,0.0,267.0,2.0,13.0,302.0,189.0,0.0,231.0,3.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0
prediction-offline-sme,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
prediction-online,0.0,22.0,4.0,5.0,43.0,23.0,8.0,670.0,99.0,2.0,76.0,28.0,55.0,25.0,11.0,0.0,20.0
prediction-online-sme,1.0,27.0,10.0,0.0,71.0,2.0,0.0,679.0,234.0,39.0,149.0,76.0,109.0,80.0,6.0,0.0,39.0
prediction-request-rag,0.0,3.0,2.0,0.0,4.0,4.0,0.0,25.0,5.0,48.0,11.0,36.0,57.0,16.0,11.0,1.0,20.0
prediction-request-rag-claude,0.0,0.0,1.0,32.0,0.0,0.0,0.0,175.0,0.0,513.0,0.0,209.0,3.0,40.0,3.0,0.0,0.0
prediction-request-reasoning,0.0,3.0,103.0,1.0,58.0,97.0,0.0,315.0,176.0,441.0,317.0,339.0,159.0,44.0,58.0,0.0,97.0
prediction-request-reasoning-claude,0.0,0.0,0.0,3.0,4.0,0.0,0.0,27.0,0.0,38.0,4.0,76.0,0.0,8.0,1.0,0.0,2.0
