yulongchen committed on
Commit 259448b · 1 Parent(s): b0280c9

Add system

Files changed (5)
  1. .DS_Store +0 -0
  2. app.py +8 -6
  3. system/pledge_tracking.py +8 -8
  4. system/process_time.py +3 -3
  5. test.html +3 -1
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -71,7 +71,12 @@ def similar_pledges():
 
 def calculate_time_range(option: str, pledge_date: str = None):
     today = datetime.today()
-    pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
+    # pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
+
+    if isinstance(pledge_date, str):
+        pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
+    elif not isinstance(pledge_date, datetime):
+        raise ValueError("pledge_date must be a str or datetime")
 
     if option == "week":
         one_week_ago = today - timedelta(days=7)
@@ -79,16 +84,13 @@ def calculate_time_range(option: str, pledge_date: str = None):
     elif option == "month":
         one_month_ago = today - timedelta(days=30)
         start = max(one_month_ago, pledge_date)
-    elif option == "year":
-        one_year_ago = today - timedelta(days=365)
-        start = max(one_year_ago, pledge_date)
     elif option == "since_pledge_date":
         if not pledge_date:
             raise ValueError("Pledge date is required for 'since_pledge_date' option")
         start = datetime.strptime(pledge_date, "%Y-%m-%d")
     else:
         raise ValueError("Invalid time range option")
-    print(start)
+    print(start, one_week_ago, one_month_ago, pledge_date)
     return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
 
 @app.route("/")
@@ -146,7 +148,7 @@ def run_model():
     time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
     print(f"[DEMO] Received claim: {claim}")
    print(f"[DEMO] Time range: {time_start} ~ {time_end}")
-    print(f"[DEMO] Time range: {pledge_date}")
+    print(f"[DEMO] Pledge date range: {pledge_date}")
 
     # user_id = str(uuid.uuid4())[:8]
     # outputs = run_pipeline(claim, pledge_date, pledge_author, time_start, timestamp, user_id)
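The new type check fixes the crash when pledge_date already arrives as a datetime, but two issues remain in the committed version: the since_pledge_date branch still calls strptime on the now-normalized value (a TypeError when it is a datetime), and the new debug print references one_week_ago and one_month_ago, which are unbound unless their branch ran. A minimal sketch of the function with the normalization reused throughout (an editorial suggestion, not code from this commit; it assumes the week branch mirrors the month branch's max(...)):

    from datetime import datetime, timedelta

    def calculate_time_range(option: str, pledge_date=None):
        """Return (start, end) as YYYYMMDD strings."""
        today = datetime.today()

        # Normalize once, as the commit does, then reuse everywhere.
        if isinstance(pledge_date, str):
            pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
        elif not isinstance(pledge_date, datetime):
            raise ValueError("pledge_date must be a str or datetime")

        if option == "week":
            start = max(today - timedelta(days=7), pledge_date)
        elif option == "month":
            start = max(today - timedelta(days=30), pledge_date)
        elif option == "since_pledge_date":
            start = pledge_date  # already a datetime; no second strptime needed
        else:
            raise ValueError("Invalid time range option")

        return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")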
system/pledge_tracking.py CHANGED
@@ -57,7 +57,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(initial_tsv_file, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f)
     if update_fn:
-        update_fn(step_id, f"We have found {line_count} URLs")
+        update_fn(step_id, f"{line_count} URLs are retrieved")
     step_id += 1
 
 
@@ -70,7 +70,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
     if update_fn:
-        update_fn(step_id, f"We have scraped {line_count} URLs")
+        update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
     step_id += 1
 
 
@@ -85,7 +85,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         questions = {line["question"] for line in json.load(f)["evidence"]}
         line_count = len(questions)
     if update_fn:
-        update_fn(step_id, f"We have generated {line_count} search queries")
+        update_fn(step_id, f"{line_count} relevant queries are generated")
     step_id += 1
 
     else:
@@ -115,7 +115,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(augmented_tsv_file, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f)
     if update_fn:
-        update_fn(step_id, f"We have found {line_count} URLs")
+        update_fn(step_id, f"{line_count} URLs are retrieved")
     step_id += 1
 
     augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
@@ -126,7 +126,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
     if update_fn:
-        update_fn(step_id, f"We have scraped {line_count} URLs")
+        update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
     step_id += 1
 
 
@@ -150,7 +150,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         if "url" in doc:
             unique_urls.add(doc["url"])
     if update_fn:
-        update_fn(step_id, f"We have found {len(unique_urls)} most relevant documents")
+        update_fn(step_id, f"{len(unique_urls)} documents are selected")
     step_id += 1
 
     extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
@@ -158,7 +158,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     events_num = count_total_events(extracted_event_path)
 
     if update_fn:
-        update_fn(step_id, f"We have extracted {events_num} events from the documents.")
+        update_fn(step_id, f"{events_num} events are extracted from those documents.")
     step_id += 1
 
 
@@ -173,7 +173,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         claim=claim,
         suggestion_meta=suggestion_meta
     )
-    print(sorted_events)
+    # print(sorted_events)
     df = pd.DataFrame(sorted_events)
     sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
     df.to_excel(sorted_event_path, index=False)
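Eight call sites repeat the same pattern: guard on update_fn, emit a message, bump step_id. A small helper would keep the rewritten messages in one shape and make the counter harder to forget (a hypothetical refactor, not part of this commit; report is an invented name):

    def report(update_fn, step_id, message):
        """Send a progress message if a callback is attached; return the next step id."""
        if update_fn:
            update_fn(step_id, message)
        return step_id + 1

    # e.g. at the first call site:
    # step_id = report(update_fn, step_id, f"{line_count} URLs are retrieved")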
system/process_time.py CHANGED
@@ -169,7 +169,7 @@ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggest
     file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
     gpt4_results_json = load_json(file_path)
 
-    print(gpt4_results_json)
+    # print(gpt4_results_json)
     train_file_path = hf_hub_download(
         repo_id="PledgeTracker/demo_feedback",
         filename="train_useful.json",
@@ -179,7 +179,7 @@ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggest
 
     with open(train_file_path, "r", encoding="utf-8") as f:
         train_data = json.load(f)
-    print(train_data[0])
+    # print(train_data[0])
 
 
 
@@ -225,7 +225,7 @@ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggest
 
     test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful?"
 
-    print(test_instance)
+    # print(test_instance)
 
     label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)
 
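Commenting the prints out silences them but leaves dead code behind. The standard logging module keeps the same traces behind a runtime switch instead (a sketch of the alternative, not code from this commit):

    import logging

    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)  # raise to logging.DEBUG to re-enable traces

    # e.g. in extract_and_sort_events, instead of `# print(test_instance)`:
    logger.debug("test instance: %s", "Pledge: ... Is this event summary useful?")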
test.html CHANGED
@@ -317,7 +317,9 @@
     const data = await eventsRes.json();
     if (!Array.isArray(data)) throw new Error("Unexpected data format");
 
-    p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
+    // p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
+    //   data.map((e, index) => `
+    p.innerHTML =
       data.map((e, index) => `
         <div class="mb-6 border-b pb-4">
           🗓️ <b>${e.date}</b>: ${e.event}<br>