yulongchen committed on
Commit 259448b · 1 Parent(s): b0280c9

Add system

Files changed (5)
  1. .DS_Store +0 -0
  2. app.py +8 -6
  3. system/pledge_tracking.py +8 -8
  4. system/process_time.py +3 -3
  5. test.html +3 -1
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -71,7 +71,12 @@ def similar_pledges():
 
 def calculate_time_range(option: str, pledge_date: str = None):
     today = datetime.today()
-    pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
+    # pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
+
+    if isinstance(pledge_date, str):
+        pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
+    elif not isinstance(pledge_date, datetime):
+        raise ValueError("pledge_date must be a str or datetime")
 
     if option == "week":
         one_week_ago = today - timedelta(days=7)
@@ -79,16 +84,13 @@ def calculate_time_range(option: str, pledge_date: str = None):
     elif option == "month":
         one_month_ago = today - timedelta(days=30)
         start = max(one_month_ago, pledge_date)
-    elif option == "year":
-        one_year_ago = today - timedelta(days=365)
-        start = max(one_year_ago, pledge_date)
     elif option == "since_pledge_date":
         if not pledge_date:
             raise ValueError("Pledge date is required for 'since_pledge_date' option")
         start = datetime.strptime(pledge_date, "%Y-%m-%d")
     else:
         raise ValueError("Invalid time range option")
-    print(start)
+    print(start, one_week_ago, one_month_ago, pledge_date)
     return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
 
 @app.route("/")
@@ -146,7 +148,7 @@ def run_model():
     time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
     print(f"[DEMO] Received claim: {claim}")
    print(f"[DEMO] Time range: {time_start} ~ {time_end}")
-    print(f"[DEMO] Time range: {pledge_date}")
+    print(f"[DEMO] Pledge date range: {pledge_date}")
 
     # user_id = str(uuid.uuid4())[:8]
     # outputs = run_pipeline(claim, pledge_date, pledge_author, time_start, timestamp, user_id)
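The new type check fixes the crash when pledge_date already arrives as a datetime, but two issues remain in the committed version: the since_pledge_date branch still calls strptime on the now-normalized value (a TypeError when it is a datetime), and the new debug print references one_week_ago and one_month_ago, which are unbound unless their branch ran. A minimal sketch of the function with the normalization reused throughout (an editorial suggestion, not code from this commit; it assumes the week branch mirrors the month branch's max(...)):

    from datetime import datetime, timedelta

    def calculate_time_range(option: str, pledge_date=None):
        """Return (start, end) as YYYYMMDD strings."""
        today = datetime.today()

        # Normalize once, as the commit does, then reuse everywhere.
        if isinstance(pledge_date, str):
            pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
        elif not isinstance(pledge_date, datetime):
            raise ValueError("pledge_date must be a str or datetime")

        if option == "week":
            start = max(today - timedelta(days=7), pledge_date)
        elif option == "month":
            start = max(today - timedelta(days=30), pledge_date)
        elif option == "since_pledge_date":
            start = pledge_date  # already a datetime; no second strptime needed
        else:
            raise ValueError("Invalid time range option")

        return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")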
system/pledge_tracking.py CHANGED
@@ -57,7 +57,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(initial_tsv_file, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f)
     if update_fn:
-        update_fn(step_id, f"We have found {line_count} URLs")
+        update_fn(step_id, f"{line_count} URLs are retrieved")
     step_id += 1
 
 
@@ -70,7 +70,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
     if update_fn:
-        update_fn(step_id, f"We have scraped {line_count} URLs")
+        update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
     step_id += 1
 
 
@@ -85,7 +85,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         questions = {line["question"] for line in json.load(f)["evidence"]}
         line_count = len(questions)
     if update_fn:
-        update_fn(step_id, f"We have generated {line_count} search queries")
+        update_fn(step_id, f"{line_count} relevant queries are generated")
     step_id += 1
 
     else:
@@ -115,7 +115,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(augmented_tsv_file, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f)
     if update_fn:
-        update_fn(step_id, f"We have found {line_count} URLs")
+        update_fn(step_id, f"{line_count} URLs are retrieved")
     step_id += 1
 
     augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
@@ -126,7 +126,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
         line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
     if update_fn:
-        update_fn(step_id, f"We have scraped {line_count} URLs")
+        update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
     step_id += 1
 
 
@@ -150,7 +150,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         if "url" in doc:
             unique_urls.add(doc["url"])
     if update_fn:
-        update_fn(step_id, f"We have found {len(unique_urls)} most relevant documents")
+        update_fn(step_id, f"{len(unique_urls)} documents are selected")
     step_id += 1
 
     extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
@@ -158,7 +158,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
     events_num = count_total_events(extracted_event_path)
 
     if update_fn:
-        update_fn(step_id, f"We have extracted {events_num} events from the documents.")
+        update_fn(step_id, f"{events_num} events are extracted from those documents.")
     step_id += 1
 
 
@@ -173,7 +173,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         claim=claim,
         suggestion_meta=suggestion_meta
     )
-    print(sorted_events)
+    # print(sorted_events)
     df = pd.DataFrame(sorted_events)
     sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
     df.to_excel(sorted_event_path, index=False)
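Eight call sites repeat the same pattern: guard on update_fn, emit a message, bump step_id. A small helper would keep the rewritten messages in one shape and make the counter harder to forget (a hypothetical refactor, not part of this commit; report is an invented name):

    def report(update_fn, step_id, message):
        """Send a progress message if a callback is attached; return the next step id."""
        if update_fn:
            update_fn(step_id, message)
        return step_id + 1

    # e.g. at the first call site:
    # step_id = report(update_fn, step_id, f"{line_count} URLs are retrieved")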
system/process_time.py CHANGED
@@ -169,7 +169,7 @@ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggest
     file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
     gpt4_results_json = load_json(file_path)
 
-    print(gpt4_results_json)
+    # print(gpt4_results_json)
     train_file_path = hf_hub_download(
         repo_id="PledgeTracker/demo_feedback",
         filename="train_useful.json",
@@ -179,7 +179,7 @@ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggest
 
     with open(train_file_path, "r", encoding="utf-8") as f:
         train_data = json.load(f)
-    print(train_data[0])
+    # print(train_data[0])
 
 
 
@@ -225,7 +225,7 @@ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggest
 
     test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful?"
 
-    print(test_instance)
+    # print(test_instance)
 
     label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)
 
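Commenting the prints out silences them but leaves dead code behind. The standard logging module keeps the same traces behind a runtime switch instead (a sketch of the alternative, not code from this commit):

    import logging

    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)  # raise to logging.DEBUG to re-enable traces

    # e.g. in extract_and_sort_events, instead of `# print(test_instance)`:
    logger.debug("test instance: %s", "Pledge: ... Is this event summary useful?")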
test.html CHANGED
@@ -317,7 +317,9 @@
     const data = await eventsRes.json();
     if (!Array.isArray(data)) throw new Error("Unexpected data format");
 
-    p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
+    // p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
+    //   data.map((e, index) => `
+    p.innerHTML =
       data.map((e, index) => `
         <div class="mb-6 border-b pb-4">
           🗓️ <b>${e.date}</b>: ${e.event}<br>