# Pledge_Tracker / app.py
from flask import Flask, jsonify, send_file, request, send_from_directory
from flask_cors import CORS
import os, json, uuid, time
import pandas as pd
from datetime import datetime, timedelta
from huggingface_hub import HfApi, hf_hub_download
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from system.pledge_tracking import run_pipeline
import spacy
import traceback
import threading

nlp = spacy.load("en_core_web_sm")
app = Flask(__name__, static_folder='.')
CORS(app)
HF_DATASET_REPO = "PledgeTracker/demo_feedback"
HF_TOKEN = os.environ.get("HF_TOKEN")
TMP_DIR = "tmp"
FEEDBACK_DIR = "feedback_logs"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(FEEDBACK_DIR, exist_ok=True)

# Reference pledges used by /api/similar-pledges; downloaded once at startup.
REFERENCE_PLEDGES = []
try:
    REFERENCE_PLEDGE_PATH = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        filename="existing_pledges.txt",
        repo_type="dataset",
        token=HF_TOKEN,
    )
except Exception as e:
    REFERENCE_PLEDGE_PATH = None
    print(f"Could not download reference pledge file: {e}")

if REFERENCE_PLEDGE_PATH and os.path.exists(REFERENCE_PLEDGE_PATH):
    with open(REFERENCE_PLEDGE_PATH, "r") as f:
        REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
else:
    print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")

def lemmatize(text):
    # Lemmatise and drop punctuation/whitespace tokens so TF-IDF matches on word stems.
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
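
# Rough illustration of the helper above (exact lemmas depend on the spaCy model,
# so treat the output as indicative only):
#   lemmatize("recruited more nurses")  ->  "recruit more nurse"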
@app.route("/api/similar-pledges", methods=["POST"])
def similar_pledges():
data = request.get_json()
claim = data.get("claim", "").strip()
if not claim or not REFERENCE_PLEDGES:
return jsonify({"suggestions": []})
all_pledges = [claim] + REFERENCE_PLEDGES
lemmatized_pledges = [lemmatize(p) for p in all_pledges]
vectorizer = TfidfVectorizer().fit_transform(lemmatized_pledges)
similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
filtered = [(i, similarities[i]) for i in range(len(similarities)) if similarities[i] > 0.3]
top_filtered = sorted(filtered, key=lambda x: x[1], reverse=True)[:5]
suggestions = [
{"text": REFERENCE_PLEDGES[i], "index": int(i)}
for i, score in top_filtered
]
return jsonify({"suggestions": suggestions})

def calculate_time_range(option: str, pledge_date: str = None):
    """Return (start, end) as YYYYMMDD strings for the evidence-retrieval window."""
    today = datetime.today()
    if isinstance(pledge_date, str) and pledge_date.strip():
        pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
    elif not isinstance(pledge_date, datetime):
        raise ValueError("pledge_date must be a 'YYYY-MM-DD' string or a datetime")
    if option == "week":
        # Never start earlier than the pledge itself.
        start = max(today - timedelta(days=7), pledge_date)
    elif option == "month":
        start = max(today - timedelta(days=30), pledge_date)
    elif option == "since_pledge_date":
        start = pledge_date
    else:
        raise ValueError("Invalid time range option")
    return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
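
# Worked example of the windowing above (dates are illustrative; "today" is taken
# at call time, and both ends are returned as YYYYMMDD strings):
#   if today were 2025-01-15:
#     calculate_time_range("month", pledge_date="2024-12-01")             -> ("20241216", "20250115")
#     calculate_time_range("since_pledge_date", pledge_date="2024-12-01") -> ("20241201", "20250115")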
@app.route("/")
def serve_html():
return send_from_directory('.', 'test.html')
@app.route("/api/status")
def check_status():
user_id = request.args.get("user_id")
timestamp = request.args.get("timestamp")
log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
if not os.path.exists(log_file_path):
return jsonify({"status": {}}), 200
try:
with open(log_file_path, "r") as f:
status = json.load(f)
except Exception:
status = {}
return jsonify({"status": status})
@app.route("/api/run-model", methods=["POST"])
def run_model():
data = request.get_json()
claim = data.get("claim", "no input")
time_range_option = data.get("time_range", "month")
system_start_time = datetime.now()
suggestion_meta = data.get("suggestion_meta")
pledge_date = data.get("pledge_date", "")
pledge_author = data.get("pledge_author", "")
timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
user_id = data.get("user_id") or str(uuid.uuid4())[:8]
log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
status_lock = threading.Lock()
def update_status(step_id, msg):
print(f"[STATUS] Step {step_id}: {msg}")
with status_lock:
if os.path.exists(log_file_path):
try:
with open(log_file_path, "r") as f:
current = json.load(f)
except Exception:
current = {}
else:
current = {}
current[str(step_id)] = f"{msg}"
with open(log_file_path, "w") as f:
json.dump(current, f, indent=2)

    try:
        time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
        print(f"[DEMO] Received claim: {claim}")
        print(f"[DEMO] Time range: {time_start} ~ {time_end}")
        print(f"[DEMO] Pledge date: {pledge_date}")
        update_status(0, "📌 Starting the system ...")
        print(f"[DEMO] Suggestion meta: {suggestion_meta}")
        outputs = run_pipeline(
            claim, pledge_date, pledge_author, time_start, timestamp, user_id,
            update_fn=update_status, suggestion_meta=suggestion_meta
        )
        # Convert the pipeline's sorted events to JSON for the front end.
        df = pd.read_excel(outputs["sorted_events"])
        json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
        df.to_json(json_path, orient="records", indent=2)
        system_end_time = datetime.now()
        runtime = system_end_time - system_start_time
        events = df.to_dict(orient="records")
        log_entry = {
            "requested_time": timestamp,
            "user_id": user_id,
            "pledge": claim,
            "suggestion_meta": suggestion_meta,
            "time_start": time_start,
            "time_end": time_end,
            "runtime": runtime.total_seconds(),
            "pledge_author": pledge_author,
            "pledge_date": pledge_date,
            "events": events
        }
        default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(default_log_path, "w") as f:
            f.write(json.dumps(log_entry, indent=1))
        tsv_path = outputs["augmented_tsv_file"]
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=default_log_path,
                path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            api.upload_file(
                path_or_fileobj=tsv_path,
                path_in_repo=f"logs/augmented_{timestamp}_{user_id}.tsv",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        except Exception as e:
            traceback.print_exc()
            print(f"[Default Feedback Upload Error] {e}")
        return jsonify({
            "status": "success",
            "file": f"{timestamp}_{user_id}.json",
            "user_id": user_id,
            "timestamp": timestamp
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({"status": "error", "detail": str(e)}), 500
@app.route("/api/events")
def get_events():
filename = request.args.get("file")
file_path = os.path.join(TMP_DIR, filename)
if not os.path.exists(file_path):
return jsonify({"error": "File not found"}), 404
with open(file_path, "r") as f:
events = json.load(f)
return jsonify(events)
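
# Hypothetical follow-up to a successful /api/run-model call, using the returned
# file name:
#   GET /api/events?file=2025-01-15_10-30-00_ab12cd34.json
#   -> the pipeline's sorted event records as a JSON array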
@app.route("/api/feedback", methods=["POST"])
def receive_feedback():
data = request.get_json()
pledge = data.get("pledge", "no_pledge_text")
feedback_list = data.get("feedback", [])
filename = data.get("file")
file_path = os.path.join(TMP_DIR, filename)
timestamp = data.get("timestamp")
user_id = data.get("user_id")
if not user_id or not timestamp:
return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400
if not os.path.exists(file_path):
return jsonify({"error": "Event file not found"}), 400
with open(file_path, "r") as f:
events = json.load(f)
suggestion_meta = None
time_start = None
time_end = None
try:
prev_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
with open(prev_log_path, "r") as f:
previous_log = json.load(f)
suggestion_meta = previous_log.get("suggestion_meta")
time_start = previous_log.get("time_start")
time_end = previous_log.get("time_end")
pledge_author = previous_log.get("pledge_author")
pledge_date = previous_log.get("pledge_date")
runtime = previous_log.get("runtime")
except Exception:
pass
    feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
    for idx, event in enumerate(events):
        event["user_feedback"] = feedback_dict.get(idx)
    log_entry = {
        "requested_time": timestamp,
        "user_id": user_id,
        "pledge": pledge,
        "suggestion_meta": suggestion_meta,
        "time_start": time_start,
        "time_end": time_end,
        "runtime": runtime,
        "pledge_author": pledge_author,
        "pledge_date": pledge_date,
        "events": events
    }
    local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
    with open(local_filename, "w") as f:
        f.write(json.dumps(log_entry, indent=1))
    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_filename,
            path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
    except Exception as e:
        return jsonify({'status': 'partial_success', 'error': str(e)}), 500
    return jsonify({'status': 'success'})
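
# Shape of the payload the /api/feedback endpoint above expects (values are
# hypothetical; "file", "user_id" and "timestamp" echo what /api/run-model returned):
#
#   {
#     "pledge": "We will build 40 new hospitals",
#     "file": "2025-01-15_10-30-00_ab12cd34.json",
#     "user_id": "ab12cd34",
#     "timestamp": "2025-01-15_10-30-00",
#     "feedback": [{"eventIndex": 0, "answer": "..."}]
#   }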
@app.route("/download-feedback/<filename>")
def download_feedback_file(filename):
return send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)
@app.route("/feedback-files")
def list_feedback_files():
files = os.listdir(FEEDBACK_DIR)
return jsonify(sorted(files))
@app.route("/download")
def download_excel():
file = request.args.get("file")
if not file:
return "Missing file param", 400
json_path = os.path.join(TMP_DIR, file)
if not os.path.exists(json_path):
return "Event file not found", 404
with open(json_path, "r") as f:
data = json.load(f)
df = pd.DataFrame(data)
xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
df.to_excel(xlsx_path, index=False)
return send_file(xlsx_path, as_attachment=True)
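
# The two download routes above serve files over plain GET, e.g. (hypothetical names):
#   GET /download?file=2025-01-15_10-30-00_ab12cd34.json
#   GET /download-feedback/feedback_2025-01-15_10-30-00_ab12cd34.jsonl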

if __name__ == '__main__':
    # Port 7860 is the default port Hugging Face Spaces expects the app to listen on.
    app.run(host="0.0.0.0", port=7860)