# NOTE: removed non-code residue ("Spaces: / Sleeping / Sleeping") scraped from
# the Hugging Face Spaces page header; it was not part of the program.
# --- Imports -----------------------------------------------------------------
# stdlib
import json
import os
import sys
import threading
import time
import traceback
import uuid
from datetime import datetime, timedelta

# third-party
import pandas as pd
import spacy
from flask import Flask, jsonify, request, send_file, send_from_directory
from flask_cors import CORS
from huggingface_hub import HfApi, hf_hub_download
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# local
from system.pledge_tracking import run_pipeline

# --- App / model setup -------------------------------------------------------
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used by lemmatize()

app = Flask(__name__, static_folder='.')
CORS(app)

HF_DATASET_REPO = "PledgeTracker/demo_feedback"
HF_TOKEN = os.environ.get("HF_TOKEN")  # may be None outside the deployed Space

TMP_DIR = "tmp"
FEEDBACK_DIR = "feedback_logs"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(FEEDBACK_DIR, exist_ok=True)

# Reference pledges backing the similarity suggestions, fetched once at startup.
REFERENCE_PLEDGES = []
REFERENCE_PLEDGE_PATH = hf_hub_download(
    repo_id=HF_DATASET_REPO,  # was a duplicated string literal; keep in sync
    filename="existing_pledges.txt",
    repo_type="dataset",
    # BUG FIX: os.environ["HF_TOKEN"] raised KeyError when the variable was
    # unset, contradicting the tolerant .get() above; reuse HF_TOKEN instead.
    token=HF_TOKEN,
)
if os.path.exists(REFERENCE_PLEDGE_PATH):
    with open(REFERENCE_PLEDGE_PATH, "r") as f:
        REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
else:
    print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")
def lemmatize(text):
    """Lemmatize *text* with spaCy, dropping punctuation and whitespace tokens."""
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)
def similar_pledges():
    """Return up to five reference pledges TF-IDF-similar to the posted claim.

    Reads a JSON body with a "claim" field and responds with
    {"suggestions": [{"text": ..., "index": ...}, ...]} (empty when the claim
    is blank, no references are loaded, or nothing scores above 0.3).
    """
    payload = request.get_json()
    claim = payload.get("claim", "").strip()
    if not claim or not REFERENCE_PLEDGES:
        return jsonify({"suggestions": []})
    # Vectorize the claim together with the references so all documents share
    # one TF-IDF vocabulary; row 0 is the claim itself.
    corpus = [lemmatize(text) for text in [claim] + REFERENCE_PLEDGES]
    matrix = TfidfVectorizer().fit_transform(corpus)
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
    # Keep only reasonably similar references, best first, capped at five.
    candidates = [(idx, score) for idx, score in enumerate(scores) if score > 0.3]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    suggestions = [
        {"text": REFERENCE_PLEDGES[idx], "index": int(idx)}
        for idx, _ in candidates[:5]
    ]
    return jsonify({"suggestions": suggestions})
def calculate_time_range(option: str, pledge_date=None):
    """Resolve an evidence-retrieval date window for a pledge.

    Args:
        option: One of "week", "month", or "since_pledge_date".
        pledge_date: The pledge's date as a "YYYY-MM-DD" string or a
            ``datetime``.  Required for "since_pledge_date"; optional for the
            rolling windows, where it only clips the window's start so we never
            search before the pledge was made.

    Returns:
        ``(start, end)`` as "YYYYMMDD" strings; ``end`` is always today.

    Raises:
        ValueError: For an unknown option, a malformed date string, or a
            missing pledge_date with option "since_pledge_date".
    """
    today = datetime.today()
    # Normalize: accept a pre-parsed datetime, parse a non-empty string, and
    # treat anything else (None, "", wrong type) as "no pledge date known".
    # BUG FIX: the previous version raised for the documented default of None
    # and crashed on the empty string run_model passes through.
    if isinstance(pledge_date, str):
        pledge_date = (
            datetime.strptime(pledge_date, "%Y-%m-%d") if pledge_date.strip() else None
        )
    elif not isinstance(pledge_date, datetime):
        pledge_date = None

    if option == "week":
        start = today - timedelta(days=7)
        if pledge_date is not None:
            start = max(start, pledge_date)  # never look back past the pledge
    elif option == "month":
        start = today - timedelta(days=30)
        if pledge_date is not None:
            start = max(start, pledge_date)
    elif option == "since_pledge_date":
        if pledge_date is None:
            raise ValueError("Pledge date is required for 'since_pledge_date' option")
        start = pledge_date
    else:
        raise ValueError("Invalid time range option")
    return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
def serve_html():
    """Serve the demo front-end page from the app's root directory."""
    page = 'test.html'
    return send_from_directory('.', page)
def check_status():
    """Return the pipeline progress log for a (timestamp, user_id) run.

    Query params: ``user_id`` and ``timestamp`` identify the run.  Responds
    with {"status": {step_id: message, ...}} — an empty mapping when the log
    does not exist yet or cannot be parsed (e.g. mid-write by run_model).
    """
    user_id = request.args.get("user_id", "")
    timestamp = request.args.get("timestamp", "")
    # These values come straight from the query string and feed a filesystem
    # path: strip any directory components to block path traversal.
    safe_name = os.path.basename(f"{timestamp}_{user_id}_status.log")
    log_file_path = os.path.join(TMP_DIR, safe_name)
    if not os.path.exists(log_file_path):
        return jsonify({"status": {}}), 200
    try:
        with open(log_file_path, "r") as f:
            status = json.load(f)
    except (OSError, json.JSONDecodeError):
        # The writer may be updating the file concurrently; report empty.
        status = {}
    return jsonify({"status": status})
def run_model():
    """Run the full pledge-tracking pipeline for a submitted claim.

    Expects a JSON body with: claim, time_range, suggestion_meta,
    pledge_date, pledge_author, and optional timestamp/user_id (both are
    generated here when absent).  Progress is written to a per-run status
    log in TMP_DIR, which the check_status endpoint polls.  On success the
    sorted events are saved to tmp/<timestamp>_<user_id>.json, a default
    (feedback-less) log entry plus the augmented TSV are uploaded to the HF
    dataset repo, and the response identifies the run's files.
    """
    data = request.get_json()
    claim = data.get("claim", "no input")
    time_range_option = data.get("time_range", "month")
    system_start_time = datetime.now()
    suggestion_meta = data.get("suggestion_meta")
    pledge_date = data.get("pledge_date", "")
    pledge_author = data.get("pledge_author", "")
    # Reuse client-provided identifiers when present so retries and status
    # polls address the same files; otherwise mint fresh ones.
    timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
    user_id = data.get("user_id") or str(uuid.uuid4())[:8]
    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
    # Serializes read-modify-write of the status log within this request.
    # NOTE(review): the lock is created per request, so it presumably only
    # guards update_status calls made during this run (each run also has its
    # own log file) — confirm run_pipeline invokes update_fn from threads.
    status_lock = threading.Lock()
    def update_status(step_id, msg):
        # Merge this step's message into the JSON status file (step_id -> msg),
        # preserving messages from earlier steps.
        print(f"[STATUS] Step {step_id}: {msg}")
        with status_lock:
            if os.path.exists(log_file_path):
                try:
                    with open(log_file_path, "r") as f:
                        current = json.load(f)
                except Exception:
                    # Unreadable/partial log: start over rather than fail.
                    current = {}
            else:
                current = {}
            current[str(step_id)] = f"{msg}"
            with open(log_file_path, "w") as f:
                json.dump(current, f, indent=2)
    try:
        time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
        print(f"[DEMO] Received claim: {claim}")
        print(f"[DEMO] Time range: {time_start} ~ {time_end}")
        print(f"[DEMO] Pledge date range: {pledge_date}")
        update_status(0, "📌 Starting the system ...")
        print(suggestion_meta)
        # run_pipeline reports progress through update_fn and returns a dict
        # of artifact paths (at least "sorted_events" and "augmented_tsv_file").
        outputs = run_pipeline(
            claim, pledge_date, pledge_author, time_start, timestamp, user_id,
            update_fn=update_status, suggestion_meta=suggestion_meta
        )
        # Convert the pipeline's Excel output to JSON for the front end.
        df = pd.read_excel(outputs["sorted_events"])
        json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
        df.to_json(json_path, orient="records", indent=2)
        system_end_time = datetime.now()
        runtime = system_end_time - system_start_time
        events = df.to_dict(orient="records")
        # Default log entry (no user feedback yet); receive_feedback later
        # rewrites this same file with feedback attached.
        log_entry = {
            "requested_time": timestamp,
            "user_id": user_id,
            "pledge": claim,
            "suggestion_meta": suggestion_meta,
            "time_start": time_start,
            "time_end": time_end,
            "runtime": runtime.total_seconds(),
            "pledge_author": pledge_author,
            "pledge_date": pledge_date,
            "events": events
        }
        default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(default_log_path, "w") as f:
            f.write(json.dumps(log_entry, indent=1))
        tsv_path = outputs["augmented_tsv_file"]
        try:
            # Best-effort mirror to the HF dataset; an upload failure is
            # logged but does not fail the request.
            api = HfApi()
            api.upload_file(
                path_or_fileobj=default_log_path,
                path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            api.upload_file(
                path_or_fileobj=tsv_path,
                path_in_repo=f"logs/augmented_{timestamp}_{user_id}.tsv",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        except Exception as e:
            traceback.print_exc()
            print(f"[Default Feedback Upload Error] {e}")
        return jsonify({
            "status": "success",
            "file": f"{timestamp}_{user_id}.json",
            "user_id": user_id,
            "timestamp": timestamp
        })
    except Exception as e:
        # Top-level boundary: surface any pipeline failure as a 500 with the
        # error detail for the demo UI.
        traceback.print_exc()
        return jsonify({"status": "error", "detail": str(e)}), 500
def get_events():
    """Serve a previously generated events JSON file from TMP_DIR.

    Query param ``file`` names the JSON file written by run_model.
    """
    filename = request.args.get("file")
    # BUG FIX: a missing "file" param previously reached os.path.join(dir,
    # None) and crashed with a TypeError (HTTP 500); reject it explicitly.
    if not filename:
        return jsonify({"error": "File not found"}), 404
    # Untrusted query value feeds a filesystem path: strip directory
    # components so e.g. "../secrets" cannot escape TMP_DIR.
    file_path = os.path.join(TMP_DIR, os.path.basename(filename))
    if not os.path.exists(file_path):
        return jsonify({"error": "File not found"}), 404
    with open(file_path, "r") as f:
        events = json.load(f)
    return jsonify(events)
def receive_feedback():
    """Merge user feedback into a run's events and persist + upload the log.

    Expects a JSON body with: pledge, feedback (list of
    {eventIndex, answer}), file (events JSON filename in TMP_DIR),
    timestamp, and user_id.  Rewrites feedback_<timestamp>_<user_id>.jsonl
    locally, then mirrors it to the HF dataset repo.
    """
    data = request.get_json()
    pledge = data.get("pledge", "no_pledge_text")
    feedback_list = data.get("feedback", [])
    filename = data.get("file")
    timestamp = data.get("timestamp")
    user_id = data.get("user_id")
    if not user_id or not timestamp:
        return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400
    # The filename comes from the client and feeds a filesystem path: reject
    # it when absent and strip directory components (path traversal).
    if not filename:
        return jsonify({"error": "Event file not found"}), 400
    file_path = os.path.join(TMP_DIR, os.path.basename(filename))
    if not os.path.exists(file_path):
        return jsonify({"error": "Event file not found"}), 400
    with open(file_path, "r") as f:
        events = json.load(f)
    # Metadata carried over from the log run_model wrote at pipeline time.
    # BUG FIX: pledge_author/pledge_date/runtime were previously assigned only
    # inside the try block, so a missing or unreadable previous log caused a
    # NameError when building log_entry below; default them all to None.
    suggestion_meta = None
    time_start = None
    time_end = None
    pledge_author = None
    pledge_date = None
    runtime = None
    try:
        prev_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(prev_log_path, "r") as f:
            previous_log = json.load(f)
        suggestion_meta = previous_log.get("suggestion_meta")
        time_start = previous_log.get("time_start")
        time_end = previous_log.get("time_end")
        pledge_author = previous_log.get("pledge_author")
        pledge_date = previous_log.get("pledge_date")
        runtime = previous_log.get("runtime")
    except (OSError, json.JSONDecodeError):
        # Best-effort: keep the None defaults when the earlier log is absent.
        pass
    # Map eventIndex -> answer, then annotate each event by its position.
    feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
    for idx, event in enumerate(events):
        event["user_feedback"] = feedback_dict.get(idx)
    log_entry = {
        "requested_time": timestamp,
        "user_id": user_id,
        "pledge": pledge,
        "suggestion_meta": suggestion_meta,
        "time_start": time_start,
        "time_end": time_end,
        "runtime": runtime,
        "pledge_author": pledge_author,
        "pledge_date": pledge_date,
        "events": events
    }
    local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
    with open(local_filename, "w") as f:
        f.write(json.dumps(log_entry, indent=1))
    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_filename,
            path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
    except Exception as e:
        # The feedback is saved locally even when the upload fails.
        return jsonify({'status': 'partial_success', 'error': str(e)}), 500
    return jsonify({'status': 'success'})
def download_feedback_file(filename):
    """Send a stored feedback log from FEEDBACK_DIR as an attachment."""
    response = send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)
    return response
def list_feedback_files():
    """Return all stored feedback log filenames, sorted alphabetically."""
    filenames = sorted(os.listdir(FEEDBACK_DIR))
    return jsonify(filenames)
def download_excel():
    """Convert a stored events JSON file to XLSX and send it as a download.

    Query param ``file`` names the events JSON file in TMP_DIR; the XLSX is
    written next to it with the extension swapped.
    """
    file = request.args.get("file")
    if not file:
        return "Missing file param", 400
    # The query-string value feeds filesystem paths (read and write): keep
    # only the base name to block path traversal out of TMP_DIR.
    file = os.path.basename(file)
    json_path = os.path.join(TMP_DIR, file)
    if not os.path.exists(json_path):
        return "Event file not found", 404
    with open(json_path, "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
    df.to_excel(xlsx_path, index=False)
    return send_file(xlsx_path, as_attachment=True)
if __name__ == '__main__':
    # Listen on all interfaces; port 7860 is presumably the port the hosting
    # HF Space expects — confirm against the Space configuration.
    app.run(host="0.0.0.0", port=7860)