# preprocess.py
"""Download the concise_data CSV from Hugging Face and save a cleaned copy.

Run as a script (``python preprocess.py``). The script:

1. reads the Hugging Face access token from the ``HF_TOKEN`` environment
   variable,
2. downloads the dataset CSV,
3. adds a ``clean_question`` column derived from ``merged_Question.x``,
4. writes the result to ``/tmp/eemm_cleaned.csv`` for app.py to use.

Importing this module only defines the helpers below; the download and the
file write happen in ``main()``.
"""

import os
import re
from io import StringIO
from typing import Optional

import pandas as pd

# Correct dataset URL (update this if the filename changes)
DATASET_URL = "https://huggingface.co/datasets/Ozziejoe/concise_data/resolve/main/concise_data.csv"

# Column holding the raw question text, and the column the cleaned text goes to.
SOURCE_COLUMN = "merged_Question.x"
CLEAN_COLUMN = "clean_question"

# Temporary file that app.py reads.
OUTPUT_PATH = "/tmp/eemm_cleaned.csv"

# Upper bound (seconds) for the HTTP request so a stalled connection cannot
# hang the preprocessing step indefinitely.
REQUEST_TIMEOUT_SECONDS = 60

# Compiled once; clean_text() is applied to every row.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: object) -> str:
    """Return a normalized, lower-cased version of *text*.

    The value is converted to ``str``, lower-cased, stripped of every
    character that is neither a word character nor whitespace, and has runs
    of whitespace collapsed to a single space. Leading and trailing
    whitespace is removed.

    Missing values (``None``, NaN, ``pd.NA``) yield an empty string instead
    of the literal words "none" / "nan".

    >>> clean_text("Hello,   World!")
    'hello world'
    """
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    lowered = str(text).lower()
    without_punctuation = _PUNCTUATION_RE.sub("", lowered)
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()


def download_dataset(url: str = DATASET_URL, token: Optional[str] = None) -> pd.DataFrame:
    """Download the CSV at *url* and return it as a DataFrame.

    Args:
        url: Location of the CSV file.
        token: Hugging Face access token. When empty or ``None`` no
            ``Authorization`` header is sent, rather than sending the
            invalid value ``"Bearer None"``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: The request failed or timed out.
    """
    # Imported here so clean_text() can be imported without the HTTP client.
    import requests

    headers = {"Authorization": f"Bearer {token}"} if token else {}
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text))


def main() -> None:
    """Download the dataset, add the cleaned column and save the CSV.

    Raises:
        KeyError: The downloaded data has no ``merged_Question.x`` column.
    """
    # Hugging Face token from secret
    token = os.getenv("HF_TOKEN")
    df = download_dataset(DATASET_URL, token)

    if SOURCE_COLUMN not in df.columns:
        raise KeyError(
            f"Column {SOURCE_COLUMN!r} not found in downloaded data; "
            f"available columns: {list(df.columns)}"
        )

    # Apply text cleaning
    df[CLEAN_COLUMN] = df[SOURCE_COLUMN].apply(clean_text)

    # Save to temporary file for app.py to use
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Saved cleaned data to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()