# Spaces: Runtime error
# (Status text captured from the hosting page together with this script; it is not part of the program.)
# preprocess.py | |
import os | |
import pandas as pd | |
import re | |
import requests | |
from io import StringIO | |
# Hugging Face access token, read from the HF_TOKEN environment variable.
# os.getenv returns None when the variable is not set.
token = os.getenv("HF_TOKEN")

# Correct dataset URL (update this if the filename changes)
url = "https://huggingface.co/datasets/Ozziejoe/concise_data/resolve/main/concise_data.csv"

# Attach the Authorization header only when a token is actually available;
# otherwise the request would carry the literal value "Bearer None".
headers = {"Authorization": f"Bearer {token}"} if token else {}

# Download the dataset. The (connect, read) timeout in seconds keeps the
# script from blocking forever if the server never answers.
response = requests.get(url, headers=headers, timeout=(10, 60))
# Raise on 4xx/5xx instead of trying to parse an error body as CSV.
response.raise_for_status()

# Parse the downloaded CSV text into a DataFrame.
df = pd.read_csv(StringIO(response.text))
# Clean text function | |
def clean_text(text: object) -> str:
    """Return a lowercase, punctuation-free, whitespace-normalized string.

    The value is converted with ``str()``, lowercased, stripped of every
    character that is neither a word character (letters, digits,
    underscore) nor whitespace, and then has each run of whitespace
    collapsed to a single space with leading/trailing whitespace removed.

    Missing values (``None`` or a float NaN) yield an empty string rather
    than the literal words ``"none"`` / ``"nan"``.

    Args:
        text: The raw value to clean; any object accepted by ``str()``.

    Returns:
        The cleaned text, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself; this detects it
    # without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Punctuation is deleted outright (not replaced by a space), so "a-b"
    # becomes "ab".
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# Build the cleaned-question column by running clean_text over each raw question.
raw_questions = df["merged_Question.x"]
df["clean_question"] = raw_questions.apply(clean_text)

# Write the result to a temporary file for app.py to use.
cleaned_csv_path = "/tmp/eemm_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print(f"β Saved cleaned data to {cleaned_csv_path}")