Spaces:
Runtime error
Runtime error
File size: 942 Bytes
6ed54c3 bdfd315 6ed54c3 8ee409a 0cce6af b2e2d0a bdfd315 6ed54c3 2f99b21 6ed54c3 8ee409a b2e2d0a 971461e 6ed54c3 8ee409a 6ed54c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# preprocess.py
import os
import pandas as pd
import re
import requests
from io import StringIO
# Hugging Face access token, read from the HF_TOKEN environment variable
# (None when the variable is not set).
token = os.getenv("HF_TOKEN")
# Correct dataset URL (update this if the filename changes)
url = "https://huggingface.co/datasets/Ozziejoe/concise_data/resolve/main/concise_data.csv"
# Only attach an Authorization header when a token is actually configured;
# otherwise the request would carry the literal credential "Bearer None".
headers = {"Authorization": f"Bearer {token}"} if token else {}
# requests applies no timeout by default, so bound the call to avoid hanging
# indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=60)
# Fail fast on an HTTP error status instead of parsing an error body as CSV.
response.raise_for_status()
# Parse the downloaded CSV text into a DataFrame.
df = pd.read_csv(StringIO(response.text))
# Clean text function
def clean_text(text):
    """Normalise a value into lowercase, punctuation-free, single-spaced text.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN (e.g. an empty CSV cell loaded by pandas)
            are treated as missing and yield an empty string.

    Returns:
        The lowercased text with every character that is neither a word
        character (letters, digits, underscore) nor whitespace removed,
        runs of whitespace collapsed to one space, and surrounding
        whitespace stripped.
    """
    # Missing values would otherwise be stringified into the words
    # "none" / "nan" and end up in the cleaned column. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Drop punctuation/symbols; word characters and whitespace are kept.
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse tabs, newlines and repeated spaces into a single space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# Build the normalised question column from the raw merged question text.
raw_questions = df["merged_Question.x"]
df["clean_question"] = raw_questions.apply(clean_text)

# Write the result (without the index column) to the temporary file that
# app.py is meant to read.
df.to_csv("/tmp/eemm_cleaned.csv", index=False)
print("✅ Saved cleaned data to /tmp/eemm_cleaned.csv")
|