Spaces:

Ozziejoe
/

eemmExemplarClassfier

Runtime error

eemmExemplarClassfier / preprocess.py

Update preprocess.py

6ed54c3 verified 3 months ago

942 Bytes

	# preprocess.py

	import os
	import pandas as pd
	import re
	import requests
	from io import StringIO

	# Hugging Face access token, read from the HF_TOKEN environment variable
	# (None when the variable is not set).
	token = os.getenv("HF_TOKEN")

	# Location of the source CSV (update this if the filename changes)
	url = "https://huggingface.co/datasets/Ozziejoe/concise_data/resolve/main/concise_data.csv"

	# Only send an Authorization header when a token is configured; otherwise
	# the header value would be the literal string "Bearer None".
	headers = {"Authorization": f"Bearer {token}"} if token else {}

	# Bound the request with (connect, read) timeouts in seconds so a stalled
	# connection cannot block the script forever.
	response = requests.get(url, headers=headers, timeout=(10, 60))
	# Raise on 4xx/5xx responses instead of parsing an error body as CSV.
	response.raise_for_status()

	# Parse the downloaded CSV text into a DataFrame
	df = pd.read_csv(StringIO(response.text))

	# Clean text function
	def clean_text(text):
	    """Return a normalized, lowercase version of *text*.

	    The value is converted with ``str()``, lowercased, stripped of every
	    character that is neither a word character (letters, digits,
	    underscore) nor whitespace, and has each run of whitespace collapsed
	    to a single space. Leading and trailing whitespace is removed.

	    Missing values (``None`` or a float NaN, which is how pandas
	    represents an empty cell) yield an empty string rather than the
	    words "none" / "nan".
	    """
	    # NaN is the only float that is not equal to itself.
	    if text is None or (isinstance(text, float) and text != text):
	        return ""
	    text = str(text).lower()
	    # Drop punctuation and symbols; \w keeps letters, digits and "_".
	    text = re.sub(r"[^\w\s]", "", text)
	    # Collapse tabs, newlines and repeated spaces into single spaces.
	    text = re.sub(r"\s+", " ", text)
	    return text.strip()

	# Run clean_text over every value of the "merged_Question.x" column and
	# store the results in the "clean_question" column
	question_column = df["merged_Question.x"]
	df["clean_question"] = question_column.apply(clean_text)

	# Write the table (without the index) to a temporary file for app.py to use
	df.to_csv("/tmp/eemm_cleaned.csv", index=False)
	print("✅ Saved cleaned data to /tmp/eemm_cleaned.csv")