File size: 942 Bytes
6ed54c3
 
bdfd315
 
6ed54c3
8ee409a
 
0cce6af
b2e2d0a
bdfd315
 
6ed54c3
2f99b21
 
6ed54c3
8ee409a
 
b2e2d0a
971461e
6ed54c3
8ee409a
 
6ed54c3
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# preprocess.py

import os
import pandas as pd
import re
import requests
from io import StringIO

# Hugging Face access token, read from the HF_TOKEN environment variable
# (None when the variable is not set).
token = os.getenv("HF_TOKEN")

# Location of the raw CSV on the Hugging Face Hub
# (update this if the filename changes)
url = "https://huggingface.co/datasets/Ozziejoe/concise_data/resolve/main/concise_data.csv"

# Attach the bearer token only when one is configured. Without this guard a
# missing HF_TOKEN would be sent as the literal header value "Bearer None".
headers = {"Authorization": f"Bearer {token}"} if token else {}

# Download the dataset. requests applies no timeout by default, so set one
# explicitly to avoid hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Abort with requests.HTTPError on any 4xx/5xx reply instead of parsing an
# error page as CSV.
response.raise_for_status()

# Parse the downloaded CSV text into a DataFrame
df = pd.read_csv(StringIO(response.text))

# Clean text function
# Compiled once at import time; clean_text is applied to every row.
_NON_WORD_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_text(text):
    """Return a normalised, lower-case version of *text*.

    The value is converted to ``str``, lower-cased, stripped of every
    character that is neither a word character (letters, digits,
    underscore) nor whitespace, and has each run of whitespace collapsed to
    a single space. Leading and trailing whitespace is removed.

    Missing values (``None`` or a float NaN) yield an empty string rather
    than the literal words ``"none"`` / ``"nan"``.

    Args:
        text: The value to clean; non-string values are converted with
            ``str()``.

    Returns:
        The cleaned string, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    without_punctuation = _NON_WORD_RE.sub("", lowered)
    collapsed = _WHITESPACE_RUN_RE.sub(" ", without_punctuation)
    return collapsed.strip()

# Build the cleaned question column by running clean_text over every value
# of the raw question column; the raw column itself is left unchanged.
df["clean_question"] = df["merged_Question.x"].map(clean_text)

# Write the result (without the index column) to a temporary file for
# app.py to use, then report where it went.
df.to_csv("/tmp/eemm_cleaned.csv", index=False)
print("✅ Saved cleaned data to /tmp/eemm_cleaned.csv")