Spaces: Runtime error
# train.py
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig
)

# Set your model name (this will be created on Hugging Face)
model_name = "eemm-deberta-v3-small"

# These are the labels in your dataset
label_cols = [
    "Cog_present", "Aff_present", "Self_present", "Motivation_present",
    "Attention_present", "OB_present", "Context_present",
    "Social", "Physical", "Psych"
]

# Load your cleaned dataset
df = pd.read_csv("/tmp/eemm_cleaned.csv")
df = df[["clean_question"] + label_cols]
dataset = Dataset.from_pandas(df)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Tokenize the data and collect the label columns into a single float vector:
# with problem_type="multi_label_classification" the Trainer computes
# BCEWithLogitsLoss, which expects a float tensor under the "labels" key
def tokenize(example):
    tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = [float(example[label]) for label in label_cols]
    return tokens

dataset = dataset.map(tokenize)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Split into train/test
split = dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]

# Load model for multi-label classification
config = AutoConfig.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", config=config)

# Set training args
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,
    hub_model_id=f"Ozziejoe/{model_name}"  # This must match your Hugging Face username/repo
)

# Train and save locally
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)
trainer.train()
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

# Push to Hub
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)
print("Model and tokenizer pushed successfully!")