# Ozziejoe - "Update train.py" (commit 45a876e, verified)
# train.py
import pandas as pd
from datasets import Dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
AutoConfig
)
# Name of the model repository (this will be created on Hugging Face).
model_name = "eemm-deberta-v3-small"

# Names of the label columns in the dataset.
label_cols = [
    "Cog_present",
    "Aff_present",
    "Self_present",
    "Motivation_present",
    "Attention_present",
    "OB_present",
    "Context_present",
    "Social",
    "Physical",
    "Psych",
]
# Read the cleaned CSV and keep only the question text plus the label columns.
df = pd.read_csv("/tmp/eemm_cleaned.csv")
df = df[["clean_question", *label_cols]]
dataset = Dataset.from_pandas(df)

# Tokenizer belonging to the DeBERTa-v3-small base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
# Tokenize the data and build the multi-hot "labels" vector used for the loss.
def tokenize(example):
    """Tokenize one dataset row and attach its label vector.

    Args:
        example: A single row with a "clean_question" text field and one
            field per entry of ``label_cols``.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) extended with:
        - one key per label column, copied through unchanged, and
        - "labels": the label values as a list of floats, ordered like
          ``label_cols``.
    """
    tokens = tokenizer(
        example["clean_question"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    for label in label_cols:
        tokens[label] = example[label]
    # The model only receives a field literally named "labels"; the separate
    # per-label columns are not arguments of its forward() and would be
    # dropped by the Trainer, leaving no loss to optimize. The multi-label
    # loss works on floats, hence the cast.
    # NOTE(review): assumes every label column holds numeric 0/1 values - confirm.
    tokens["labels"] = [float(example[label]) for label in label_cols]
    return tokens


dataset = dataset.map(tokenize)
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"] + label_cols,
)
# Hold out 20% of the rows for evaluation; the rest is used for training.
split = dataset.train_test_split(test_size=0.2)
train_dataset, eval_dataset = split["train"], split["test"]
# Load the base checkpoint with a multi-label classification head.
# The index <-> name mappings are stored in the config so that the saved and
# published model reports the real label names instead of LABEL_0 ... LABEL_9.
id2label = dict(enumerate(label_cols))
label2id = {name: index for index, name in id2label.items()}
config = AutoConfig.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=len(label_cols),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    config=config,
)
# Training configuration: evaluate and checkpoint once per epoch, log every
# 10 steps, and upload to the Hub repository named by hub_model_id.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    push_to_hub=True,
    # Must match your Hugging Face username/repo.
    hub_model_id=f"Ozziejoe/{model_name}",
)
# Fine-tune the model, keep a local copy, then publish model and tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

# Local copy of the final weights and the tokenizer files.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

# Push to the Hub. Use the fully qualified repository configured in
# training_args rather than the bare model name: a bare name is resolved
# against whichever account is logged in, which may not be the repository
# the Trainer uploads to.
hub_repo_id = training_args.hub_model_id
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)
print("✅ Model and tokenizer pushed successfully!")