# train.py
"""Fine-tune DeBERTa-v3-small as a multi-label classifier and push it to the Hub.

Pipeline: read the cleaned CSV, tokenize the question text, pack the binary
label columns into one float ``labels`` vector per example, train with the
Hugging Face ``Trainer``, then save and upload the model and tokenizer.
"""
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

# Name of the model repository (it will be created on Hugging Face).
model_name = "eemm-deberta-v3-small"

# Label columns in the dataset; their order fixes the order of the model outputs.
label_cols = [
    "Cog_present",
    "Aff_present",
    "Self_present",
    "Motivation_present",
    "Attention_present",
    "OB_present",
    "Context_present",
    "Social",
    "Physical",
    "Psych",
]

BASE_CHECKPOINT = "microsoft/deberta-v3-small"
DATA_PATH = "/tmp/eemm_cleaned.csv"
TEXT_COL = "clean_question"
OUTPUT_DIR = "./results"
MAX_LENGTH = 128
SEED = 42

# This must match your Hugging Face username/repo.
hub_model_id = f"Ozziejoe/{model_name}"


def build_dataset(path=DATA_PATH):
    """Load the CSV at *path* and return a ``Dataset`` of text + label columns.

    Rows with a missing value in the text column or any label column are
    dropped: a missing text breaks tokenization and a missing label would
    turn the loss into NaN.
    """
    df = pd.read_csv(path)
    df = df[[TEXT_COL] + label_cols].dropna()
    # preserve_index=False keeps the (now non-contiguous) pandas index from
    # being stored as an extra dataset column.
    return Dataset.from_pandas(df, preserve_index=False)


def encode_dataset(dataset, tokenizer):
    """Tokenize *dataset* and attach a float multi-hot ``labels`` vector.

    The Trainer only forwards columns the model accepts, and the model
    computes its loss from a single ``labels`` entry. The individual label
    columns are therefore merged into one list of floats per example
    (float, because the multi-label loss is binary cross-entropy).
    """

    def tokenize(batch):
        tokens = tokenizer(
            batch[TEXT_COL],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )
        per_column = (batch[col] for col in label_cols)
        tokens["labels"] = [
            [float(value) for value in row] for row in zip(*per_column)
        ]
        return tokens

    encoded = dataset.map(
        tokenize,
        batched=True,
        remove_columns=dataset.column_names,
    )
    encoded.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )
    return encoded


def build_model():
    """Load the base checkpoint configured for multi-label classification."""
    config = AutoConfig.from_pretrained(
        BASE_CHECKPOINT,
        num_labels=len(label_cols),
        problem_type="multi_label_classification",
        # Store readable label names so predictions are not LABEL_0, LABEL_1, ...
        id2label=dict(enumerate(label_cols)),
        label2id={name: index for index, name in enumerate(label_cols)},
    )
    return AutoModelForSequenceClassification.from_pretrained(
        BASE_CHECKPOINT,
        config=config,
    )


def main():
    """Run the full train / save / push pipeline."""
    tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
    dataset = encode_dataset(build_dataset(), tokenizer)

    # Fixed seed so the train/eval partition is the same on every run.
    split = dataset.train_test_split(test_size=0.2, seed=SEED)
    train_dataset = split["train"]
    eval_dataset = split["test"]

    model = build_model()

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        # NOTE: newer transformers releases name this argument `eval_strategy`.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        push_to_hub=True,
        hub_model_id=hub_model_id,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()

    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # Push to the same repository the Trainer was configured with.
    model.push_to_hub(hub_model_id)
    tokenizer.push_to_hub(hub_model_id)
    print("✅ Model and tokenizer pushed successfully!")


if __name__ == "__main__":
    main()