Spaces:

Ozziejoe
/

eemmExemplarClassfier

Runtime error

App Files Community

Ozziejoe committed on May 13

Commit

45a876e

verified ·

1 Parent(s): da6c739

Update train.py

Browse files

Files changed (1) hide show

train.py +34 -37

train.py CHANGED Viewed

@@ -9,63 +9,63 @@ from transformers import (
     Trainer,
     AutoConfig
 )
-import os
-# STEP 1: Define label columns
 label_cols = [
-    "Cog_present", "Aff_present", "Self_present",
-    "Motivation_present", "Attention_present", "OB_present", "Context_present",
     "Social", "Physical", "Psych"
 ]
-# STEP 2: Load and prepare dataset
 df = pd.read_csv("/tmp/eemm_cleaned.csv")
-df_final = df[["clean_question"] + label_cols]
-dataset = Dataset.from_pandas(df_final)
-# STEP 3: Choose model and tokenizer
-base_model_name = "microsoft/deberta-v3-small"
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-# STEP 4: Tokenization
-def tokenize_and_format(example):
-    tokenized = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
     for label in label_cols:
-        tokenized[label] = example[label]
-    return tokenized
-tokenized_dataset = dataset.map(tokenize_and_format)
-tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
-# STEP 5: Train/test split
-train_test = tokenized_dataset.train_test_split(test_size=0.2)
-train_dataset = train_test["train"]
-eval_dataset = train_test["test"]
-# STEP 6: Load model with config for multi-label
 config = AutoConfig.from_pretrained(
-    base_model_name,
     num_labels=len(label_cols),
     problem_type="multi_label_classification"
 )
-model = AutoModelForSequenceClassification.from_pretrained(base_model_name, config=config)
-# STEP 7: Define training arguments
 training_args = TrainingArguments(
     output_dir="./results",
-    evaluation_strategy="epoch",
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
     num_train_epochs=5,
-    weight_decay=0.01,
     logging_dir="./logs",
     logging_steps=10,
-    save_strategy="epoch",
-    save_total_limit=1,
     push_to_hub=True,
-    hub_model_id="Ozziejoe/eemm-deberta-v3-small"  # Your HF model ID
 )
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -73,14 +73,11 @@ trainer = Trainer(
     eval_dataset=eval_dataset
 )
-# STEP 8: Train the model
 trainer.train()
-print("✅ Training complete.")
-# STEP 9: Save locally and push to Hugging Face Hub
 trainer.save_model("./results")
 tokenizer.save_pretrained("./results")
-model.push_to_hub("eemm-deberta-v3-small")
-tokenizer.push_to_hub("eemm-deberta-v3-small")
-print("✅ Model pushed to Hugging Face Hub at: https://huggingface.co/Ozziejoe/eemm-deberta-v3-small")

     Trainer,
     AutoConfig
 )
+# 🔑 Set your model name (this will be created on Hugging Face)
+model_name = "eemm-deberta-v3-small"
+# ✅ These are the labels in your dataset
 label_cols = [
+    "Cog_present", "Aff_present", "Self_present", "Motivation_present",
+    "Attention_present", "OB_present", "Context_present",
     "Social", "Physical", "Psych"
 ]
+# ✅ Load your cleaned dataset
 df = pd.read_csv("/tmp/eemm_cleaned.csv")
+df = df[["clean_question"] + label_cols]
+dataset = Dataset.from_pandas(df)
+# ✅ Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
+# ✅ Tokenize the data
+def tokenize(example):
+    tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
     for label in label_cols:
+        tokens[label] = example[label]
+    return tokens
+dataset = dataset.map(tokenize)
+dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
+# ✅ Split into train/test
+split = dataset.train_test_split(test_size=0.2)
+train_dataset = split["train"]
+eval_dataset = split["test"]
+# ✅ Load model for multi-label classification
 config = AutoConfig.from_pretrained(
+    "microsoft/deberta-v3-small",
     num_labels=len(label_cols),
     problem_type="multi_label_classification"
 )
+model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", config=config)
+# ✅ Set training args
 training_args = TrainingArguments(
     output_dir="./results",
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
     num_train_epochs=5,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
     logging_dir="./logs",
     logging_steps=10,
     push_to_hub=True,
+    hub_model_id=f"Ozziejoe/{model_name}"  # This must match your Hugging Face username/repo
 )
+# ✅ Train and push
 trainer = Trainer(
     model=model,
     args=training_args,
     eval_dataset=eval_dataset
 )
 trainer.train()
 trainer.save_model("./results")
 tokenizer.save_pretrained("./results")
+# ✅ Push to Hub
+model.push_to_hub(model_name)
+tokenizer.push_to_hub(model_name)
+print("✅ Model and tokenizer pushed successfully!")