Ozziejoe committed on
Commit
32ce3ad
·
verified ·
1 Parent(s): 178cc28

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +13 -20
train.py CHANGED
@@ -10,31 +10,25 @@ from transformers import (
10
  AutoConfig
11
  )
12
 
13
- # ---- STEP 1: Load data ----
14
  label_cols = [
15
  "Cog_present", "Aff_present", "Self_present",
16
- "Motivation_present", "Attention_present", "OB_present", "Context_present"
 
17
  ]
18
 
 
19
  df = pd.read_csv("/tmp/eemm_cleaned.csv")
20
  df_final = df[["clean_question"] + label_cols]
21
-
22
- # ---- STEP 2: Convert to Hugging Face dataset ----
23
  dataset = Dataset.from_pandas(df_final)
24
 
25
- # ---- STEP 3: Choose the best base model (changeable) ----
26
- base_model_name = "microsoft/deberta-v3-small" # 🔥 Strong multi-label base model
27
-
28
- # ---- STEP 4: Tokenize ----
29
  tokenizer = AutoTokenizer.from_pretrained(base_model_name)
30
 
 
31
  def tokenize_and_format(example):
32
- tokenized = tokenizer(
33
- example["clean_question"],
34
- padding="max_length",
35
- truncation=True,
36
- max_length=128
37
- )
38
  for label in label_cols:
39
  tokenized[label] = example[label]
40
  return tokenized
@@ -42,21 +36,20 @@ def tokenize_and_format(example):
42
  tokenized_dataset = dataset.map(tokenize_and_format)
43
  tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
44
 
45
- # ---- STEP 5: Split into train/test ----
46
  train_test = tokenized_dataset.train_test_split(test_size=0.2)
47
  train_dataset = train_test["train"]
48
  eval_dataset = train_test["test"]
49
 
50
- # ---- STEP 6: Load model config and model ----
51
  config = AutoConfig.from_pretrained(
52
  base_model_name,
53
  num_labels=len(label_cols),
54
  problem_type="multi_label_classification"
55
  )
56
-
57
  model = AutoModelForSequenceClassification.from_pretrained(base_model_name, config=config)
58
 
59
- # ---- STEP 7: Trainer setup ----
60
  training_args = TrainingArguments(
61
  output_dir="./results",
62
  evaluation_strategy="epoch",
@@ -77,10 +70,10 @@ trainer = Trainer(
77
  eval_dataset=eval_dataset
78
  )
79
 
80
- # ---- STEP 8: Train ----
81
  trainer.train()
82
  print("✅ Training complete.")
83
 
84
- # ---- STEP 9: Save model ----
85
  trainer.save_model("./results")
86
  print("✅ Model saved to ./results")
 
10
  AutoConfig
11
  )
12
 
13
+ # STEP 1: Define labels
14
  label_cols = [
15
  "Cog_present", "Aff_present", "Self_present",
16
+ "Motivation_present", "Attention_present", "OB_present", "Context_present",
17
+ "Social", "Physical", "Psych"
18
  ]
19
 
20
+ # STEP 2: Load dataset
21
  df = pd.read_csv("/tmp/eemm_cleaned.csv")
22
  df_final = df[["clean_question"] + label_cols]
 
 
23
  dataset = Dataset.from_pandas(df_final)
24
 
25
+ # STEP 3: Choose model and tokenizer
26
+ base_model_name = "microsoft/deberta-v3-small"
 
 
27
  tokenizer = AutoTokenizer.from_pretrained(base_model_name)
28
 
29
+ # STEP 4: Tokenization
30
  def tokenize_and_format(example):
31
+ tokenized = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
 
 
 
 
 
32
  for label in label_cols:
33
  tokenized[label] = example[label]
34
  return tokenized
 
36
  tokenized_dataset = dataset.map(tokenize_and_format)
37
  tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
38
 
39
+ # STEP 5: Train/test split
40
  train_test = tokenized_dataset.train_test_split(test_size=0.2)
41
  train_dataset = train_test["train"]
42
  eval_dataset = train_test["test"]
43
 
44
+ # STEP 6: Model config and loading
45
  config = AutoConfig.from_pretrained(
46
  base_model_name,
47
  num_labels=len(label_cols),
48
  problem_type="multi_label_classification"
49
  )
 
50
  model = AutoModelForSequenceClassification.from_pretrained(base_model_name, config=config)
51
 
52
+ # STEP 7: TrainingArguments and Trainer
53
  training_args = TrainingArguments(
54
  output_dir="./results",
55
  evaluation_strategy="epoch",
 
70
  eval_dataset=eval_dataset
71
  )
72
 
73
+ # STEP 8: Train
74
  trainer.train()
75
  print("✅ Training complete.")
76
 
77
+ # STEP 9: Save
78
  trainer.save_model("./results")
79
  print("✅ Model saved to ./results")