Ozziejoe committed on
Commit
45a876e
·
verified ·
1 Parent(s): da6c739

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +34 -37
train.py CHANGED
@@ -9,63 +9,63 @@ from transformers import (
9
  Trainer,
10
  AutoConfig
11
  )
12
- import os
13
 
14
- # STEP 1: Define label columns
 
 
 
15
  label_cols = [
16
- "Cog_present", "Aff_present", "Self_present",
17
- "Motivation_present", "Attention_present", "OB_present", "Context_present",
18
  "Social", "Physical", "Psych"
19
  ]
20
 
21
- # STEP 2: Load and prepare dataset
22
  df = pd.read_csv("/tmp/eemm_cleaned.csv")
23
- df_final = df[["clean_question"] + label_cols]
24
- dataset = Dataset.from_pandas(df_final)
25
 
26
- # STEP 3: Choose model and tokenizer
27
- base_model_name = "microsoft/deberta-v3-small"
28
- tokenizer = AutoTokenizer.from_pretrained(base_model_name)
29
 
30
- # STEP 4: Tokenization
31
- def tokenize_and_format(example):
32
- tokenized = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
33
  for label in label_cols:
34
- tokenized[label] = example[label]
35
- return tokenized
36
 
37
- tokenized_dataset = dataset.map(tokenize_and_format)
38
- tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
39
 
40
- # STEP 5: Train/test split
41
- train_test = tokenized_dataset.train_test_split(test_size=0.2)
42
- train_dataset = train_test["train"]
43
- eval_dataset = train_test["test"]
44
 
45
- # STEP 6: Load model with config for multi-label
46
  config = AutoConfig.from_pretrained(
47
- base_model_name,
48
  num_labels=len(label_cols),
49
  problem_type="multi_label_classification"
50
  )
51
- model = AutoModelForSequenceClassification.from_pretrained(base_model_name, config=config)
52
 
53
- # STEP 7: Define training arguments
54
  training_args = TrainingArguments(
55
  output_dir="./results",
56
- evaluation_strategy="epoch",
57
  per_device_train_batch_size=16,
58
  per_device_eval_batch_size=16,
59
  num_train_epochs=5,
60
- weight_decay=0.01,
 
61
  logging_dir="./logs",
62
  logging_steps=10,
63
- save_strategy="epoch",
64
- save_total_limit=1,
65
  push_to_hub=True,
66
- hub_model_id="Ozziejoe/eemm-deberta-v3-small" # Your HF model ID
67
  )
68
 
 
69
  trainer = Trainer(
70
  model=model,
71
  args=training_args,
@@ -73,14 +73,11 @@ trainer = Trainer(
73
  eval_dataset=eval_dataset
74
  )
75
 
76
- # STEP 8: Train the model
77
  trainer.train()
78
- print("βœ… Training complete.")
79
-
80
- # STEP 9: Save locally and push to Hugging Face Hub
81
  trainer.save_model("./results")
82
  tokenizer.save_pretrained("./results")
83
 
84
- model.push_to_hub("eemm-deberta-v3-small")
85
- tokenizer.push_to_hub("eemm-deberta-v3-small")
86
- print("βœ… Model pushed to Hugging Face Hub at: https://huggingface.co/Ozziejoe/eemm-deberta-v3-small")
 
 
9
  Trainer,
10
  AutoConfig
11
  )
 
12
 
13
+ # 🔑 Set your model name (this will be created on Hugging Face)
14
+ model_name = "eemm-deberta-v3-small"
15
+
16
+ # ✅ These are the labels in your dataset
17
  label_cols = [
18
+ "Cog_present", "Aff_present", "Self_present", "Motivation_present",
19
+ "Attention_present", "OB_present", "Context_present",
20
  "Social", "Physical", "Psych"
21
  ]
22
 
23
+ # ✅ Load your cleaned dataset
24
  df = pd.read_csv("/tmp/eemm_cleaned.csv")
25
+ df = df[["clean_question"] + label_cols]
26
+ dataset = Dataset.from_pandas(df)
27
 
28
+ # ✅ Load tokenizer
29
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
 
30
 
31
+ # ✅ Tokenize the data
32
+ def tokenize(example):
33
+ tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
34
  for label in label_cols:
35
+ tokens[label] = example[label]
36
+ return tokens
37
 
38
+ dataset = dataset.map(tokenize)
39
+ dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
40
 
41
+ # ✅ Split into train/test
42
+ split = dataset.train_test_split(test_size=0.2)
43
+ train_dataset = split["train"]
44
+ eval_dataset = split["test"]
45
 
46
+ # ✅ Load model for multi-label classification
47
  config = AutoConfig.from_pretrained(
48
+ "microsoft/deberta-v3-small",
49
  num_labels=len(label_cols),
50
  problem_type="multi_label_classification"
51
  )
52
+ model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", config=config)
53
 
54
+ # ✅ Set training args
55
  training_args = TrainingArguments(
56
  output_dir="./results",
 
57
  per_device_train_batch_size=16,
58
  per_device_eval_batch_size=16,
59
  num_train_epochs=5,
60
+ evaluation_strategy="epoch",
61
+ save_strategy="epoch",
62
  logging_dir="./logs",
63
  logging_steps=10,
 
 
64
  push_to_hub=True,
65
+ hub_model_id=f"Ozziejoe/{model_name}" # This must match your Hugging Face username/repo
66
  )
67
 
68
+ # ✅ Train and push
69
  trainer = Trainer(
70
  model=model,
71
  args=training_args,
 
73
  eval_dataset=eval_dataset
74
  )
75
 
 
76
  trainer.train()
 
 
 
77
  trainer.save_model("./results")
78
  tokenizer.save_pretrained("./results")
79
 
80
+ # ✅ Push to Hub
81
+ model.push_to_hub(model_name)
82
+ tokenizer.push_to_hub(model_name)
83
+ print("βœ… Model and tokenizer pushed successfully!")