Spaces:
Runtime error
Update train.py
train.py CHANGED
@@ -9,63 +9,63 @@ from transformers import (
     Trainer,
     AutoConfig
 )
-import os
 
-#
+# Set your model name (this will be created on Hugging Face)
+model_name = "eemm-deberta-v3-small"
+
+# ✅ These are the labels in your dataset
 label_cols = [
-    "Cog_present", "Aff_present", "Self_present",
-    "
+    "Cog_present", "Aff_present", "Self_present", "Motivation_present",
+    "Attention_present", "OB_present", "Context_present",
     "Social", "Physical", "Psych"
 ]
 
-#
+# ✅ Load your cleaned dataset
 df = pd.read_csv("/tmp/eemm_cleaned.csv")
-
-dataset = Dataset.from_pandas(
+df = df[["clean_question"] + label_cols]
+dataset = Dataset.from_pandas(df)
 
-#
-
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+# ✅ Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
 
-#
-def
-
+# ✅ Tokenize the data
+def tokenize(example):
+    tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
     for label in label_cols:
-
-    return
+        tokens[label] = example[label]
+    return tokens
 
-
-
+dataset = dataset.map(tokenize)
+dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)
 
-#
-
-train_dataset =
-eval_dataset =
+# ✅ Split into train/test
+split = dataset.train_test_split(test_size=0.2)
+train_dataset = split["train"]
+eval_dataset = split["test"]
 
-#
+# ✅ Load model for multi-label classification
 config = AutoConfig.from_pretrained(
-
+    "microsoft/deberta-v3-small",
     num_labels=len(label_cols),
     problem_type="multi_label_classification"
 )
-model = AutoModelForSequenceClassification.from_pretrained(
+model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", config=config)
 
-#
+# ✅ Set training args
 training_args = TrainingArguments(
     output_dir="./results",
-    evaluation_strategy="epoch",
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
     num_train_epochs=5,
-
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
     logging_dir="./logs",
     logging_steps=10,
-    save_strategy="epoch",
-    save_total_limit=1,
     push_to_hub=True,
-    hub_model_id="Ozziejoe/
+    hub_model_id=f"Ozziejoe/{model_name}"  # This must match your Hugging Face username/repo
 )
 
+# ✅ Train and push
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -73,14 +73,11 @@ trainer = Trainer(
     eval_dataset=eval_dataset
 )
 
-# STEP 8: Train the model
 trainer.train()
-print("✅ Training complete.")
-
-# STEP 9: Save locally and push to Hugging Face Hub
 trainer.save_model("./results")
 tokenizer.save_pretrained("./results")
 
-
-
-
+# ✅ Push to Hub
+model.push_to_hub(model_name)
+tokenizer.push_to_hub(model_name)
+print("✅ Model and tokenizer pushed successfully!")
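A few review notes on the change follow.

The tokenize() step stores each of the ten label columns as a separate integer field. With problem_type="multi_label_classification", Trainer's default loss is BCEWithLogitsLoss, which instead expects every example to carry one float vector under the key "labels"; Trainer also drops dataset columns that the model's forward() does not accept, so with the per-column layout the model may never receive labels at all, which is one plausible cause of the "Runtime error" status above. A minimal sketch of a variant that packs the columns into a single "labels" vector (the packing step is my addition, not part of this commit):

import numpy as np

def tokenize(example):
    tokens = tokenizer(example["clean_question"], padding="max_length",
                       truncation=True, max_length=128)
    # BCEWithLogitsLoss expects float targets, one slot per label
    tokens["labels"] = np.array([example[label] for label in label_cols], dtype=np.float32)
    return tokens

dataset = dataset.map(tokenize)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])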
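train_test_split(test_size=0.2) draws a fresh random split on every run, so evaluation numbers are not comparable across runs. Passing a fixed seed (the value 42 below is arbitrary, my assumption) pins the split:

split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]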
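The AutoConfig call sets num_labels and problem_type but no label names, so the pushed model will report generic LABEL_0 through LABEL_9 outputs. Adding id2label/label2id mappings (optional, not in this commit) makes the Hub widget output readable:

config = AutoConfig.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=len(label_cols),
    problem_type="multi_label_classification",
    id2label={i: name for i, name in enumerate(label_cols)},
    label2id={name: i for i, name in enumerate(label_cols)},
)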
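In the TrainingArguments block, note that newer transformers releases renamed evaluation_strategy to eval_strategy; the old keyword was deprecated and later removed, so on an up-to-date image it raises a TypeError, another candidate for the Space's runtime error. Also, push_to_hub=True needs a write token available to the process (on a Space, typically an HF_TOKEN secret). A version-dependent sketch of the same arguments:

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    eval_strategy="epoch",   # newer name; older releases expect evaluation_strategy
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,        # requires a write token, e.g. an HF_TOKEN secret on the Space
    hub_model_id=f"Ozziejoe/{model_name}",
)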
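The Trainer evaluates every epoch but defines no compute_metrics, so eval reports loss only. A multi-label F1 sketch (it assumes scikit-learn is installed and that labels are packed into a single vector as in the earlier note; the 0.5 threshold is an assumption):

import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = (1 / (1 + np.exp(-logits))) > 0.5   # sigmoid, then threshold at 0.5
    return {
        "micro_f1": f1_score(labels, preds, average="micro"),
        "macro_f1": f1_score(labels, preds, average="macro"),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)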
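Once the model is pushed, multi-label inference applies a sigmoid per label rather than a softmax over labels. A quick smoke test (the input text and the 0.5 threshold are my assumptions):

import torch

text = "Example survey question"   # hypothetical input
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # match the model's device after training
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.sigmoid(logits)[0]   # one independent probability per label
print([name for name, p in zip(label_cols, probs) if p > 0.5])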