File size: 2,353 Bytes
178cc28
 
 
 
 
 
 
 
 
 
 
 
45a876e
 
 
 
178cc28
45a876e
 
32ce3ad
178cc28
 
45a876e
178cc28
45a876e
 
178cc28
45a876e
 
178cc28
45a876e
 
 
178cc28
45a876e
 
178cc28
45a876e
 
178cc28
45a876e
 
 
 
178cc28
45a876e
178cc28
45a876e
178cc28
 
 
45a876e
178cc28
45a876e
178cc28
 
 
 
 
45a876e
 
178cc28
 
f5e377f
45a876e
178cc28
 
45a876e
178cc28
 
 
 
 
 
 
 
 
63fe74a
ac45da5
45a876e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# train.py

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig
)

# πŸ”‘ Name of the Hub repo to create/update on Hugging Face
model_name = "eemm-deberta-v3-small"

# βœ… Multi-label target columns present in the cleaned dataset
label_cols = [
    "Cog_present", "Aff_present", "Self_present", "Motivation_present",
    "Attention_present", "OB_present", "Context_present",
    "Social", "Physical", "Psych"
]

# βœ… Load the cleaned CSV, keeping only the text column plus the targets
df = pd.read_csv("/tmp/eemm_cleaned.csv")
keep_cols = ["clean_question"] + label_cols
df = df[keep_cols]
dataset = Dataset.from_pandas(df)

# βœ… Tokenizer must match the base checkpoint loaded further below
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# βœ… Tokenize the data
# βœ… Tokenize the text and pack the per-column targets into a single float
# vector under the "labels" key. For problem_type="multi_label_classification"
# the model computes its loss from a float `labels` tensor; with only the
# individual label columns, the Trainer (default remove_unused_columns=True)
# never forwards any target to the model and no loss can be computed.
def tokenize(example):
    tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
    # Keep the original per-column keys for inspection / downstream use.
    for label in label_cols:
        tokens[label] = example[label]
    # Multi-hot float vector consumed by the model's forward pass.
    tokens["labels"] = [float(example[label]) for label in label_cols]
    return tokens

dataset = dataset.map(tokenize)
# "labels" must be among the torch-formatted columns so the Trainer
# passes it through to the model.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"] + label_cols)

# βœ… Split into train/test
# NOTE(review): no seed is passed, so the split is not reproducible across
# runs — consider train_test_split(test_size=0.2, seed=...) if that matters.
split = dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]

# βœ… Build the classification head for multi-label output: one logit per
# label column; problem_type tells the model to score labels independently.
base_checkpoint = "microsoft/deberta-v3-small"
config = AutoConfig.from_pretrained(
    base_checkpoint,
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint, config=config)

# βœ… Training configuration: evaluate and checkpoint once per epoch, and
# push checkpoints to the Hub repo named below.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the version pinned for this project.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
    # This must match your Hugging Face username/repo
    hub_model_id=f"Ozziejoe/{model_name}"
)

# βœ… Train, save locally, then push the final model + tokenizer to the Hub.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer  # ensures checkpoints/Hub pushes include tokenizer files
)

trainer.train()

# Keep a local copy of the final weights and tokenizer.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

# βœ… Push to Hub (final artifacts; in-training pushes already happen via
# push_to_hub=True in TrainingArguments)
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)
print("βœ… Model and tokenizer pushed successfully!")