# train.py
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig
)
# Set your model name (this will be created on Hugging Face)
model_name = "eemm-deberta-v3-small"
# These are the labels in your dataset
label_cols = [
"Cog_present", "Aff_present", "Self_present", "Motivation_present",
"Attention_present", "OB_present", "Context_present",
"Social", "Physical", "Psych"
]
# Load your cleaned dataset
df = pd.read_csv("/tmp/eemm_cleaned.csv")
df = df[["clean_question"] + label_cols]
dataset = Dataset.from_pandas(df)
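# Each label column is assumed to hold a binary 0/1 flag.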
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
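# Note: loading a DeBERTa-v3 tokenizer requires the sentencepiece package.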
# Tokenize the data
def tokenize(example):
    tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
    # Collapse the label columns into a single float32 "labels" vector:
    # multi-label classification expects one float target vector per example,
    # not ten separate integer columns.
    tokens["labels"] = np.array([example[label] for label in label_cols], dtype=np.float32)
    return tokens

dataset = dataset.map(tokenize)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# Split into train/test
split = dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]
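# Optional (not in the original script): a sketch of a multi-label metrics
# function, assuming scikit-learn is installed and an arbitrary 0.5 sigmoid
# threshold. Pass it to the Trainer below via `compute_metrics=compute_metrics`
# to get per-epoch F1 alongside the eval loss.
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    # Sigmoid each logit independently, then threshold to 0/1 predictions
    probs = 1 / (1 + np.exp(-eval_pred.predictions))
    preds = (probs >= 0.5).astype(int)
    labels = eval_pred.label_ids.astype(int)
    return {
        "micro_f1": f1_score(labels, preds, average="micro", zero_division=0),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
    }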
# Load model for multi-label classification
config = AutoConfig.from_pretrained(
"microsoft/deberta-v3-small",
num_labels=len(label_cols),
problem_type="multi_label_classification"
)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", config=config)
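# With problem_type="multi_label_classification", the model computes its loss
# with BCEWithLogitsLoss, so each label can be on or off independently; this is
# why the labels above are cast to floats.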
# Set training args
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,
    hub_model_id=f"Ozziejoe/{model_name}"  # This must match your Hugging Face username/repo
)
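# Note: push_to_hub=True assumes an authenticated session, e.g. via
# `huggingface-cli login` or an HF_TOKEN environment variable.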
# Train and push
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)
trainer.train()
trainer.save_model("./results")
tokenizer.save_pretrained("./results")
# Push to Hub
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)
print("β
Model and tokenizer pushed successfully!")
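# Optional (not in the original script): a sketch of loading the pushed model
# back for inference, assuming the repo id above and an arbitrary 0.5 threshold
# for turning sigmoid scores into per-label flags.
import torch

def predict(text, repo_id=f"Ozziejoe/{model_name}", threshold=0.5):
    tok = AutoTokenizer.from_pretrained(repo_id)
    mdl = AutoModelForSequenceClassification.from_pretrained(repo_id)
    inputs = tok(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = mdl(**inputs).logits
    probs = torch.sigmoid(logits).squeeze(0)  # one independent score per label
    return {label: bool(prob >= threshold) for label, prob in zip(label_cols, probs)}

# Example usage: predict("Some survey question text here")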