Spaces:

Ozziejoe
/

eemmExemplarClassfier

Runtime error

App Files Files Community

eemmExemplarClassfier / train.py

Ozziejoe

Update train.py

45a876e verified 3 months ago

raw

history blame contribute delete

2.35 kB

	# train.py

	import pandas as pd
	from datasets import Dataset
	from transformers import (
	AutoTokenizer,
	AutoModelForSequenceClassification,
	TrainingArguments,
	Trainer,
	AutoConfig
	)

	# Name of the model repository on the Hugging Face Hub.
	model_name = "eemm-deberta-v3-small"

	# Label columns read from the dataset; their count is used as the number
	# of outputs of the classifier.
	label_cols = [
	    "Cog_present",
	    "Aff_present",
	    "Self_present",
	    "Motivation_present",
	    "Attention_present",
	    "OB_present",
	    "Context_present",
	    "Social",
	    "Physical",
	    "Psych",
	]

	# Read the cleaned CSV, keep only the question text plus the label columns,
	# and wrap the resulting frame in a Hugging Face Dataset.
	text_and_label_cols = ["clean_question", *label_cols]
	df = pd.read_csv("/tmp/eemm_cleaned.csv").loc[:, text_and_label_cols]
	dataset = Dataset.from_pandas(df)

	# Load the tokenizer that belongs to the base checkpoint.
	tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")


	def tokenize(example):
	    """Tokenize one row and attach its multi-label target vector.

	    Args:
	        example: A single dataset row (mapping) holding ``clean_question``
	            and one value for every entry of ``label_cols``.

	    Returns:
	        The tokenizer output for the question (padded/truncated to 128
	        tokens) with an added ``labels`` entry: a list of floats ordered
	        like ``label_cols``.
	    """
	    tokens = tokenizer(
	        example["clean_question"],
	        padding="max_length",
	        truncation=True,
	        max_length=128,
	    )
	    # Trainer only forwards columns that match the model's forward()
	    # signature, so the targets must live in a single "labels" column;
	    # separate per-label columns are dropped and no loss is computed.
	    # Values are cast to float because the multi-label loss works on
	    # floating-point targets.
	    tokens["labels"] = [float(example[col]) for col in label_cols]
	    return tokens


	dataset = dataset.map(tokenize)
	# Expose only the tensors the model consumes; the individual label columns
	# are already folded into "labels".
	dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

	# Hold out 20% of the rows for evaluation; the rest is used for training.
	split = dataset.train_test_split(test_size=0.2)
	train_dataset, eval_dataset = split["train"], split["test"]

	# Configure the base checkpoint for multi-label classification with one
	# output per entry of label_cols.
	# The label names are stored in the config so the saved/pushed model
	# reports the column names instead of generic "LABEL_<n>" placeholders.
	id2label = dict(enumerate(label_cols))
	config = AutoConfig.from_pretrained(
	    "microsoft/deberta-v3-small",
	    num_labels=len(label_cols),
	    problem_type="multi_label_classification",
	    id2label=id2label,
	    label2id={name: idx for idx, name in id2label.items()},
	)
	model = AutoModelForSequenceClassification.from_pretrained(
	    "microsoft/deberta-v3-small",
	    config=config,
	)

	# Training configuration: evaluate and checkpoint once per epoch, log every
	# 10 steps, and push to the Hub repository named by hub_model_id.
	# Newer transformers releases renamed ``evaluation_strategy`` to
	# ``eval_strategy``; use whichever keyword the installed version defines so
	# the script does not fail on an unexpected keyword argument.
	eval_strategy_kwarg = (
	    "eval_strategy"
	    if hasattr(TrainingArguments, "eval_strategy")
	    else "evaluation_strategy"
	)
	training_args = TrainingArguments(
	    output_dir="./results",
	    per_device_train_batch_size=16,
	    per_device_eval_batch_size=16,
	    num_train_epochs=5,
	    save_strategy="epoch",
	    logging_dir="./logs",
	    logging_steps=10,
	    push_to_hub=True,
	    # This must match your Hugging Face username/repo.
	    hub_model_id=f"Ozziejoe/{model_name}",
	    **{eval_strategy_kwarg: "epoch"},
	)

	# Wire the model, the training arguments and both data splits into a Trainer.
	trainer_kwargs = {
	    "model": model,
	    "args": training_args,
	    "train_dataset": train_dataset,
	    "eval_dataset": eval_dataset,
	}
	trainer = Trainer(**trainer_kwargs)

	# Fine-tune, then write the final model and the tokenizer to ./results.
	trainer.train()
	trainer.save_model("./results")
	tokenizer.save_pretrained("./results")

	# Upload the model first and the tokenizer second to the Hub repository
	# named by model_name.
	for artifact in (model, tokenizer):
	    artifact.push_to_hub(model_name)
	print("✅ Model and tokenizer pushed successfully!")