# train.py

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig
)

# 🔑 Repository name for the fine-tuned model; it is appended to the Hub
# namespace when the Hub repo id is built further down.
model_name = "eemm-deberta-v3-small"

# ✅ Target columns expected in the dataset (one binary column per label).
# The list length determines the size of the classification head.
label_cols = [
    "Cog_present",
    "Aff_present",
    "Self_present",
    "Motivation_present",
    "Attention_present",
    "OB_present",
    "Context_present",
    "Social",
    "Physical",
    "Psych",
]

# ✅ Load the cleaned CSV and keep only the text column plus the label columns
selected_columns = ["clean_question", *label_cols]
df = pd.read_csv("/tmp/eemm_cleaned.csv")[selected_columns]
dataset = Dataset.from_pandas(df)

# ✅ Load the tokenizer of the base checkpoint that will be fine-tuned
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# ✅ Tokenize the data and build the multi-label target vector
def tokenize(example):
    """Tokenize one row and attach its multi-hot ``labels`` vector.

    Args:
        example: A single dataset row (mapping) holding ``clean_question`` and
            every column named in ``label_cols``.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens, with an extra
        ``labels`` entry: a list of floats ordered like ``label_cols``.
    """
    tokens = tokenizer(example["clean_question"], padding="max_length", truncation=True, max_length=128)
    # Trainer only forwards columns that match the model's forward() signature,
    # so ten separate per-label columns would be dropped and no loss could be
    # computed. Pack them into the single `labels` column instead. The values
    # are floats because the multi-label loss (BCE with logits) needs float
    # targets.
    tokens["labels"] = [float(example[label]) for label in label_cols]
    return tokens


# The original per-label columns stay in the dataset (map keeps existing
# columns); only the model inputs and the packed target are exposed as tensors.
dataset = dataset.map(tokenize)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ✅ Split into train/test (80% / 20%)
# A fixed seed keeps the held-out set identical across runs, so evaluation
# numbers from different runs are comparable.
split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# ✅ Load model for multi-label classification
# Store readable label names in the config (index i <-> label_cols[i]) so the
# saved/pushed model reports real label names instead of generic LABEL_<i>.
id2label = dict(enumerate(label_cols))
label2id = {name: index for index, name in id2label.items()}

config = AutoConfig.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=len(label_cols),
    # Selects the multi-label loss (independent sigmoid per label) in the
    # sequence-classification head.
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", config=config)

# ✅ Training configuration
hub_repo_id = f"Ozziejoe/{model_name}"  # This must match your Hugging Face username/repo

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Upload to the Hugging Face Hub under the repo id built above
    push_to_hub=True,
    hub_model_id=hub_repo_id,
)

# ✅ Train and push
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# Save the final model and tokenizer side by side in the training output
# directory so the folder can be loaded with from_pretrained() on its own.
output_dir = training_args.output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# ✅ Push to Hub
# Use the exact repo id configured on the TrainingArguments. A bare repo name
# resolves to the namespace of whoever is logged in, which could differ from
# the repo the Trainer pushes to and leave the tokenizer in a different repo.
hub_repo_id = training_args.hub_model_id
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)
print("✅ Model and tokenizer pushed successfully!")