# binary-dockerfile-model / scripts /10.1_train_binary_model.py
# LeeSek's picture
# Add scripts
# e9b8340 verified
# 10.1_train_binary_model.py (v4 – with logging)
from datasets import load_from_disk
from transformers import (
AutoModelForSequenceClassification,
Trainer,
TrainingArguments,
EarlyStoppingCallback,
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import torch
from pathlib import Path
import json
import os
# === Configuration ===
MODEL_NAME = "microsoft/codebert-base"
DATASET_PATH = "data/processed/dataset_binary"
OUTPUT_DIR = "models/binary_v3"
NUM_EPOCHS = 6
SEED = 42

# === Load the dataset from disk (its splits are accessed by name below)
dataset = load_from_disk(DATASET_PATH)

# === Model: pretrained checkpoint configured with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

# === Class weights (optional — can be removed)
# One inverse-frequency weight per label (0, then 1) of the training split,
# normalised afterwards so that both weights sum to 1.
# NOTE(review): class_weights is not referenced again in the visible part of
# this script, so it presumably does not influence training — confirm.
counts = dataset["train"]["label"]
class_weights = torch.tensor(
    [1.0 / sum(label == target for label in counts) for target in (0, 1)]
)
class_weights = class_weights.div(class_weights.sum())
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max along axis 1 is
            taken as the predicted label).

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``; the last three use ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element returned by scikit-learn (support) is not reported.
    p, r, f, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )

    metrics = {"accuracy": accuracy}
    metrics.update(precision=p, recall=r, f1=f)
    return metrics
# === Training arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluation and checkpointing both happen once per epoch.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    # Checkpoints are ranked by the "f1" value returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_total_limit=2,
    seed=SEED,
    # Request fp16 mixed precision only when a CUDA device is available, so
    # the script also starts on CPU-only machines instead of being rejected
    # at argument validation.
    fp16=torch.cuda.is_available(),
)

# === Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# === Training
trainer.train()

# === Final evaluation on the held-out test split
final_metrics = trainer.evaluate(dataset["test"])
print("📊 Final test metrics:", final_metrics)

# === Save the model and its test metrics
Path(f"{OUTPUT_DIR}/final").mkdir(parents=True, exist_ok=True)
model.save_pretrained(f"{OUTPUT_DIR}/final")
# Explicit encoding, consistent with the training-log file written below.
with open(f"{OUTPUT_DIR}/final/metrics.json", "w", encoding="utf-8") as f:
    json.dump(final_metrics, f, indent=2)

# === Save the training log (the trainer's recorded log history)
log_path = os.path.join(OUTPUT_DIR, "training_log.json")
with open(log_path, "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)

print(f"📝 Zapisano log treningu do {log_path}")
print(f"✅ Model zapisany do {OUTPUT_DIR}/final")