# 10.1_train_binary_model.py (v4 – with logging)

from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import torch
from pathlib import Path
import json
import os

# === Configuration ===
MODEL_NAME = "microsoft/codebert-base"
DATASET_PATH = "data/processed/dataset_binary"
OUTPUT_DIR = "models/binary_v3"
NUM_EPOCHS = 6
SEED = 42

# === Load the data
dataset = load_from_disk(DATASET_PATH)

# === Model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# === Class weights (optional; you can remove this)
# Inverse-frequency weights, normalized to sum to 1. Note that they are only
# computed here; the Trainer below still uses the default (unweighted) loss.
train_labels = dataset["train"]["label"]
class_weights = torch.tensor([
    1.0 / sum(label == 0 for label in train_labels),
    1.0 / sum(label == 1 for label in train_labels),
])
class_weights = class_weights / class_weights.sum()


def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# === Training arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_total_limit=2,
    seed=SEED,
    fp16=True,
)

# === Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# === Training
trainer.train()

# === Final evaluation
final_metrics = trainer.evaluate(dataset["test"])
print("📊 Final test metrics:", final_metrics)

# === Save model and metrics
Path(f"{OUTPUT_DIR}/final").mkdir(parents=True, exist_ok=True)
model.save_pretrained(f"{OUTPUT_DIR}/final")
with open(f"{OUTPUT_DIR}/final/metrics.json", "w") as f:
    json.dump(final_metrics, f, indent=2)

# === Save training logs
log_path = os.path.join(OUTPUT_DIR, "training_log.json")
with open(log_path, "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)

print(f"📝 Training log saved to {log_path}")
print(f"✅ Model saved to {OUTPUT_DIR}/final")
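
# --- Optional sketch: actually applying class_weights (assumption, not part of
# --- the training run above) ---
# The class_weights tensor is computed but never handed to the Trainer, so the
# run above uses an unweighted cross-entropy loss. If you do want weighted loss,
# one common approach is to subclass Trainer and override compute_loss. The name
# WeightedLossTrainer is hypothetical; the override itself is the standard
# Trainer hook for custom losses.

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Weighted cross-entropy using the inverse-frequency weights computed above.
        loss = torch.nn.functional.cross_entropy(
            outputs.logits,
            labels,
            weight=class_weights.to(outputs.logits.device),
        )
        return (loss, outputs) if return_outputs else loss

# Usage: construct WeightedLossTrainer(...) instead of Trainer(...) above, with
# the same arguments, before calling trainer.train().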