# 10.1_train_binary_model.py (v4 – with logging)

from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import torch
from pathlib import Path
import json
import os

# === Configuration ===
MODEL_NAME = "microsoft/codebert-base"
DATASET_PATH = "data/processed/dataset_binary"
OUTPUT_DIR = "models/binary_v3"
NUM_EPOCHS = 6
SEED = 42

# === Load data (expects a pre-tokenized DatasetDict with "train",
# "validation", and "test" splits, all of which are used below)
dataset = load_from_disk(DATASET_PATH)

# === Model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
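# Note: the sequence-classification head added on top of CodeBERT is newly
# initialized (transformers will warn that some weights were not loaded from
# the checkpoint); it is trained from scratch during fine-tuning below.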

# === Class weights (optional; you can remove this block)
labels = dataset["train"]["label"]
class_weights = torch.tensor([
    1.0 / sum(y == 0 for y in labels),  # inverse frequency of class 0
    1.0 / sum(y == 1 for y in labels),  # inverse frequency of class 1
])
class_weights = class_weights / class_weights.sum()  # normalize to sum to 1
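
# Note: as written, class_weights is computed but never applied; the stock
# Trainer optimizes an unweighted cross-entropy. A minimal sketch of a
# subclass that would apply the weights (an illustration, not part of the
# original pipeline; pass WeightedTrainer in place of Trainer below to use it):
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # weighted cross-entropy over the two classes
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=class_weights.to(outputs.logits.device)
        )
        loss = loss_fct(
            outputs.logits.view(-1, model.config.num_labels),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss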

def compute_metrics(pred):
    # "binary" averaging reports precision/recall/F1 for the positive class (label 1)
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# === Training arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_total_limit=2,
    seed=SEED,
    fp16=True,  # mixed precision; requires a CUDA GPU
)
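
# Note: newer transformers releases rename "evaluation_strategy" to
# "eval_strategy"; if TrainingArguments rejects the keyword above,
# switch to the new name.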

# === Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # stop if f1 stalls for 2 evals
)

# === Training
trainer.train()

# === Final evaluation
final_metrics = trainer.evaluate(dataset["test"])  # uses the best checkpoint, thanks to load_best_model_at_end
print("📊 Final test metrics:", final_metrics)

# === Save model and metrics
Path(f"{OUTPUT_DIR}/final").mkdir(parents=True, exist_ok=True)
model.save_pretrained(f"{OUTPUT_DIR}/final")

with open(f"{OUTPUT_DIR}/final/metrics.json", "w") as f:
    json.dump(final_metrics, f, indent=2)

# === Save training logs
log_path = os.path.join(OUTPUT_DIR, "training_log.json")
with open(log_path, "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)
print(f"📝 Zapisano log treningu do {log_path}")
print(f"✅ Model zapisany do {OUTPUT_DIR}/final")