|
|
|
|
|
from datasets import load_from_disk |
|
from transformers import ( |
|
AutoModelForSequenceClassification, |
|
Trainer, |
|
TrainingArguments, |
|
EarlyStoppingCallback, |
|
) |
|
from sklearn.metrics import precision_recall_fscore_support, accuracy_score |
|
import numpy as np |
|
import torch |
|
from pathlib import Path |
|
import json |
|
import os |
|
|
|
|
|
# --- Run configuration -------------------------------------------------------
MODEL_NAME = "microsoft/codebert-base"  # checkpoint handed to from_pretrained
DATASET_PATH = "data/processed/dataset_binary"  # directory read by load_from_disk
OUTPUT_DIR = "models/binary_v3"  # root for checkpoints, logs and the final model
NUM_EPOCHS = 6
SEED = 42

# TrainingArguments(seed=...) is only applied once the Trainer is constructed,
# which happens after the model below already exists. Seed torch here so that
# any weights initialised from scratch by from_pretrained are reproducible too.
torch.manual_seed(SEED)

# --- Data and model ----------------------------------------------------------
dataset = load_from_disk(DATASET_PATH)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# --- Class weights -----------------------------------------------------------
# Inverse-frequency weights for labels 0 and 1, normalised to sum to 1.
counts = dataset["train"]["label"]
num_per_class = [sum(1 for label in counts if label == cls) for cls in (0, 1)]

# Fail with a clear message instead of a bare ZeroDivisionError when the
# training split does not contain both classes.
absent = [cls for cls, n in zip((0, 1), num_per_class) if n == 0]
if absent:
    raise ValueError(
        f"Training split at {DATASET_PATH!r} has no examples for label(s) {absent}; "
        "cannot compute inverse-frequency class weights."
    )

class_weights = torch.tensor([1.0 / n for n in num_per_class])
class_weights = class_weights / class_weights.sum()
|
|
|
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        pred: Evaluation output exposing ``predictions`` (per-class scores, or
            a tuple whose first element holds the per-class scores) and
            ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision/recall/F1 use ``average="binary"``.
    """
    scores = pred.predictions
    # Some models return extra outputs alongside the logits; the class scores
    # are then the first element of the tuple.
    if isinstance(scores, tuple):
        scores = scores[0]

    labels = pred.label_ids
    preds = np.argmax(scores, axis=1)

    # zero_division=0 keeps the same numeric result as the default but avoids
    # a warning on every evaluation in which no positive is predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
|
|
|
|
|
# The per-epoch evaluation keyword is ``evaluation_strategy`` in older
# transformers releases and ``eval_strategy`` in newer ones. Use whichever
# field the installed TrainingArguments dataclass actually declares; if the
# new name is absent this falls back to the original keyword.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    # Restore the checkpoint with the best "f1" (as reported by
    # compute_metrics) once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_total_limit=2,
    seed=SEED,
    # Mixed precision needs a CUDA device; fall back to full precision
    # instead of raising when none is available.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_kwarg: "epoch"},
)
|
|
|
|
|
class WeightedLossTrainer(Trainer):
    """Trainer that applies the module-level ``class_weights`` to the loss.

    The stock Trainer uses the model's own unweighted loss, which leaves the
    inverse-frequency ``class_weights`` computed earlier in this script unused.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return class-weighted cross-entropy for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; assumed to carry the targets under
                ``"labels"`` (the default collator's name) -- TODO confirm if
                a custom collator is introduced.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments newer Trainer versions pass (for example
                ``num_items_in_batch``); accepted for compatibility, unused.
        """
        labels = inputs["labels"]
        # Withhold the labels so the model does not compute its own
        # unweighted loss; the batch mapping itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]

        loss = torch.nn.functional.cross_entropy(
            # Compute the loss in float32 even when running under fp16.
            logits.float().view(-1, logits.size(-1)),
            labels.view(-1),
            weight=class_weights.to(logits.device),
        )
        return (loss, outputs) if return_outputs else loss


trainer = WeightedLossTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    # Stop when the tracked metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
|
|
|
|
|
# --- Train -------------------------------------------------------------------
trainer.train()

# --- Evaluate on the test split ----------------------------------------------
final_metrics = trainer.evaluate(dataset["test"])
print("📊 Final test metrics:", final_metrics)

# --- Persist the model and its test metrics ----------------------------------
final_dir = f"{OUTPUT_DIR}/final"
Path(final_dir).mkdir(parents=True, exist_ok=True)
model.save_pretrained(final_dir)

with open(f"{final_dir}/metrics.json", "w") as metrics_file:
    metrics_file.write(json.dumps(final_metrics, indent=2))

# --- Persist the Trainer's log history ---------------------------------------
log_path = os.path.join(OUTPUT_DIR, "training_log.json")
with open(log_path, "w", encoding="utf-8") as log_file:
    log_file.write(json.dumps(trainer.state.log_history, indent=2))

print(f"📝 Zapisano log treningu do {log_path}")
print(f"✅ Model zapisany do {OUTPUT_DIR}/final")
|
|