# File size: 2,460 Bytes
# e9b8340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# 11.1_evaluate_binary_v3.py

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from datasets import load_from_disk
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import json

# === Paths
# All evaluation outputs are written into the model directory itself.
MODEL_DIR = Path("models") / "binary" / "final"
DATASET_DIR = Path("data") / "processed" / "dataset_binary"
OUT_DIR = MODEL_DIR

REPORT_CSV = OUT_DIR.joinpath("classification_report.csv")
REPORT_JSON = OUT_DIR.joinpath("metrics.json")
CONF_MATRIX_PNG = OUT_DIR.joinpath("confusion_matrix.png")

# === Load the fine-tuned model
print("📂 Wczytywanie modelu...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# === Load the tokenizer stored with the model, or fall back to the base one
tokenizer_files = [*MODEL_DIR.glob("tokenizer*")]
if not tokenizer_files:
    # No tokenizer files sit next to the model: fetch the base tokenizer
    # and write it into MODEL_DIR.
    print("⚠️ Brak tokenizera w modelu — pobieram z microsoft/codebert-base")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    tokenizer.save_pretrained(MODEL_DIR)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# === Load the dataset and wrap the model in a Trainer for inference
ds = load_from_disk(str(DATASET_DIR))
trainer = Trainer(model=model)

# === Prediction
print("🔍 Predykcja na zbiorze testowym...")
test_split = ds["test"]
predictions = trainer.predict(test_split)
# Reference labels come straight from the prediction output; the predicted
# class is the index of the highest score along axis 1.
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# === Classification report
print("\n📊 Raport klasyfikacji:")
# Pin the label ids explicitly. Without `labels`, scikit-learn infers the
# classes from the data and raises ValueError when only one class occurs in
# y_true/y_pred, because the two target names no longer match.
# NOTE(review): assumes the dataset encodes the classes as 0 ("good") and
# 1 ("bad") — confirm against the dataset's label mapping.
report_kwargs = dict(
    labels=[0, 1],
    target_names=["good", "bad"],
    zero_division=0,
)
# The report is computed twice on purpose: once as a dict for the CSV/JSON
# files and once as formatted text for the console.
report_dict = classification_report(
    y_true, y_pred, output_dict=True, **report_kwargs
)
report_text = classification_report(y_true, y_pred, **report_kwargs)
print(report_text)

# Save as CSV + JSON
df_report = pd.DataFrame(report_dict).transpose()
df_report.to_csv(REPORT_CSV)
# Explicit encoding so the output does not depend on the platform default.
with open(REPORT_JSON, "w", encoding="utf-8") as f:
    json.dump(report_dict, f, indent=2)

print(f"💾 Zapisano raport CSV: {REPORT_CSV}")
print(f"💾 Zapisano metryki JSON: {REPORT_JSON}")

# === Confusion matrix + plot
# Fix the label ids so the matrix is always 2x2 and lines up with the two
# display labels, even if one class is missing from y_true/y_pred.
# NOTE(review): assumes class ids 0 = "good" and 1 = "bad" — confirm.
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
labels = ["good", "bad"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=labels)

# Create the figure/axes ourselves and hand the axes to `disp.plot`.
# Calling `plt.figure(figsize=...)` first does not work: `plot()` without
# `ax` opens its own figure, so the requested size was ignored and the empty
# figure was never closed.
fig, ax = plt.subplots(figsize=(5, 4))
disp.plot(ax=ax, cmap="Purples", values_format="d")
# NOTE(review): the default matplotlib font may lack the emoji glyph in the
# title; the string is kept unchanged here.
ax.set_title("🧱 Confusion Matrix – Binary Classifier")
ax.grid(False)
fig.tight_layout()
fig.savefig(CONF_MATRIX_PNG)
plt.close(fig)

print(f"🖼️ Zapisano confusion matrix jako PNG: {CONF_MATRIX_PNG}")