|
|
|
|
|
import json |
|
from pathlib import Path |
|
from collections import Counter |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
|
|
|
|
# Labeled dataset that is read line by line, each line being parsed as JSON.
INPUT_PATH = Path("data") / "labeled" / "labeled_dockerfiles.jsonl"

# JSON file that receives the codes of the most frequently triggered rules.
TOP_RULES_PATH = Path("data") / "metadata" / "top_rules.json"

# Number of most common rules that are printed and saved.
TOP_N = 30
|
|
|
|
|
# Frequency tables: how often each label value and each rule code occurs.
labels_counter, rules_counter = Counter(), Counter()

# Per-entry measurements: number of triggered rules and number of distinct
# flagged lines.
rules_per_file, lines_with_errors_per_file = [], []

# Length of every entry's "content", and every flagged line position pooled
# across all entries.
lengths, all_line_positions = [], []

# Fix statistics: rule codes that have a suggested fix, and the number of
# entries that carry at least one fix suggestion.
unique_rules_with_fixes = set()
fixable_counter = 0
|
|
|
print("🔍 Analizuję dane...")

# Stream the dataset one record at a time and accumulate the statistics.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost. It assumes that rule, fix and line statistics are
# gathered only for entries labeled "bad", while the content length is
# recorded for every entry — confirm against the original script.
with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        # A blank line is not a valid JSON document; skip it instead of
        # letting json.loads() abort the whole analysis.
        if not line.strip():
            continue

        obj = json.loads(line)
        label = obj["label"]
        labels_counter[label] += 1

        if label == "bad":
            # `or` (rather than a .get default) also covers keys that are
            # present but set to null.
            rules = obj.get("rules_triggered") or []
            rules_counter.update(rules)
            rules_per_file.append(len(rules))

            fixes = obj.get("fix_suggestions") or {}
            if fixes:
                fixable_counter += 1
                # Iterating the mapping yields its keys (the rule codes).
                unique_rules_with_fixes.update(fixes)

            # Materialise the values once: they feed both the per-entry
            # count of distinct flagged lines and the pooled position list.
            lines = list((obj.get("lines") or {}).values())
            line_set = set(lines)
            lines_with_errors_per_file.append(len(line_set))
            all_line_positions.extend(lines)

        # NOTE(review): this is a line count only if "content" is a list of
        # lines; for a plain string it is a character count — confirm.
        lengths.append(len(obj["content"]))
|
|
|
|
|
# Overall summary of the counts gathered while reading the dataset, one
# output line per entry.
summary_rows = (
    "\n📊 Statystyki:",
    f"✅ Good: {labels_counter['good']}",
    f"❌ Bad: {labels_counter['bad']}",
    f"🧩 Łączna liczba unikalnych reguł: {len(rules_counter)}",
    f"🛠 Plików z co najmniej jednym możliwym fixem: {fixable_counter}",
    f"🔧 Liczba unikalnych reguł z przypisanym fixem: {len(unique_rules_with_fixes)}",
)
for row in summary_rows:
    print(row)
|
|
|
|
|
# The TOP_N most frequently triggered rules as (code, count) pairs,
# most common first.
top_rules = rules_counter.most_common(TOP_N)

print(f"\n🏆 Top {TOP_N} najczęściej łamanych reguł:")
for rule_code, hits in top_rules:
    print(f" {rule_code}: {hits}x")

# Persist only the rule codes (without their counts) as a JSON array,
# creating the destination directory first if it does not exist yet.
top_rule_codes = [pair[0] for pair in top_rules]
TOP_RULES_PATH.parent.mkdir(parents=True, exist_ok=True)
TOP_RULES_PATH.write_text(json.dumps(top_rule_codes, indent=2), encoding="utf-8")

print(f"\n💾 Zapisano top {TOP_N} reguł do {TOP_RULES_PATH}")
|
|
|
|
|
# Descriptive statistics of the collected content lengths.
lengths_np = np.array(lengths)

print("\n📏 Długość Dockerfile (linie):")
if lengths_np.size:
    print(f" Średnia: {lengths_np.mean():.2f}")
    print(f" Mediana: {np.median(lengths_np):.0f}")
    print(f" Min: {lengths_np.min()}")
    print(f" Max: {lengths_np.max()}")
else:
    # min()/max() raise ValueError on a zero-size array and mean() yields
    # NaN with a warning, so report the empty dataset explicitly instead.
    print(" Brak danych")
|
|
|
|
|
# Directory that receives every generated chart.
PLOTS_DIR = Path("data/metadata")
PLOTS_DIR.mkdir(parents=True, exist_ok=True)


def _save_histogram(values, bins, color, title, xlabel, ylabel, filename):
    """Draw one histogram with the shared styling and save it as a PNG.

    Args:
        values: Sequence of numbers to bin.
        bins: Bin specification passed straight to ``plt.hist``.
        color: Fill color of the bars.
        title: Chart title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        filename: Name of the output file inside ``PLOTS_DIR``.
    """
    fig = plt.figure()
    try:
        plt.hist(values, bins=bins, color=color, edgecolor="black")
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(PLOTS_DIR / filename)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it once the image is on disk (or saving has failed).
        plt.close(fig)


# Distribution of content lengths over all entries.
_save_histogram(
    lengths_np,
    40,
    "skyblue",
    "Rozkład długości Dockerfile",
    "Liczba linii",
    "Liczba plików",
    "dockerfile_length_hist.png",
)

# The remaining charts are skipped when there is nothing to plot; max()
# would raise on an empty list.
if rules_per_file:
    # One bin per integer count, starting at 1.
    _save_histogram(
        rules_per_file,
        range(1, max(rules_per_file) + 2),
        "salmon",
        "Liczba reguł naruszonych na plik",
        "Liczba reguł",
        "Liczba plików",
        "rules_per_file_hist.png",
    )

if lines_with_errors_per_file:
    _save_histogram(
        lines_with_errors_per_file,
        range(1, max(lines_with_errors_per_file) + 2),
        "orchid",
        "Liczba linii z błędami w pliku",
        "Liczba linii z błędami",
        "Liczba plików",
        "error_lines_per_file_hist.png",
    )

if all_line_positions:
    _save_histogram(
        all_line_positions,
        50,
        "gold",
        "Rozkład pozycji błędów (linie)",
        "Numer linii",
        "Liczba błędów",
        "line_positions_hist.png",
    )

print("\n📊 Zapisano wykresy do data/metadata/")
|
|