File size: 4,100 Bytes

e9b8340

# 08_explore_labeled_dataset_v4.py

import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

# === Paths and configuration
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
TOP_N = 30

# === Accumulators filled while streaming the dataset
labels_counter = Counter()        # label value -> number of records
rules_counter = Counter()         # rule code -> occurrences across "bad" records
rules_per_file = []               # number of triggered rules, one entry per "bad" record
lines_with_errors_per_file = []   # number of distinct "lines" values, one entry per "bad" record
lengths = []                      # len(content), one entry per record (good and bad)
all_line_positions = []           # every "lines" value from "bad" records, duplicates kept

fixable_counter = 0               # "bad" records carrying a non-empty "fix_suggestions"
unique_rules_with_fixes = set()   # keys seen in any "fix_suggestions" mapping

print("🔍 Analizuję dane...")

with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        # JSONL files often contain blank lines (typically a trailing one);
        # json.loads would raise on them and abort the whole analysis.
        if not line.strip():
            continue

        obj = json.loads(line)
        labels_counter[obj["label"]] += 1

        if obj["label"] == "bad":
            # `or` (instead of a .get() default) also covers an explicit JSON
            # null, which .get() would hand back unchanged.
            rules = obj.get("rules_triggered") or []
            rules_counter.update(rules)
            rules_per_file.append(len(rules))

            # Fix analysis
            fixes = obj.get("fix_suggestions") or {}
            if fixes:
                fixable_counter += 1
                # Iterating a dict yields its keys.
                unique_rules_with_fixes.update(fixes)

            # Error lines – v4
            # Materialise the values once: they are consumed twice below.
            lines = list((obj.get("lines") or {}).values())
            lines_with_errors_per_file.append(len(set(lines)))
            all_line_positions.extend(lines)

        # File length
        # NOTE(review): the summary printed later labels this value as a number
        # of lines; that only holds if "content" is a list of lines. If it is a
        # single string this counts characters — TODO confirm against the
        # dataset schema.
        lengths.append(len(obj["content"]))

# === Overall statistics
print("\n📊 Statystyki:")
print(f"✅ Good: {labels_counter['good']}")
print(f"❌ Bad:  {labels_counter['bad']}")
print(f"🧩 Łączna liczba unikalnych reguł: {len(rules_counter)}")
print(f"🛠 Plików z co najmniej jednym możliwym fixem: {fixable_counter}")
print(f"🔧 Liczba unikalnych reguł z przypisanym fixem: {len(unique_rules_with_fixes)}")

# === Top N rules
top_rules = rules_counter.most_common(TOP_N)
print(f"\n🏆 Top {TOP_N} najczęściej łamanych reguł:")
for code, count in top_rules:
    print(f"  {code}: {count}x")

# === Save the top N rule codes (most frequent first) to a JSON file
TOP_RULES_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(TOP_RULES_PATH, "w", encoding="utf-8") as f:
    json.dump([code for code, _ in top_rules], f, indent=2)
print(f"\n💾 Zapisano top {TOP_N} reguł do {TOP_RULES_PATH}")

# === Dockerfile lengths
lengths_np = np.array(lengths)
print("\n📏 Długość Dockerfile (linie):")
if lengths_np.size:
    print(f"  Średnia:  {lengths_np.mean():.2f}")
    print(f"  Mediana:  {np.median(lengths_np):.0f}")
    print(f"  Min:      {lengths_np.min()}")
    print(f"  Max:      {lengths_np.max()}")
else:
    # An empty dataset must not crash the report: min()/max() raise ValueError
    # on a zero-size array, and mean()/median() return NaN with a warning.
    print("  (brak danych)")

# === Histograms
_plots_dir = Path("data/metadata")
_plots_dir.mkdir(parents=True, exist_ok=True)


def _count_bins(counts):
    """Return unit-width integer bin edges covering every value in *counts*.

    The edges start at 1 unless *counts* contains something smaller (e.g. a
    record with zero rules or zero error lines), in which case they start
    there. Values outside the bin edges are silently ignored by the histogram,
    so a fixed start of 1 would hide such records.
    """
    return range(min(1, min(counts)), max(counts) + 2)


def _save_histogram(values, bins, color, title, xlabel, ylabel, filename):
    """Plot one histogram of *values* and save it as a PNG in ``_plots_dir``.

    Args:
        values: Sequence of numbers to bin.
        bins: Bin count or explicit bin edges, passed straight to ``hist``.
        color: Bar fill colour.
        title: Figure title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        filename: Output file name, relative to ``_plots_dir``.
    """
    fig, ax = plt.subplots()
    ax.hist(values, bins=bins, color=color, edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(_plots_dir / filename)
    # Release the figure once it is on disk; nothing displays it afterwards.
    plt.close(fig)


# 1. File lengths
_save_histogram(
    lengths_np,
    bins=40,
    color="skyblue",
    title="Rozkład długości Dockerfile",
    xlabel="Liczba linii",
    ylabel="Liczba plików",
    filename="dockerfile_length_hist.png",
)

# 2. Rules per file
if rules_per_file:
    _save_histogram(
        rules_per_file,
        bins=_count_bins(rules_per_file),
        color="salmon",
        title="Liczba reguł naruszonych na plik",
        xlabel="Liczba reguł",
        ylabel="Liczba plików",
        filename="rules_per_file_hist.png",
    )

# 3. Error lines per file
if lines_with_errors_per_file:
    _save_histogram(
        lines_with_errors_per_file,
        bins=_count_bins(lines_with_errors_per_file),
        color="orchid",
        title="Liczba linii z błędami w pliku",
        xlabel="Liczba linii z błędami",
        ylabel="Liczba plików",
        filename="error_lines_per_file_hist.png",
    )

# 4. Distribution of error positions
if all_line_positions:
    _save_histogram(
        all_line_positions,
        bins=50,
        color="gold",
        title="Rozkład pozycji błędów (linie)",
        xlabel="Numer linii",
        ylabel="Liczba błędów",
        filename="line_positions_hist.png",
    )

print("\n📊 Zapisano wykresy do data/metadata/")