binary-dockerfile-model / scripts /07_explore_labeled_dataset.py

Add scripts

e9b8340 verified 24 days ago

4.1 kB

	# 07_explore_labeled_dataset.py (v4)

	import json
	from pathlib import Path
	from collections import Counter
	import matplotlib.pyplot as plt
	import numpy as np

	# === Paths and configuration
	TOP_N = 30  # number of most frequent rules to report
	INPUT_PATH = Path("data") / "labeled" / "labeled_dockerfiles.jsonl"
	TOP_RULES_PATH = Path("data") / "metadata" / "top_rules.json"

	# === Accumulators (all start empty)
	labels_counter, rules_counter = Counter(), Counter()  # tallies keyed by label / by rule
	rules_per_file, lines_with_errors_per_file = [], []  # one integer per record
	lengths, all_line_positions = [], []  # record lengths / flat list of error positions

	unique_rules_with_fixes = set()  # rules that have a fix suggestion
	fixable_counter = 0  # records with at least one fix suggestion

	print("🔍 Analizuję dane...")

	# Scan the JSONL dataset one record per line and fill the module-level
	# accumulators (label/rule tallies, per-file counts, error positions, lengths).
	# NOTE(review): the nesting below was reconstructed; the fix and error-line
	# analysis is assumed to apply to "bad" records only, while the length is
	# recorded for every record. Confirm against the original file.
	with open(INPUT_PATH, encoding="utf-8") as f:
	    for line in f:
	        # A blank line (for example a trailing newline at the end of the file)
	        # is not valid JSON; skip it instead of crashing in json.loads.
	        if not line.strip():
	            continue

	        obj = json.loads(line)
	        labels_counter[obj["label"]] += 1

	        if obj["label"] == "bad":
	            rules = obj.get("rules_triggered", [])
	            rules_counter.update(rules)
	            rules_per_file.append(len(rules))

	            # Fix analysis: count records carrying at least one suggestion and
	            # remember which keys (presumably rule codes) have one.
	            fixes = obj.get("fix_suggestions", {})
	            if fixes:
	                fixable_counter += 1
	                unique_rules_with_fixes.update(fixes)

	            # Error lines (v4): the values of the "lines" mapping are the
	            # reported positions; duplicates are collapsed for the per-file
	            # count but kept in the global position list.
	            lines = list(obj.get("lines", {}).values())
	            lines_with_errors_per_file.append(len(set(lines)))
	            all_line_positions.extend(lines)

	        # File length. The statistics and histogram built from `lengths` are
	        # labelled as line counts, so a string is measured in lines rather
	        # than characters. Any other container is assumed to hold one line
	        # per element (TODO confirm against the dataset schema).
	        content = obj["content"]
	        if isinstance(content, str):
	            lengths.append(len(content.splitlines()))
	        else:
	            lengths.append(len(content))

	# === Overall statistics
	# Build the whole report first, then emit it in a single print call.
	summary_rows = [
	    "\n📊 Statystyki:",
	    f"✅ Good: {labels_counter['good']}",
	    f"❌ Bad: {labels_counter['bad']}",
	    f"🧩 Łączna liczba unikalnych reguł: {len(rules_counter)}",
	    f"🛠 Plików z co najmniej jednym możliwym fixem: {fixable_counter}",
	    f"🔧 Liczba unikalnych reguł z przypisanym fixem: {len(unique_rules_with_fixes)}",
	]
	print("\n".join(summary_rows))

	# === Top N rules
	top_rules = rules_counter.most_common(TOP_N)  # (rule, count) pairs, most frequent first
	print(f"\n🏆 Top {TOP_N} najczęściej łamanych reguł:")
	for rule_code, hit_count in top_rules:
	    print(f" {rule_code}: {hit_count}x")

	# === Save the top N rule codes (without their counts) as a JSON list
	top_rule_codes = [pair[0] for pair in top_rules]
	TOP_RULES_PATH.parent.mkdir(parents=True, exist_ok=True)
	with TOP_RULES_PATH.open("w", encoding="utf-8") as rules_file:
	    json.dump(top_rule_codes, rules_file, indent=2)
	print(f"\n💾 Zapisano top {TOP_N} reguł do {TOP_RULES_PATH}")

	# === Dockerfile lengths
	# Summary statistics over the per-record lengths collected while loading.
	lengths_np = np.array(lengths)
	print("\n📏 Długość Dockerfile (linie):")
	if lengths_np.size:
	    print(f" Średnia: {lengths_np.mean():.2f}")
	    print(f" Mediana: {np.median(lengths_np):.0f}")
	    print(f" Min: {lengths_np.min()}")
	    print(f" Max: {lengths_np.max()}")
	else:
	    # On an empty array min()/max() raise ValueError and mean()/median()
	    # return nan with a RuntimeWarning, so report the missing data instead.
	    print(" Brak danych")

	# === Histograms
	PLOTS_DIR = Path("data/metadata")
	PLOTS_DIR.mkdir(parents=True, exist_ok=True)


	def _save_histogram(values, *, bins, color, title, xlabel, ylabel, filename):
	    """Draw one histogram of *values* and save it as ``PLOTS_DIR / filename``.

	    ``bins`` is passed straight to ``plt.hist`` (a bin count or a sequence of
	    bin edges). ``title``, ``xlabel`` and ``ylabel`` are the plot labels.
	    The figure is closed after saving so that pyplot does not keep it alive.
	    """
	    fig = plt.figure()
	    plt.hist(values, bins=bins, color=color, edgecolor="black")
	    plt.title(title)
	    plt.xlabel(xlabel)
	    plt.ylabel(ylabel)
	    plt.grid(True)
	    plt.tight_layout()
	    plt.savefig(PLOTS_DIR / filename)
	    plt.close(fig)


	# 1. File lengths (always drawn, even when there is no data)
	_save_histogram(
	    lengths_np,
	    bins=40,
	    color="skyblue",
	    title="Rozkład długości Dockerfile",
	    xlabel="Liczba linii",
	    ylabel="Liczba plików",
	    filename="dockerfile_length_hist.png",
	)

	# 2. Rules per file. Unit-width bins start at 1, so a count of 0 falls
	# outside the bin edges and is not drawn.
	if rules_per_file:
	    _save_histogram(
	        rules_per_file,
	        bins=range(1, max(rules_per_file) + 2),
	        color="salmon",
	        title="Liczba reguł naruszonych na plik",
	        xlabel="Liczba reguł",
	        ylabel="Liczba plików",
	        filename="rules_per_file_hist.png",
	    )

	# 3. Distinct error lines per file (same unit-width binning as above)
	if lines_with_errors_per_file:
	    _save_histogram(
	        lines_with_errors_per_file,
	        bins=range(1, max(lines_with_errors_per_file) + 2),
	        color="orchid",
	        title="Liczba linii z błędami w pliku",
	        xlabel="Liczba linii z błędami",
	        ylabel="Liczba plików",
	        filename="error_lines_per_file_hist.png",
	    )

	# 4. Distribution of error positions across all files
	if all_line_positions:
	    _save_histogram(
	        all_line_positions,
	        bins=50,
	        color="gold",
	        title="Rozkład pozycji błędów (linie)",
	        xlabel="Numer linii",
	        ylabel="Liczba błędów",
	        filename="line_positions_hist.png",
	    )

	print("\n📊 Zapisano wykresy do data/metadata/")