# binary-dockerfile-model / scripts /08_balance_dataset.py
# LeeSek's picture
# Add scripts
# e9b8340 verified
# 08_balance_dataset.py
import json
import random
from pathlib import Path
from collections import Counter
import shutil
# === Paths
INPUT_PATH = Path("data") / "labeled" / "labeled_dockerfiles.jsonl"
BACKUP_PATH = INPUT_PATH.with_name("labeled_dockerfiles_backup.jsonl")
TOP_RULES_PATH = Path("data") / "metadata" / "top_rules.json"
# The balanced dataset is written back over the input file.
OUTPUT_PATH = INPUT_PATH

# === Limits
MAX_GOOD = 1_500  # cap on the number of randomly sampled "good" records
MAX_BAD = 15_000  # cap on the number of selected "bad" records
TOP_N = 30  # how many leading entries of the top-rules list are used
# === Backup
# The output overwrites the input file, so keep a copy of the input.
# The copy is made only when no backup file exists yet; an existing backup
# is never replaced.
if INPUT_PATH.exists():
    if BACKUP_PATH.exists():
        print(f"ℹ️ Kopia zapasowa już istnieje: {BACKUP_PATH.name}")
    else:
        print(f"📦 Tworzę kopię zapasową → {BACKUP_PATH.name}")
        shutil.copy(INPUT_PATH, BACKUP_PATH)
# === Load the top rules
# Only the first TOP_N entries of the JSON list are kept, as a set for
# fast membership tests and intersections.
top_rules = set(json.loads(TOP_RULES_PATH.read_text(encoding="utf-8"))[:TOP_N])
print(f"🏆 Używamy top {TOP_N} reguł")
# === Load the data
# Read the JSONL input and split the records by label. "good" records are
# all kept; "bad" records are kept only when they trigger at least one of
# the selected top rules. Records with any other label are ignored.
print("🔍 Wczytywanie danych...")
good_samples = []
bad_samples = []
with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        # A blank line is not a record and would make json.loads raise
        # JSONDecodeError, so skip it instead of aborting the whole run.
        if not line.strip():
            continue
        obj = json.loads(line)
        if obj["label"] == "good":
            good_samples.append(obj)
        elif obj["label"] == "bad":
            rules = set(obj.get("rules_triggered", []))
            if rules & top_rules:
                bad_samples.append(obj)
print(f"✅ Good: {len(good_samples)} | ❌ Bad zawierające top {TOP_N} reguły: {len(bad_samples)}")
# === Random selection of GOOD samples
# Take at most MAX_GOOD records; when fewer are available, take them all
# (random.sample then returns them in random order).
# NOTE(review): no random.seed call is visible in this script, so the
# selection presumably differs between runs — confirm this is intended.
good_quota = min(len(good_samples), MAX_GOOD)
balanced_good = random.sample(good_samples, good_quota)
# === Selection of BAD samples by rarity of the top rules
print("⚙️ Oceniam pliki BAD pod kątem rzadkości reguł...")
# Count how often each top rule appears across the retained BAD samples.
# Every occurrence in "rules_triggered" is counted, including repeats
# within a single sample; rules outside top_rules are ignored.
rule_freq = Counter(
    rule
    for bad in bad_samples
    for rule in bad.get("rules_triggered", [])
    if rule in top_rules
)
def compute_score(sample):
    """Return the rarity score of *sample*.

    The score is the sum of ``1 / rule_freq[rule]`` over the distinct top
    rules listed in the sample's ``"rules_triggered"`` (missing key counts
    as no rules). Rules with a zero count are skipped, which avoids a
    division by zero. A sample with no top rules scores ``0``.
    """
    matched = set(sample.get("rules_triggered", [])) & top_rules
    counts = (rule_freq[rule] for rule in matched)
    return sum(1 / count for count in counts if count > 0)
# Rank the BAD samples from highest to lowest rarity score and keep the
# first MAX_BAD of them. The second key element is the negated number of
# distinct top rules in the sample; with a descending sort this puts
# samples with fewer top rules first among samples of equal score.
scored_bad = list(bad_samples)
scored_bad.sort(
    key=lambda s: (
        compute_score(s),
        -len(set(s.get("rules_triggered", [])) & top_rules),
    ),
    reverse=True,
)
balanced_bad = scored_bad[:MAX_BAD]
# === Merge and save
# Combine both classes, shuffle them so the labels are interleaved, and
# write one JSON object per line to OUTPUT_PATH (creating its directory
# when needed).
balanced_all = balanced_good + balanced_bad
random.shuffle(balanced_all)
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
    for rec in balanced_all:
        json.dump(rec, f_out)
        f_out.write("\n")
# Separate the record count from the file name so the two do not run
# together in the summary line.
print(f"\n✅ Zapisano zbalansowany zbiór (tylko top {TOP_N} reguły): {len(balanced_all)} → {OUTPUT_PATH.name}")
print(f" - Good: {len(balanced_good)}")
print(f" - Bad: {len(balanced_bad)}")