# 07_balance_dataset.py
"""Balance the labeled Dockerfile dataset in place.

Keeps at most MAX_GOOD randomly chosen "good" samples and at most MAX_BAD
"bad" samples. Only "bad" samples that trigger at least one of the top
TOP_N rules are considered, and those triggering rarer rules are preferred.
The result overwrites INPUT_PATH; a one-time backup of the original file is
kept at BACKUP_PATH.
"""
import json
import random
import shutil
from collections import Counter
from pathlib import Path

# === Paths
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
BACKUP_PATH = Path("data/labeled/labeled_dockerfiles_backup.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_PATH = INPUT_PATH  # the balanced set replaces the input file

MAX_GOOD = 1500
MAX_BAD = 15000
TOP_N = 30


def triggered_rules(sample):
    """Return the sample's ``rules_triggered`` list, or ``[]`` if missing/null."""
    return sample.get("rules_triggered") or []


def parse_samples(lines, top_rules):
    """Split JSONL lines into good samples and relevant bad samples.

    Args:
        lines: Iterable of JSONL text lines (e.g. an open text file).
        top_rules: Set of rule identifiers that make a "bad" sample relevant.

    Returns:
        ``(good_samples, bad_samples)``. A "bad" record is kept only when it
        triggers at least one rule from ``top_rules``. Blank lines and
        records with any other (or no) label are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    good_samples = []
    bad_samples = []
    for line in lines:
        # A trailing newline or stray empty line must not abort the run.
        if not line.strip():
            continue
        obj = json.loads(line)
        label = obj.get("label")
        if label == "good":
            good_samples.append(obj)
        elif label == "bad" and set(triggered_rules(obj)) & top_rules:
            bad_samples.append(obj)
    return good_samples, bad_samples


def compute_score(sample, rule_freq, top_rules):
    """Return the rarity score of a sample.

    The score is the sum of ``1 / frequency`` over the distinct top rules
    the sample triggers, so rarely seen rules contribute more.
    """
    rules = set(triggered_rules(sample)) & top_rules
    return sum(1 / rule_freq[r] for r in rules if rule_freq.get(r, 0) > 0)


def rank_bad_samples(bad_samples, top_rules):
    """Return ``bad_samples`` ordered from the rarest rules to the commonest.

    Rule frequencies are counted over every occurrence in ``bad_samples``.
    Samples are sorted by descending score; on equal score, samples that
    trigger fewer top rules come first (the negated count is sorted in
    reverse). Fully tied samples keep their input order.
    """
    rule_freq = Counter()
    for sample in bad_samples:
        rule_freq.update(r for r in triggered_rules(sample) if r in top_rules)

    def sort_key(sample):
        # Intersect once and reuse it for both parts of the key.
        rules = set(triggered_rules(sample)) & top_rules
        score = sum(1 / rule_freq[r] for r in rules if rule_freq[r] > 0)
        return score, -len(rules)

    return sorted(bad_samples, key=sort_key, reverse=True)


def write_jsonl(records, output_path):
    """Write ``records`` as JSON Lines to ``output_path`` without truncating it early.

    The data goes to a sibling temporary file first and is then moved over
    the target, so a failure while writing leaves the existing file intact.
    This matters because the output path is also the script's input.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f_out:
            f_out.writelines(json.dumps(rec) + "\n" for rec in records)
        tmp_path.replace(output_path)
    finally:
        # Only still present if writing or the final move failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main():
    """Run the balancing pipeline on the configured paths."""
    # === Backup (created once; later runs never overwrite it)
    if INPUT_PATH.exists():
        if not BACKUP_PATH.exists():
            print(f"📦 Tworzę kopię zapasową → {BACKUP_PATH.name}")
            shutil.copy(INPUT_PATH, BACKUP_PATH)
        else:
            print(f"ℹ️ Kopia zapasowa już istnieje: {BACKUP_PATH.name}")

    # === Load the top N rules
    with open(TOP_RULES_PATH, encoding="utf-8") as f:
        top_rules = set(json.load(f)[:TOP_N])
    print(f"🏆 Używamy top {TOP_N} reguł")

    # === Load the data
    print("🔍 Wczytywanie danych...")
    with open(INPUT_PATH, encoding="utf-8") as f:
        good_samples, bad_samples = parse_samples(f, top_rules)
    print(f"✅ Good: {len(good_samples)} | ❌ Bad zawierające top {TOP_N} reguły: {len(bad_samples)}")

    # === Random selection of GOOD samples
    balanced_good = random.sample(good_samples, min(MAX_GOOD, len(good_samples)))

    # === Selection of BAD samples by rarity of the top rules
    print("⚙️ Oceniam pliki BAD pod kątem rzadkości reguł...")
    balanced_bad = rank_bad_samples(bad_samples, top_rules)[:MAX_BAD]

    # === Merge and save
    balanced_all = balanced_good + balanced_bad
    random.shuffle(balanced_all)
    write_jsonl(balanced_all, OUTPUT_PATH)

    print(f"\n✅ Zapisano zbalansowany zbiór (tylko top {TOP_N} reguły): {len(balanced_all)} → {OUTPUT_PATH.name}")
    print(f" - Good: {len(balanced_good)}")
    print(f" - Bad: {len(balanced_bad)}")


if __name__ == "__main__":
    main()