# 12.1_predict_binary_file.py
# Usage: python scripts/12.1_predict_binary_file.py test/Dockerfile --debug

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import sys
from pathlib import Path

MODEL_PATH = Path("models/binary/final")
MAX_LENGTH = 512


def load_model_and_tokenizer():
    # Load the fine-tuned binary classifier and its tokenizer from disk.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    model.eval()
    return tokenizer, model


def predict(filepath: Path, tokenizer, model, debug: bool = False):
    text = filepath.read_text(encoding="utf-8")

    # Tokenize the whole file as a single sequence, truncated/padded to MAX_LENGTH.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=1).squeeze()
        pred = torch.argmax(probs).item()

    # Class 0 = "good", class 1 = "bad".
    label = "good" if pred == 0 else "bad"

    print(f"\n🧪 Prediction for file: {filepath.name}")
    print(f"📄 Length: {len(text.splitlines())} lines")
    print(f"📊 Result: {label.upper()}")
    print(f"🔢 Probabilities: good={probs[0]:.4f}, bad={probs[1]:.4f}")

    if debug:
        print("\n🛠 DEBUG INFO:")
        print(f"✏️ Raw text (first 300 characters):\n{text[:300]}")
        print(f"🔢 Token count: {len(inputs['input_ids'][0])}")
        print(f"📈 Logits: {logits.tolist()[0]}")
        print(f"📊 Softmax: good={probs[0].item():.5f}, bad={probs[1].item():.5f}")


def main():
    if len(sys.argv) < 2:
        print("❌ Usage: python3 12.1_predict_binary_file.py /path/to/Dockerfile [--debug]")
        sys.exit(1)

    filepath = Path(sys.argv[1])
    debug = "--debug" in sys.argv

    if not filepath.exists():
        print(f"❌ File {filepath} does not exist.")
        sys.exit(1)

    tokenizer, model = load_model_and_tokenizer()
    predict(filepath, tokenizer, model, debug=debug)


if __name__ == "__main__":
    main()