# 12.1_predict_binary_file.py
# python scripts/12.1_predict_binary_file.py test/Dockerfile --debug
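#
# Loads the fine-tuned binary classifier from MODEL_PATH and prints a
# good/bad prediction (with class probabilities) for a single Dockerfile.
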
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import sys
from pathlib import Path
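
# Fine-tuned checkpoint directory and maximum input length in tokens.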
MODEL_PATH = Path("models/binary/final")
MAX_LENGTH = 512


def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    model.eval()  # inference mode: disables dropout and other training-only behavior
    return tokenizer, model


def predict(filepath: Path, tokenizer, model, debug: bool = False):
    text = filepath.read_text(encoding="utf-8")
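    # Tokenize the whole file as a single sequence; content beyond MAX_LENGTH
    # tokens is truncated, and shorter inputs are padded up to MAX_LENGTH.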
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
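    # Forward pass without gradient tracking; this is inference only.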
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=1).squeeze()
    pred = torch.argmax(probs).item()
    label = "good" if pred == 0 else "bad"  # class 0 = good, class 1 = bad
print(f"\n🧪 Predykcja dla pliku: {filepath.name}")
print(f"📄 Długość: {len(text.splitlines())} linii")
print(f"📊 Wynik: {label.upper()}")
print(f"🔢 Prawdopodobieństwo: good={probs[0]:.4f}, bad={probs[1]:.4f}")
    if debug:
        print("\n🛠 DEBUG INFO:")
        print(f"✏️ Raw text (first 300 characters):\n{text[:300]}")
        print(f"🔢 Token count: {len(inputs['input_ids'][0])}")
        print(f"📈 Logits: {logits.tolist()[0]}")
        print(f"📊 Softmax: good={probs[0].item():.5f}, bad={probs[1].item():.5f}")


def main():
    if len(sys.argv) < 2:
        print("❌ Usage: python3 12.1_predict_binary_file.py /path/to/Dockerfile [--debug]")
        sys.exit(1)
    filepath = Path(sys.argv[1])
    debug = "--debug" in sys.argv
    if not filepath.exists():
        print(f"❌ File {filepath} does not exist.")
        sys.exit(1)
    tokenizer, model = load_model_and_tokenizer()
    predict(filepath, tokenizer, model, debug=debug)


if __name__ == "__main__":
    main()