# GuardLLM / app.py
# G.
# Restructure layout: all prompt selection left, analysis only right
# ca68436
"""
GuardLLM — Prompt Security Visualizer
Aleph Beth design system applied. Editorial calm, bilingual FR/EN posture.
Powered by Llama Prompt Guard 2 (86M) and neuralchemy/Prompt-injection-dataset.
"""
import logging
import os
import sys
import json
import gradio as gr
import torch
import numpy as np
import plotly.graph_objects as go
from pathlib import Path
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Root logging: INFO and above, written to stdout with a timestamped,
# level-tagged line format.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Module-wide logger used by every callback and loader in this file.
logger = logging.getLogger("GuardLLM")
# ---------------------------------------------------------------------------
# Aleph Beth — palette tokens (mirrored from colors_and_type.css)
# ---------------------------------------------------------------------------
# Palette tokens keyed as "<family>_<step>"; values are hex colour strings.
# Written in keyword form so each token reads like a named constant.
AB = dict(
    # Ink
    ink_950="#0B1626",
    ink_900="#11203A",
    ink_800="#1B2F4E",
    ink_700="#2A4566",
    ink_600="#44607F",
    ink_500="#6B829D",
    ink_400="#95A6BB",
    ink_300="#BCC8D6",
    ink_200="#DAE1EA",
    ink_100="#ECF0F5",
    ink_50="#F6F8FB",
    # Parchment
    parchment_50="#FCFAF2",
    parchment_100="#F8F3E6",
    parchment_200="#ECE5D2",
    parchment_300="#DDD3B9",
    parchment_400="#C2B695",
    # Gilt
    gilt_50="#FCEEDA",
    gilt_100="#F8D9A4",
    gilt_200="#F2BD72",
    gilt_300="#EAA046",
    gilt_400="#DC8B2A",
    gilt_500="#A66718",
    gilt_600="#7A4912",
    # Signal
    signal_100="#C9DDEB",
    signal_200="#9BBFD9",
    signal_300="#6FA0C2",
    signal_400="#4A82AA",
    signal_500="#36678C",
    signal_600="#244D6B",
    # Threat
    threat_400="#D44A3E",
    threat_300="#E07065",
    threat_100="#F8DAD5",
    # Safe
    safe_400="#3F8F6E",
    safe_300="#66AB8C",
    safe_100="#D4E8DD",
)
# Category colors stay within the brand families — no neon, no inventions.
# Maps each dataset category key to a palette colour. Every colour is looked
# up in AB by token name, so the chart never uses an off-palette value.
CATEGORY_COLORS = {
    category: AB[token]
    for category, token in (
        ("benign", "safe_400"),
        ("direct_injection", "threat_400"),
        ("jailbreak", "gilt_400"),
        ("system_extraction", "gilt_600"),
        ("encoding_obfuscation", "signal_500"),
        ("persona_replacement", "gilt_300"),
        ("indirect_injection", "threat_300"),
        ("token_smuggling", "signal_600"),
        ("many_shot", "signal_400"),
        ("crescendo", "signal_200"),
        ("context_overflow", "ink_600"),
        ("prompt_leaking", "gilt_500"),
        # Fallback colour for categories without an entry of their own.
        ("unknown", "ink_400"),
    )
}
# Human-readable display label for each dataset category key.
CATEGORY_LABELS = dict(
    benign="Benign",
    direct_injection="Direct Injection",
    jailbreak="Jailbreak",
    system_extraction="System Extraction",
    encoding_obfuscation="Encoding / Obfuscation",
    persona_replacement="Persona Replacement",
    indirect_injection="Indirect Injection",
    token_smuggling="Token Smuggling",
    many_shot="Many-Shot",
    crescendo="Crescendo",
    context_overflow="Context Overflow",
    prompt_leaking="Prompt Leaking",
    unknown="Unknown",
)

# Reverse lookup (display label -> category key); on_legend_sync uses it to
# translate Plotly trace names back into category keys.
LABEL_TO_KEY = dict(zip(CATEGORY_LABELS.values(), CATEGORY_LABELS.keys()))
# ---------------------------------------------------------------------------
# Lazy-loaded risk classifier (Llama Prompt Guard 2)
# ---------------------------------------------------------------------------
# Hugging Face model identifier of the classifier.
MODEL_ID = "meta-llama/Llama-Prompt-Guard-2-86M"

# Class names in logit order: analyze_prompt treats index 0 as "Benign".
LABELS = ["Benign", "Malicious"]

# Optional Hub access token; the first non-empty environment variable wins.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")

# Cache filled by get_classifier() on first use; every slot starts as None.
_classifier = dict.fromkeys(("tokenizer", "model", "device"))
def get_classifier():
    """Return the ``(tokenizer, model, device)`` triple, loading it on first use.

    The first call imports ``transformers``, loads the tokenizer and the
    sequence-classification model for ``MODEL_ID`` (passing ``HF_TOKEN`` when
    one is set), switches the model to eval mode, moves it to CUDA when
    available (CPU otherwise) and stores all three in ``_classifier``.
    Later calls return the stored objects.
    """
    cached = _classifier
    if cached["model"] is not None:
        return cached["tokenizer"], cached["model"], cached["device"]

    logger.info("Lazy-loading Llama Prompt Guard 2...")
    # Imported here so the module can be imported without paying for
    # transformers until a classification is actually requested.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    auth = {"token": HF_TOKEN} if HF_TOKEN else {}
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **auth)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, **auth)
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    cached["tokenizer"] = tokenizer
    cached["model"] = model
    cached["device"] = device
    logger.info("Classifier loaded on %s", device)
    return cached["tokenizer"], cached["model"], cached["device"]
# ---------------------------------------------------------------------------
# Load precomputed t-SNE data
# ---------------------------------------------------------------------------
# Locations of the artefacts the error message below says precompute.py creates.
CACHE_DIR = Path(__file__).parent / "cache"
CACHE_FILE = CACHE_DIR / "embeddings_tsne.npz"
META_FILE = CACHE_DIR / "metadata.json"

logger.info("Loading precomputed t-SNE cache from %s", CACHE_DIR)
if not CACHE_FILE.exists() or not META_FILE.exists():
    raise RuntimeError(
        f"Cache files not found in {CACHE_DIR}. Run precompute.py first."
    )

# np.load on an .npz returns an NpzFile that keeps the archive open; the
# context manager closes it once the array has been read into memory.
with np.load(CACHE_FILE) as _npz:
    TSNE_COORDS = _npz["tsne_2d"]

with open(META_FILE, "r", encoding="utf-8") as f:
    METADATA = json.load(f)
logger.info("Loaded %d points for visualization", len(METADATA))

# The figure indexes TSNE_COORDS with metadata positions, so a size mismatch
# means the two cache files come from different precompute runs.
if len(TSNE_COORDS) != len(METADATA):
    logger.warning(
        "t-SNE coordinates (%d) and metadata (%d) differ in length",
        len(TSNE_COORDS),
        len(METADATA),
    )

# Column-wise views of the metadata, aligned by position with TSNE_COORDS.
ALL_TEXTS = [m["text"] for m in METADATA]
ALL_CATEGORIES = [m["category"] for m in METADATA]
ALL_SEVERITIES = [m["severity"] for m in METADATA]
ALL_LABELS_DS = [m["label"] for m in METADATA]
UNIQUE_CATEGORIES = sorted(set(ALL_CATEGORIES))

# Dropdown entries of the form "<index> | <category> | <70-char preview>";
# on_dropdown_select parses the leading index back out.
DROPDOWN_CHOICES = []
for i, m in enumerate(METADATA):
    preview = m["text"][:70].replace("\n", " ")
    if len(m["text"]) > 70:
        preview += "..."
    DROPDOWN_CHOICES.append(f"{i} | {m['category']} | {preview}")
# ---------------------------------------------------------------------------
# Analysis function
# ---------------------------------------------------------------------------
def analyze_prompt(text):
    """Classify *text* and return ``(probabilities, safety)``.

    ``probabilities`` maps each name in ``LABELS`` to its softmax probability;
    ``safety`` is the probability at index 0 (the "Benign" class).  Empty,
    whitespace-only or missing input returns ``({}, 0.0)`` without touching
    the model.
    """
    if not (text and text.strip()):
        return {}, 0.0

    tokenizer, model, device = get_classifier()
    # Inputs longer than 512 tokens are truncated by the tokenizer.
    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
    prob_dict = {name: float(probs[pos]) for pos, name in enumerate(LABELS)}
    return prob_dict, float(probs[0])
# ---------------------------------------------------------------------------
# Plotly figure — parchment surface, ink axes, restrained palette
# ---------------------------------------------------------------------------
def build_tsne_figure(selected_categories=None):
    """Build the t-SNE scatter figure, one trace per category.

    ``selected_categories`` limits the traces to the listed category keys;
    ``None`` draws every category and an empty collection draws none.  Each
    point carries its dataset index in ``customdata`` as a string.
    """

    def _axis(title_text):
        # Both axes share everything except their title.
        return dict(
            title=dict(text=title_text, font=dict(color=AB["ink_500"], size=11)),
            showgrid=True,
            gridcolor="rgba(17,32,58,0.06)",
            zeroline=False,
            color=AB["ink_500"],
        )

    figure = go.Figure()
    for category in UNIQUE_CATEGORIES:
        # Decide once per category rather than once per point.
        if selected_categories is not None and category not in selected_categories:
            continue

        members = [pos for pos, name in enumerate(ALL_CATEGORIES) if name == category]
        if not members:
            continue

        display_name = CATEGORY_LABELS.get(category, category)

        hover_rows = []
        for pos in members:
            full_text = ALL_TEXTS[pos]
            snippet = full_text[:80].replace("\n", " ")
            if len(full_text) > 80:
                snippet += "..."
            severity = ALL_SEVERITIES[pos] or "benign"
            hover_rows.append(
                f"<b>{display_name}</b><br>"
                f"Severity — {severity}<br>"
                f"Index — {pos}<br>"
                f"<i>{snippet}</i>"
            )

        point_style = dict(
            # Shrink markers for dense categories.
            size=5 if len(members) > 500 else 7,
            color=CATEGORY_COLORS.get(category, CATEGORY_COLORS["unknown"]),
            opacity=0.78,
            line=dict(width=0.5, color="rgba(17,32,58,0.20)"),
        )
        figure.add_trace(go.Scatter(
            x=TSNE_COORDS[members, 0].tolist(),
            y=TSNE_COORDS[members, 1].tolist(),
            mode="markers",
            name=display_name,
            marker=point_style,
            text=hover_rows,
            hoverinfo="text",
            customdata=[str(pos) for pos in members],
        ))

    title_markup = (
        "<span style='font-family: Instrument Serif, serif; font-size:18px;'>"
        "t-SNE — Prompt Security Landscape</span>"
    )
    legend_style = dict(
        title=dict(text="Category", font=dict(color=AB["ink_700"], size=11)),
        bgcolor="rgba(252,250,242,0.88)",
        bordercolor="rgba(17,32,58,0.12)",
        borderwidth=1,
        font=dict(color=AB["ink_800"], size=10),
        itemsizing="constant",
        itemclick="toggleothers",
        itemdoubleclick="toggle",
    )
    hover_style = dict(
        bgcolor=AB["parchment_50"],
        bordercolor="rgba(17,32,58,0.12)",
        font=dict(family="Geist, sans-serif", color=AB["ink_900"], size=12),
    )
    figure.update_layout(
        template="plotly_white",
        paper_bgcolor=AB["parchment_100"],
        plot_bgcolor=AB["parchment_50"],
        font=dict(family="Geist, Inter, system-ui, sans-serif", color=AB["ink_700"]),
        title=dict(
            text=title_markup,
            font=dict(color=AB["ink_900"]),
            x=0.5,
            xanchor="center",
        ),
        legend=legend_style,
        xaxis=_axis("t-SNE 1"),
        yaxis=_axis("t-SNE 2"),
        margin=dict(l=44, r=44, t=56, b=44),
        height=620,
        dragmode="pan",
        hoverlabel=hover_style,
    )
    return figure
# ---------------------------------------------------------------------------
# Callbacks
# ---------------------------------------------------------------------------
def on_filter_change(categories):
    """Rebuild the chart for the given category filter.

    A missing or empty selection is passed on as ``None``, which
    ``build_tsne_figure`` draws as "all categories".
    """
    if not categories:
        return build_tsne_figure(None)
    return build_tsne_figure(categories)
def select_all_categories():
    """Return a filter update selecting every category, plus the matching figure."""
    filter_update = gr.update(value=UNIQUE_CATEGORIES)
    figure = build_tsne_figure(UNIQUE_CATEGORIES)
    return filter_update, figure
def deselect_all_categories():
    """Return a filter update clearing the selection, plus a figure with no traces."""
    filter_update = gr.update(value=[])
    figure = build_tsne_figure([])
    return filter_update, figure
def on_legend_sync(payload):
    """Plotly legend click → sync the checkbox filter + rebuild the chart.

    ``payload`` is a JSON string of the form ``{"visible": [<trace name>, ...]}``.
    Trace names are mapped back to category keys and names that match no known
    category are dropped.  A blank payload, or any error while handling it,
    returns two no-op updates so the UI stays as it is.
    """
    if not (payload and payload.strip()):
        return gr.update(), gr.update()

    try:
        trace_names = json.loads(payload).get("visible", [])
        selected = []
        for trace_name in trace_names:
            key = LABEL_TO_KEY.get(trace_name, trace_name)
            if key in UNIQUE_CATEGORIES:
                selected.append(key)
        # An empty selection takes the same path: cleared filter, empty figure.
        return gr.update(value=selected), build_tsne_figure(selected)
    except Exception as e:
        logger.error("legend sync error: %s", e)
        return gr.update(), gr.update()
def _dataset_meta_block(category, severity, ground_truth):
    """Return the Markdown "Dataset metadata" section for one dataset prompt.

    ``category`` is shown through ``CATEGORY_LABELS`` when a label exists and
    verbatim otherwise.  The block starts with a blank line so it can be
    appended directly to the risk-assessment text.
    """
    category_label = CATEGORY_LABELS.get(category, category)
    rows = (
        "",
        "",
        "<span class='ab-eyebrow'>Dataset metadata</span>",
        f"- Category — **{category_label}**",
        f"- Severity — **{severity}**",
        f"- Ground truth — **{ground_truth}**",
        "",
    )
    return "\n".join(rows)
def _analyze_dataset_prompt(idx):
    """Classify dataset prompt *idx* and return ``(result_html, risk_markdown, text)``.

    An index outside ``[0, len(ALL_TEXTS))`` yields the idle card with an
    "Invalid index" message instead of wrapping around through Python's
    negative indexing.  The risk text is the classifier assessment followed
    by the dataset's own category, severity and ground-truth label.
    """
    if idx < 0 or idx >= len(ALL_TEXTS):
        return empty_analysis_html(), f"Invalid index — {idx}", ""

    text = ALL_TEXTS[idx]
    category = ALL_CATEGORIES[idx]
    severity = ALL_SEVERITIES[idx] or "N/A"
    ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"

    prob_dict, _ = analyze_prompt(text)
    pred_label = max(prob_dict, key=prob_dict.get)
    confidence = prob_dict[pred_label]

    result_html = build_result_html(pred_label, confidence, prob_dict, text)
    risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
    risk_text += _dataset_meta_block(category, severity, ground_truth)
    return result_html, risk_text, text


def on_dropdown_select(choice):
    """Analyze the prompt picked in the dropdown.

    ``choice`` has the form ``"<index> | <category> | <preview>"``; only the
    leading index is used.  Returns ``(result_html, risk_markdown, text)``.
    An empty choice returns the idle card; any failure is logged and reported
    in the Markdown slot rather than raised.
    """
    if not choice:
        return empty_analysis_html(), "*Select a prompt to begin.*", ""
    try:
        idx = int(choice.split(" | ")[0])
        return _analyze_dataset_prompt(idx)
    except Exception as e:
        # UI boundary: keep the app responsive, but record the traceback.
        logger.exception("Error: %s", e)
        return empty_analysis_html(), f"Error — {e}", ""


def on_index_input(idx_str):
    """Analyze the prompt whose dataset index arrives as the string *idx_str*.

    The value comes from the hidden ``#click-index-input`` field that the
    Plotly click bridge fills.  Returns ``(result_html, risk_markdown, text)``.
    A blank value returns the idle card; any failure is logged and reported
    in the Markdown slot rather than raised.
    """
    if not idx_str or not idx_str.strip():
        return empty_analysis_html(), "*Click a point on the chart.*", ""
    try:
        idx = int(idx_str.strip())
        return _analyze_dataset_prompt(idx)
    except Exception as e:
        # UI boundary: keep the app responsive, but record the traceback.
        logger.exception("Error: %s", e)
        return empty_analysis_html(), f"Error — {e}", ""
def on_manual_analyze(text):
    """Analyze free-form *text* and return ``(result_html, risk_markdown)``.

    Blank input returns the idle card and an empty Markdown string without
    running the classifier.
    """
    if not (text and text.strip()):
        return empty_analysis_html(), ""

    probabilities, _safety = analyze_prompt(text)
    # First label with the highest probability, paired with that probability.
    verdict, score = max(probabilities.items(), key=lambda item: item[1])

    card = build_result_html(verdict, score, probabilities, text)
    assessment = build_risk_assessment(verdict, score, probabilities)
    return card, assessment
# ---------------------------------------------------------------------------
# UI builders — editorial, parchment surface, ink type, no emoji
# ---------------------------------------------------------------------------
def empty_analysis_html():
    """Return the static "Idle" placeholder card shown before any analysis."""
    markup = (
        "",
        '<div class="ab-card ab-card--quiet">',
        '<div class="ab-eyebrow">Idle</div>',
        '<p class="ab-prose">',
        "Click a point on the chart, pick a prompt from the list,",
        "or paste your own below. The classifier runs on demand.",
        "</p>",
        "</div>",
        "",
    )
    return "\n".join(markup)
def _prompt_preview(text, limit=180):
    """Return an HTML-safe preview of *text*, cut to *limit* characters.

    The text is truncated first and then escaped, so ``&``, ``<``, ``>`` and
    quotes in untrusted prompt text can never be read as markup or entities.
    An ellipsis is appended when the original text was longer than *limit*.
    """
    # Local import keeps this block independent of the file's import header.
    from html import escape

    preview = escape(text[:limit])
    if len(text) > limit:
        preview += "…"
    return preview


def build_result_html(label, confidence, probs, text):
    """Render the verdict card for one classified prompt.

    Args:
        label: Predicted class name; ``"Benign"`` selects the safe accent
            colour, anything else the threat colour.
        confidence: Probability of ``label``, in ``[0, 1]``.
        probs: Mapping with one probability per name in ``LABELS``.
        text: The analyzed prompt; shown as an escaped, truncated quote.

    Returns:
        An HTML string with the verdict, the 0-100 safety score (the Benign
        probability), one probability bar per class and the prompt preview.
    """
    is_safe = label == "Benign"
    accent = AB["safe_400"] if is_safe else AB["threat_400"]
    marker = "●"  # geometric primitive instead of emoji
    pct = confidence * 100

    safety_score = probs["Benign"] * 100
    # Score colour bands: >= 70 safe, >= 40 caution, below that threat.
    if safety_score >= 70:
        safety_color = AB["safe_400"]
    elif safety_score >= 40:
        safety_color = AB["gilt_400"]
    else:
        safety_color = AB["threat_400"]

    bar_parts = []
    for lbl in LABELS:
        p = probs[lbl] * 100
        c = AB["safe_400"] if lbl == "Benign" else AB["threat_400"]
        bar_parts.append(f"""
<div class="ab-bar">
<div class="ab-bar__row">
<span class="ab-bar__label">{lbl}</span>
<span class="ab-bar__value">{p:.1f}%</span>
</div>
<div class="ab-bar__track">
<div class="ab-bar__fill" style="width:{p}%; background:{c};"></div>
</div>
</div>
""")
    bars_html = "".join(bar_parts)

    preview = _prompt_preview(text)
    return f"""
<div class="ab-card">
<div class="ab-result__head">
<span class="ab-result__marker" style="color:{accent};">{marker}</span>
<div>
<div class="ab-eyebrow">Verdict</div>
<div class="ab-result__label" style="color:{accent};">{label}</div>
<div class="ab-caption">Confidence — {pct:.1f}%</div>
</div>
</div>
<div class="ab-divider"></div>
<div class="ab-eyebrow">Safety score</div>
<div class="ab-score">
<div class="ab-score__value" style="color:{safety_color};">{safety_score:.0f}<span>/100</span></div>
<div class="ab-score__track">
<div class="ab-score__fill" style="width:{safety_score}%;"></div>
</div>
</div>
<div class="ab-eyebrow" style="margin-top:18px;">Class probabilities</div>
<div class="ab-bars">{bars_html}</div>
<div class="ab-quote">
<div class="ab-eyebrow">Analyzed prompt</div>
<blockquote>“{preview}”</blockquote>
</div>
</div>
"""
def build_risk_assessment(label, confidence, probs):
    """Return the Markdown risk summary for one classification.

    The risk level depends on the predicted ``label`` and on whether
    ``confidence`` is strictly above 0.85:

    * Benign, confident      -> Low
    * Benign, otherwise      -> Moderate
    * not Benign, confident  -> Critical
    * not Benign, otherwise  -> High

    ``probs`` must contain the keys ``"Benign"`` and ``"Malicious"``.
    """
    benign_pct = probs["Benign"] * 100
    malicious_pct = probs["Malicious"] * 100

    # Decision table keyed by (is_benign, is_confident).
    verdicts = {
        (True, True): (
            "Low",
            "The request appears **safe**. No injection or jailbreak patterns were detected.",
        ),
        (True, False): (
            "Moderate",
            "Likely benign, with moderate confidence. The wording may be ambiguous.",
        ),
        (False, True): (
            "Critical",
            "**Malicious request detected** with high confidence. Likely injection or jailbreak.",
        ),
        (False, False): (
            "High",
            "**Malicious request detected.** Possible injection or jailbreak — review recommended.",
        ),
    }
    level, desc = verdicts[(label == "Benign", bool(confidence > 0.85))]

    header = f"<span class='ab-eyebrow'>Risk level — {level}</span>\n\n{desc}\n\n"
    bullets = [
        f"- Safety score — **{benign_pct:.0f}/100**\n",
        f"- Predicted class — **{label}** ({confidence*100:.1f}%)\n",
        f"- P(Benign) — {benign_pct:.1f}% &nbsp;·&nbsp; P(Malicious) — {malicious_pct:.1f}%\n",
    ]
    return header + "".join(bullets)
def build_stats_html():
    """Render the "Composition" card summarising the loaded dataset.

    Shows the total number of prompts, the benign count (``label == 0``),
    the malicious count (everything else) and one row per category with its
    count and share, ordered from the largest category to the smallest.
    """
    total = len(METADATA)
    n_benign = len([entry for entry in METADATA if entry["label"] == 0])
    n_malicious = total - n_benign

    tally = {}
    for entry in METADATA:
        key = entry["category"]
        tally[key] = tally.get(key, 0) + 1

    # Stable descending sort: equal counts keep first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    rows = []
    for cat, count in ranked:
        color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
        share = count / total * 100
        name = CATEGORY_LABELS.get(cat, cat)
        rows.append(
            f'<div class="ab-stats__row">'
            f'<span class="ab-stats__dot" style="background:{color};"></span>'
            f'<span class="ab-stats__name">{name}</span>'
            f'<span class="ab-stats__count">{count:,} <em>({share:.1f}%)</em></span>'
            f'</div>'
        )
    cats_html = "".join(rows)

    safe_color = AB['safe_400']
    threat_color = AB['threat_400']
    return f"""
<div class="ab-card">
<div class="ab-eyebrow">Dataset</div>
<h3 class="ab-h3">Composition</h3>
<div class="ab-kpi-row">
<div class="ab-kpi">
<div class="ab-kpi__label">Total</div>
<div class="ab-kpi__value">{total:,}</div>
</div>
<div class="ab-kpi">
<div class="ab-kpi__label" style="color:{safe_color};">Benign</div>
<div class="ab-kpi__value" style="color:{safe_color};">{n_benign:,}</div>
</div>
<div class="ab-kpi">
<div class="ab-kpi__label" style="color:{threat_color};">Malicious</div>
<div class="ab-kpi__value" style="color:{threat_color};">{n_malicious:,}</div>
</div>
</div>
<div class="ab-stats">{cats_html}</div>
</div>
"""
# ---------------------------------------------------------------------------
# JavaScript bridge: Plotly clicks → Gradio hidden input
# ---------------------------------------------------------------------------
# Source text of a browser-side arrow function (where it is registered with
# Gradio is not visible in this part of the file). Once running, it:
#   * waits for the Plotly element inside #tsne-chart, retrying every 500 ms;
#   * on a point click, writes the point's `customdata` (the dataset index)
#     into the #click-index-input field;
#   * on a legend click or double-click, waits 60 ms for Plotly to apply its
#     own toggle, then writes {"visible": [trace names]} as JSON into the
#     #legend-sync-input field;
#   * watches #tsne-chart with a MutationObserver so a re-rendered plot gets
#     the same handlers, attached at most once per element.
# Values are written through the native `value` setter and followed by
# `input` and `change` events so the framework notices the update.
PLOTLY_CLICK_JS = """
() => {
function pushToHidden(selector, value) {
const el = document.querySelector(selector + ' textarea')
|| document.querySelector(selector + ' input');
if (!el) return;
const proto = el.tagName === 'TEXTAREA'
? window.HTMLTextAreaElement.prototype
: window.HTMLInputElement.prototype;
const setter = Object.getOwnPropertyDescriptor(proto, 'value').set;
setter.call(el, String(value));
el.dispatchEvent(new Event('input', { bubbles: true }));
setTimeout(() => el.dispatchEvent(new Event('change', { bubbles: true })), 40);
}
function attachHandlers(plotEl) {
if (!plotEl || plotEl._abHandlersAttached) return;
plotEl._abHandlersAttached = true;
// Point click → push index to #click-index-input
plotEl.on('plotly_click', function (data) {
if (data && data.points && data.points.length > 0) {
const idx = data.points[0].customdata;
if (idx !== undefined && idx !== null) {
pushToHidden('#click-index-input', idx);
}
}
});
// Legend click → after toggleothers settles, read visible trace names
// and push them to #legend-sync-input as JSON {visible: [...]}.
plotEl.on('plotly_legendclick', function (ed) {
setTimeout(() => {
const visible = (plotEl.data || [])
.filter(t => t.visible === undefined || t.visible === true)
.map(t => t.name);
pushToHidden('#legend-sync-input', JSON.stringify({visible: visible}));
}, 60);
return true; // allow Plotly to process its default toggleothers
});
plotEl.on('plotly_legenddoubleclick', function (ed) {
setTimeout(() => {
const visible = (plotEl.data || [])
.filter(t => t.visible === undefined || t.visible === true)
.map(t => t.name);
pushToHidden('#legend-sync-input', JSON.stringify({visible: visible}));
}, 60);
return true;
});
}
function setup() {
const plotEl = document.querySelector('#tsne-chart .js-plotly-plot');
if (!plotEl) { setTimeout(setup, 500); return; }
attachHandlers(plotEl);
const root = document.querySelector('#tsne-chart') || document.body;
const observer = new MutationObserver(() => {
const newPlot = document.querySelector('#tsne-chart .js-plotly-plot');
if (newPlot) attachHandlers(newPlot);
});
observer.observe(root, { childList: true, subtree: true });
}
setTimeout(setup, 1000);
}
"""
# ---------------------------------------------------------------------------
# Aleph Beth — global CSS
# ---------------------------------------------------------------------------
# Stylesheet text for the whole page (presumably handed to Gradio as custom
# CSS; the call site is not visible in this part of the file). In order:
#   * web-font import and `--ab-*` custom properties (a subset of the AB
#     palette above, plus border, shadow, easing and font-stack tokens);
#   * base canvas and the `.ab-*` component classes used by the HTML
#     builders in this file (header, eyebrow, card, how-to, result card,
#     score, bars, quote, KPI and stats rows, footer);
#   * overrides for Gradio's own blocks, buttons, inputs, labels, dropdowns,
#     checkbox groups and Markdown;
#   * rules that keep #click-index-input and #legend-sync-input in the DOM
#     but visually hidden, since the Plotly click bridge writes into them.
ALEPH_BETH_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600;700&family=Geist+Mono:wght@400;500;600&family=Frank+Ruhl+Libre:wght@400;500&family=Amiri:wght@400;700&display=swap');
:root, .gradio-container {
--ab-ink-950:#0B1626; --ab-ink-900:#11203A; --ab-ink-800:#1B2F4E;
--ab-ink-700:#2A4566; --ab-ink-600:#44607F; --ab-ink-500:#6B829D;
--ab-ink-400:#95A6BB; --ab-ink-300:#BCC8D6; --ab-ink-200:#DAE1EA;
--ab-ink-100:#ECF0F5; --ab-ink-50:#F6F8FB;
--ab-parchment-50:#FCFAF2; --ab-parchment-100:#F8F3E6;
--ab-parchment-200:#ECE5D2; --ab-parchment-300:#DDD3B9;
--ab-gilt-300:#EAA046; --ab-gilt-400:#DC8B2A; --ab-gilt-500:#A66718; --ab-gilt-600:#7A4912;
--ab-signal-300:#6FA0C2; --ab-signal-400:#4A82AA; --ab-signal-500:#36678C;
--ab-threat-400:#D44A3E; --ab-safe-400:#3F8F6E;
--ab-border: rgba(17,32,58,0.12);
--ab-border-subtle: rgba(17,32,58,0.06);
--ab-shadow-sm: 0 2px 6px rgba(17,32,58,0.07), 0 1px 2px rgba(17,32,58,0.04);
--ab-shadow-md: 0 8px 20px rgba(17,32,58,0.08), 0 2px 4px rgba(17,32,58,0.05);
--ab-ease: cubic-bezier(0.16, 1, 0.3, 1);
--font-display: 'Instrument Serif', 'Cormorant Garamond', serif;
--font-body: 'Geist', 'Inter', system-ui, sans-serif;
--font-mono: 'Geist Mono', 'JetBrains Mono', ui-monospace, monospace;
}
/* ---------- Base canvas ---------- */
.gradio-container, body, html {
background: var(--ab-parchment-100) !important;
color: var(--ab-ink-900) !important;
font-family: var(--font-body) !important;
font-feature-settings: 'ss01', 'cv01';
}
.gradio-container { max-width: 1440px !important; margin: 0 auto !important; padding: 24px 32px !important; }
/* Remove Gradio gradient backgrounds */
.gradio-container *::before, .gradio-container *::after { background-image: none !important; }
/* ---------- Header / brand ---------- */
.ab-header {
padding: 18px 4px 22px;
border-bottom: 1px solid var(--ab-border);
margin-bottom: 24px;
display: flex; align-items: baseline; justify-content: space-between; gap: 24px;
flex-wrap: wrap;
}
.ab-header__brand {
display: flex; align-items: baseline; gap: 14px;
}
.ab-header__mark {
font-family: var(--font-display);
font-size: 32px; line-height: 1;
color: var(--ab-gilt-500);
letter-spacing: -0.01em;
}
.ab-header__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
.ab-header__mark .ar { font-family: 'Amiri', serif; }
.ab-header__title {
font-family: var(--font-display);
font-size: 38px; line-height: 1.05;
color: var(--ab-ink-900);
letter-spacing: -0.01em;
margin: 0;
}
.ab-header__title em { font-style: italic; color: var(--ab-gilt-600); }
.ab-header__sub {
font-family: var(--font-body);
color: var(--ab-ink-700);
font-size: 14px; line-height: 1.5;
max-width: 460px;
}
.ab-header__sub a { color: var(--ab-signal-500); text-decoration: underline; text-underline-offset: 3px; }
/* ---------- Eyebrow / labels / type ---------- */
.ab-eyebrow {
display: inline-block;
font-family: var(--font-body);
font-size: 11px; font-weight: 500;
text-transform: uppercase;
letter-spacing: 0.16em;
color: var(--ab-gilt-600);
margin-bottom: 6px;
}
.ab-h3 {
font-family: var(--font-display);
font-size: 22px; line-height: 1.2;
color: var(--ab-ink-900);
margin: 0 0 12px 0;
letter-spacing: -0.005em;
}
.ab-prose {
font-family: var(--font-body);
font-size: 14px; line-height: 1.55;
color: var(--ab-ink-700);
}
.ab-caption {
font-family: var(--font-body);
font-size: 12px;
color: var(--ab-ink-500);
letter-spacing: 0.02em;
}
.ab-divider {
height: 1px; background: var(--ab-border);
margin: 16px 0;
}
/* ---------- Cards ---------- */
.ab-card {
background: var(--ab-parchment-50);
border: 1px solid var(--ab-border);
border-radius: 12px;
padding: 20px 22px;
box-shadow: var(--ab-shadow-sm);
font-family: var(--font-body);
}
.ab-card--quiet {
background: transparent;
border-style: dashed;
box-shadow: none;
}
/* ---------- How-to (3-up) ---------- */
.ab-howto {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 12px;
margin: 8px 0 20px;
}
@media (max-width: 900px) { .ab-howto { grid-template-columns: 1fr; } }
.ab-howto__step {
background: var(--ab-parchment-50);
border: 1px solid var(--ab-border);
border-radius: 12px;
padding: 16px 18px;
transition: transform var(--ab-ease) 220ms, box-shadow var(--ab-ease) 220ms;
}
.ab-howto__step:hover { transform: translateY(-1px); box-shadow: var(--ab-shadow-md); }
.ab-howto__num {
font-family: var(--font-display);
font-size: 28px;
color: var(--ab-gilt-500);
line-height: 1;
}
.ab-howto__title {
font-family: var(--font-body);
font-size: 14px; font-weight: 600;
color: var(--ab-ink-900);
margin: 8px 0 6px;
}
.ab-howto__body {
font-family: var(--font-body);
font-size: 13px; line-height: 1.5;
color: var(--ab-ink-700);
}
/* ---------- Result card ---------- */
.ab-result__head {
display: flex; align-items: center; gap: 14px;
}
.ab-result__marker {
font-size: 28px; line-height: 1;
}
.ab-result__label {
font-family: var(--font-display);
font-size: 28px;
line-height: 1.1;
letter-spacing: -0.01em;
margin-top: 2px;
}
.ab-score {
display: flex; align-items: center; gap: 14px;
margin: 6px 0 4px;
}
.ab-score__value {
font-family: var(--font-display);
font-size: 44px; line-height: 1;
letter-spacing: -0.02em;
}
.ab-score__value span { font-size: 16px; color: var(--ab-ink-500); margin-left: 2px; }
.ab-score__track {
flex: 1; height: 8px;
background: var(--ab-parchment-200);
border-radius: 999px; overflow: hidden;
}
.ab-score__fill {
height: 100%;
background: linear-gradient(90deg, var(--ab-threat-400), var(--ab-gilt-400) 50%, var(--ab-safe-400));
border-radius: 999px;
transition: width 380ms var(--ab-ease);
}
.ab-bars { display: flex; flex-direction: column; gap: 10px; margin-top: 4px; }
.ab-bar__row {
display: flex; justify-content: space-between;
font-size: 13px; margin-bottom: 4px;
}
.ab-bar__label { color: var(--ab-ink-800); font-weight: 500; }
.ab-bar__value { color: var(--ab-ink-700); font-family: var(--font-mono); font-size: 12px; }
.ab-bar__track {
height: 8px; background: var(--ab-parchment-200);
border-radius: 999px; overflow: hidden;
}
.ab-bar__fill { height: 100%; border-radius: 999px; transition: width 380ms var(--ab-ease); }
.ab-quote {
margin-top: 18px;
padding: 14px 16px;
background: var(--ab-parchment-100);
border-left: 2px solid var(--ab-gilt-400);
border-radius: 4px;
}
.ab-quote blockquote {
font-family: var(--font-display);
font-style: italic;
font-size: 16px;
color: var(--ab-ink-800);
margin: 6px 0 0; padding: 0;
line-height: 1.45;
}
/* ---------- Stats ---------- */
.ab-kpi-row {
display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px;
margin: 4px 0 16px;
}
.ab-kpi {
background: var(--ab-parchment-100);
border: 1px solid var(--ab-border-subtle);
border-radius: 8px;
padding: 10px 12px;
text-align: center;
}
.ab-kpi__label {
font-family: var(--font-body);
font-size: 11px; text-transform: uppercase; letter-spacing: 0.12em;
color: var(--ab-ink-500);
margin-bottom: 4px;
}
.ab-kpi__value {
font-family: var(--font-display);
font-size: 26px; line-height: 1;
color: var(--ab-ink-900);
letter-spacing: -0.01em;
}
.ab-stats { display: flex; flex-direction: column; }
.ab-stats__row {
display: flex; align-items: center; gap: 10px;
padding: 6px 0;
border-bottom: 1px solid var(--ab-border-subtle);
font-size: 13px;
}
.ab-stats__row:last-child { border-bottom: 0; }
.ab-stats__dot { width: 8px; height: 8px; border-radius: 999px; flex-shrink: 0; }
.ab-stats__name { color: var(--ab-ink-800); flex: 1; }
.ab-stats__count { color: var(--ab-ink-600); font-family: var(--font-mono); font-size: 12px; }
.ab-stats__count em { color: var(--ab-ink-500); font-style: normal; }
/* ---------- Gradio component overrides ---------- */
.gradio-container .block, .gradio-container .form, .gradio-container .panel {
background: transparent !important;
border: none !important;
}
.gradio-container .gr-box, .gradio-container .gr-panel,
.gradio-container .gr-form, .gradio-container [data-testid="block"] {
background: transparent !important;
border: none !important;
box-shadow: none !important;
}
/* Plot wrapper — paper card */
#tsne-chart {
background: var(--ab-parchment-50) !important;
border: 1px solid var(--ab-border) !important;
border-radius: 12px !important;
padding: 8px !important;
box-shadow: var(--ab-shadow-sm) !important;
}
/* Buttons */
.gradio-container button {
font-family: var(--font-body) !important;
font-weight: 500 !important;
letter-spacing: 0 !important;
border-radius: 8px !important;
transition: transform 80ms var(--ab-ease), background-color 220ms var(--ab-ease) !important;
}
.gradio-container button:active { transform: scale(0.98) !important; }
.gradio-container button.primary, .gradio-container button[variant="primary"] {
background: var(--ab-ink-900) !important;
color: var(--ab-parchment-50) !important;
border: 1px solid var(--ab-ink-900) !important;
}
.gradio-container button.primary:hover {
background: var(--ab-ink-800) !important;
}
.gradio-container button.secondary {
background: var(--ab-parchment-50) !important;
color: var(--ab-ink-900) !important;
border: 1px solid var(--ab-border) !important;
}
.gradio-container button.secondary:hover {
background: var(--ab-parchment-200) !important;
}
/* Text inputs / textareas */
.gradio-container input[type="text"],
.gradio-container textarea,
.gradio-container .gr-input,
.gradio-container .gr-textbox textarea {
background: var(--ab-parchment-50) !important;
color: var(--ab-ink-900) !important;
border: 1px solid var(--ab-border) !important;
border-radius: 8px !important;
font-family: var(--font-body) !important;
font-size: 14px !important;
box-shadow: inset 0 1px 2px rgba(17,32,58,0.04);
}
.gradio-container input[type="text"]:focus,
.gradio-container textarea:focus,
.gradio-container .gr-textbox textarea:focus {
outline: none !important;
border-color: var(--ab-gilt-400) !important;
box-shadow: 0 0 0 3px rgba(220,139,42,0.18) !important;
}
/* Labels */
.gradio-container label, .gradio-container .label-wrap {
color: var(--ab-ink-700) !important;
font-family: var(--font-body) !important;
font-size: 13px !important;
font-weight: 500 !important;
letter-spacing: 0.01em !important;
}
/* Dropdowns */
.gradio-container .gr-dropdown, .gradio-container [data-testid="dropdown"] select,
.gradio-container .wrap.svelte-1cl284s {
background: var(--ab-parchment-50) !important;
border: 1px solid var(--ab-border) !important;
border-radius: 8px !important;
color: var(--ab-ink-900) !important;
}
/* Checkbox group filter */
.gradio-container .gr-check-radio,
.gradio-container fieldset[data-testid="checkbox-group"] {
background: var(--ab-parchment-50) !important;
border: 1px solid var(--ab-border) !important;
border-radius: 12px !important;
padding: 12px 14px !important;
}
.gradio-container fieldset[data-testid="checkbox-group"] label {
background: var(--ab-parchment-100) !important;
border: 1px solid var(--ab-border-subtle) !important;
border-radius: 999px !important;
padding: 4px 10px !important;
margin: 3px !important;
font-size: 12px !important;
}
.gradio-container fieldset[data-testid="checkbox-group"] label:hover {
background: var(--ab-parchment-200) !important;
}
.gradio-container input[type="checkbox"]:checked + * {
color: var(--ab-ink-900) !important;
}
.gradio-container input[type="checkbox"] {
accent-color: var(--ab-gilt-400) !important;
}
/* Markdown */
.gradio-container .markdown, .gradio-container .prose {
color: var(--ab-ink-800) !important;
font-family: var(--font-body) !important;
}
.gradio-container .markdown h1, .gradio-container .markdown h2,
.gradio-container .prose h1, .gradio-container .prose h2 {
font-family: var(--font-display) !important;
color: var(--ab-ink-900) !important;
font-weight: 400 !important;
letter-spacing: -0.01em !important;
}
.gradio-container .markdown h3, .gradio-container .prose h3 {
font-family: var(--font-body) !important;
font-weight: 600 !important;
color: var(--ab-ink-900) !important;
font-size: 16px !important;
margin-bottom: 8px !important;
}
.gradio-container .markdown strong { color: var(--ab-ink-900) !important; font-weight: 600 !important; }
.gradio-container .markdown a { color: var(--ab-signal-500) !important; }
.gradio-container .markdown hr {
border: none !important;
border-top: 1px solid var(--ab-border) !important;
margin: 18px 0 !important;
}
/* Hidden bridges from Plotly DOM → Gradio state */
#click-index-input, #legend-sync-input {
position: absolute !important;
width: 1px !important;
height: 1px !important;
overflow: hidden !important;
opacity: 0 !important;
pointer-events: none !important;
}
/* Footer */
.ab-footer {
border-top: 1px solid var(--ab-border);
margin-top: 36px;
padding-top: 18px;
text-align: center;
}
.ab-footer__line {
font-family: var(--font-body);
color: var(--ab-ink-500);
font-size: 12px;
letter-spacing: 0.02em;
}
.ab-footer__line a { color: var(--ab-signal-500); }
.ab-footer__mark {
font-family: var(--font-display);
color: var(--ab-gilt-500);
font-size: 14px;
letter-spacing: 0.04em;
margin-bottom: 6px;
}
.ab-footer__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
.ab-footer__mark .ar { font-family: 'Amiri', serif; }
"""
# ---------------------------------------------------------------------------
# Header / How-to / Footer markup
# ---------------------------------------------------------------------------
# Masthead markup, rendered through gr.HTML as the first element of the page.
# Both external links open in a new tab; rel="noopener noreferrer" stops the
# opened page from reaching back through window.opener and drops the Referer.
HEADER_HTML = """
<header class="ab-header">
<div class="ab-header__brand">
<div class="ab-header__mark">
<span class="heb">א-ב</span>&nbsp;·&nbsp;<span class="ar">أب</span>
</div>
<div>
<h1 class="ab-header__title">GuardLLM <em>—</em> Prompt Security Visualizer</h1>
</div>
</div>
<p class="ab-header__sub">
Editorial inspection of the prompt attack surface. Powered by
<a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" rel="noopener noreferrer">Llama Prompt Guard 2 (86M)</a>
on the <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" rel="noopener noreferrer">neuralchemy</a> corpus.
</p>
</header>
"""
# The three onboarding cards rendered right after the masthead, one row per
# card: (number, eyebrow label, title, body copy). Body line breaks are kept
# exactly as they appear in the emitted markup.
_HOW_TO_STEPS = (
    (
        "01",
        "Map",
        "Explore the landscape",
        "Each point is a prompt placed by semantic similarity. Color encodes the attack class.\n"
        "Hover to preview, scroll to zoom, drag to pan.",
    ),
    (
        "02",
        "Inspect",
        "Click to analyze",
        "Selecting a point runs the classifier and returns a verdict, a safety score,\n"
        "and the full class probability breakdown.",
    ),
    (
        "03",
        "Probe",
        "Try your own prompt",
        "Paste any text into the custom field below to see whether the model would flag\n"
        "it as injection or jailbreak.",
    ),
)

# Markup for a single card; every card shares this skeleton.
_HOW_TO_STEP_TEMPLATE = (
    '<div class="ab-howto__step">\n'
    '<div class="ab-howto__num">{num}</div>\n'
    '<div class="ab-eyebrow">{eyebrow}</div>\n'
    '<div class="ab-howto__title">{title}</div>\n'
    '<div class="ab-howto__body">\n'
    "{body}\n"
    "</div>\n"
    "</div>\n"
)

# Wrapper <div> around the concatenated cards.
HOW_TO_HTML = (
    '\n<div class="ab-howto">\n'
    + "".join(
        _HOW_TO_STEP_TEMPLATE.format(num=num, eyebrow=eyebrow, title=title, body=body)
        for num, eyebrow, title, body in _HOW_TO_STEPS
    )
    + "</div>\n"
)
# Closing colophon, rendered through gr.HTML as the last element of the page.
# The links target the same pages as the masthead links, so they open in a new
# tab as well instead of navigating the running app away; rel="noopener
# noreferrer" keeps the opened page from reaching back through window.opener.
FOOTER_HTML = """
<footer class="ab-footer">
<div class="ab-footer__mark"><span class="heb">א-ב</span> · ALEPH BETH · <span class="ar">أب</span></div>
<div class="ab-footer__line">
GuardLLM — Prompt Security Visualizer.
Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" rel="noopener noreferrer">Llama Prompt Guard 2 (86M)</a>.
Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" rel="noopener noreferrer">neuralchemy / Prompt-injection-dataset</a>.
</div>
</footer>
"""
# ---------------------------------------------------------------------------
# Gradio theme (parchment / ink)
# ---------------------------------------------------------------------------
# Keyword names of the eleven shades a gr.themes.Color is built from,
# in the order the palette tokens are listed below.
_SHADE_SLOTS = (
    "c50", "c100", "c200", "c300", "c400", "c500",
    "c600", "c700", "c800", "c900", "c950",
)


def _ab_hue(*tokens):
    """Build a Gradio hue from AB palette token names, one per shade slot (c50..c950)."""
    shades = {slot: AB[token] for slot, token in zip(_SHADE_SLOTS, tokens)}
    return gr.themes.Color(**shades)


# Hairline border colour shared by blocks and inputs: ink_900 (#11203A) at 12% alpha.
_HAIRLINE = "rgba(17,32,58,0.12)"

# Token-level overrides layered on top of the base theme.
_THEME_OVERRIDES = {
    # Page and surface fills
    "body_background_fill": AB["parchment_100"],
    "body_text_color": AB["ink_900"],
    "background_fill_primary": AB["parchment_50"],
    "background_fill_secondary": AB["parchment_100"],
    "border_color_primary": _HAIRLINE,
    # Blocks
    "block_background_fill": AB["parchment_50"],
    "block_border_color": _HAIRLINE,
    "block_label_text_color": AB["ink_700"],
    "block_title_text_color": AB["ink_900"],
    # Inputs
    "input_background_fill": AB["parchment_50"],
    "input_border_color": _HAIRLINE,
    "input_border_color_focus": AB["gilt_400"],
    # Buttons
    "button_primary_background_fill": AB["ink_900"],
    "button_primary_background_fill_hover": AB["ink_800"],
    "button_primary_text_color": AB["parchment_50"],
    "button_secondary_background_fill": AB["parchment_50"],
    "button_secondary_background_fill_hover": AB["parchment_200"],
    "button_secondary_text_color": AB["ink_900"],
}

# Parchment / ink theme: parchment tints at the light end, gilt in the middle
# of the primary hue, ink at the dark end of both hues.
ab_theme = gr.themes.Base(
    primary_hue=_ab_hue(
        "parchment_50", "parchment_100", "parchment_200", "parchment_300",
        "gilt_300", "gilt_400", "gilt_500", "gilt_600",
        "ink_800", "ink_900", "ink_950",
    ),
    neutral_hue=_ab_hue(
        "parchment_50", "parchment_100", "parchment_200",
        "ink_200", "ink_300", "ink_500", "ink_600", "ink_700",
        "ink_800", "ink_900", "ink_950",
    ),
    font=[gr.themes.GoogleFont("Geist"), "Inter", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("Geist Mono"), "JetBrains Mono", "monospace"],
).set(**_THEME_OVERRIDES)
# ---------------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------------
def _section_heading(eyebrow: str, title: str, *, spaced: bool = False) -> str:
    """Return the eyebrow + h3 markup that opens a panel section.

    Args:
        eyebrow: Small label rendered above the heading.
        title: Heading text, inserted as-is (callers pass literals only).
        spaced: When True, add the 18px top margin used by every section
            that follows another one in the same column.

    Returns:
        The HTML snippet to hand to ``gr.HTML``.
    """
    margin = " style='margin-top:18px;'" if spaced else ""
    return (
        f"<div class='ab-eyebrow'{margin}>{eyebrow}</div>"
        f"<h3 class='ab-h3'>{title}</h3>"
    )


def _echo_prompt(text):
    """Return the custom prompt unchanged so it can be mirrored into another component."""
    return text


# Page layout: masthead and how-to cards, a two-column row (prompt selection on
# the left, analysis on the right), a stats panel, then the footer.
with gr.Blocks(
    title="GuardLLM — Prompt Security Visualizer",
    theme=ab_theme,
    css=ALEPH_BETH_CSS,
) as demo:
    gr.HTML(HEADER_HTML)
    gr.HTML(HOW_TO_HTML)

    # Hidden bridges from Plotly DOM → Gradio state.
    # They stay visible=True and are hidden through the CSS rules on their
    # elem_ids instead — presumably so the script loaded in demo.load below
    # can find them in the page; confirm against PLOTLY_CLICK_JS.
    click_index = gr.Textbox(value="", visible=True, elem_id="click-index-input")
    legend_sync = gr.Textbox(value="", visible=True, elem_id="legend-sync-input")

    with gr.Row():
        # ============================================================
        # LEFT — every way to pick a prompt
        # ============================================================
        with gr.Column(scale=3):
            gr.HTML(_section_heading("Map", "t-SNE — Prompt landscape"))
            tsne_plot = gr.Plot(
                value=build_tsne_figure(),
                label="t-SNE space",
                elem_id="tsne-chart",
                show_label=False,
            )
            gr.Markdown(
                "<span class='ab-caption'>Click a point to inspect. "
                "Click a legend entry to isolate that category — click again to restore. "
                "Double-click a legend entry to toggle just that trace.</span>"
            )

            gr.HTML(_section_heading("Filter", "By category", spaced=True))
            with gr.Row():
                select_all_btn = gr.Button("Select all", size="sm", scale=1)
                deselect_all_btn = gr.Button("Deselect all", size="sm", scale=1)
            # NOTE(review): source indentation was lost; the checkbox group is
            # assumed to sit below the button row rather than inside it.
            category_filter = gr.CheckboxGroup(
                choices=UNIQUE_CATEGORIES,
                value=UNIQUE_CATEGORIES,
                label="Categories",
                show_label=False,
                interactive=True,
            )

            gr.HTML(_section_heading("Library", "Pick a prompt from the dataset", spaced=True))
            prompt_dropdown = gr.Dropdown(
                choices=DROPDOWN_CHOICES,
                label="Search the dataset",
                show_label=False,
                filterable=True,
                interactive=True,
            )

            gr.HTML(_section_heading("Custom", "Analyze your own prompt", spaced=True))
            manual_input = gr.Textbox(
                label="Prompt",
                show_label=False,
                placeholder="Type or paste a request to evaluate…",
                lines=3,
            )
            analyze_btn = gr.Button("Inspect", variant="primary")

        # ============================================================
        # RIGHT — the analysis only
        # ============================================================
        with gr.Column(scale=2):
            gr.HTML(_section_heading("Analysis", "Verdict & confidence"))
            result_html = gr.HTML(value=empty_analysis_html())
            risk_md = gr.Markdown(value="")
            full_prompt = gr.Textbox(
                label="Full prompt",
                lines=4,
                interactive=False,
                visible=True,
            )

    # NOTE(review): source indentation was lost; the divider and stats panel
    # are assumed to span the full width below the two columns, in keeping
    # with the right column being "analysis only".
    gr.Markdown("---")
    gr.HTML(build_stats_html())

    # ---- Events ----
    # Category filter, its two shortcut buttons, and the legend bridge all
    # redraw the map; the buttons and the bridge also update the checkboxes.
    category_filter.change(fn=on_filter_change, inputs=[category_filter], outputs=[tsne_plot])
    select_all_btn.click(fn=select_all_categories, inputs=[], outputs=[category_filter, tsne_plot])
    deselect_all_btn.click(fn=deselect_all_categories, inputs=[], outputs=[category_filter, tsne_plot])
    legend_sync.change(fn=on_legend_sync, inputs=[legend_sync],
                       outputs=[category_filter, tsne_plot])

    # Picking a dataset prompt (map click bridge or dropdown) fills all three
    # analysis components.
    click_index.change(fn=on_index_input, inputs=[click_index],
                       outputs=[result_html, risk_md, full_prompt])
    prompt_dropdown.change(fn=on_dropdown_select, inputs=[prompt_dropdown],
                           outputs=[result_html, risk_md, full_prompt])

    # Custom prompt: button click and Enter both run the same analysis.
    analyze_btn.click(fn=on_manual_analyze, inputs=[manual_input],
                      outputs=[result_html, risk_md])
    manual_input.submit(fn=on_manual_analyze, inputs=[manual_input],
                        outputs=[result_html, risk_md])
    # on_manual_analyze only feeds the verdict and the risk text, so without
    # these the "Full prompt" box would keep showing the previously inspected
    # dataset prompt next to a verdict that belongs to the custom text.
    analyze_btn.click(fn=_echo_prompt, inputs=[manual_input], outputs=[full_prompt])
    manual_input.submit(fn=_echo_prompt, inputs=[manual_input], outputs=[full_prompt])

    # Client-side only: run the Plotly click/legend script once the page loads.
    demo.load(fn=None, inputs=None, outputs=None, js=PLOTLY_CLICK_JS)

    gr.HTML(FOOTER_HTML)

logger.info("Gradio app built. Ready to launch.")
# Start the Gradio server only when this file is run as a script; importing
# the module merely builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()