""" GuardLLM — Prompt Security Visualizer Aleph Beth design system applied. Editorial calm, bilingual FR/EN posture. Powered by Llama Prompt Guard 2 (86M) and neuralchemy/Prompt-injection-dataset. """ import logging import os import sys import json import gradio as gr import torch import numpy as np import plotly.graph_objects as go from pathlib import Path # --------------------------------------------------------------------------- # Logging # --------------------------------------------------------------------------- logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) logger = logging.getLogger("GuardLLM") # --------------------------------------------------------------------------- # Aleph Beth — palette tokens (mirrored from colors_and_type.css) # --------------------------------------------------------------------------- AB = { "ink_950": "#0B1626", "ink_900": "#11203A", "ink_800": "#1B2F4E", "ink_700": "#2A4566", "ink_600": "#44607F", "ink_500": "#6B829D", "ink_400": "#95A6BB", "ink_300": "#BCC8D6", "ink_200": "#DAE1EA", "ink_100": "#ECF0F5", "ink_50": "#F6F8FB", "parchment_50": "#FCFAF2", "parchment_100": "#F8F3E6", "parchment_200": "#ECE5D2", "parchment_300": "#DDD3B9", "parchment_400": "#C2B695", "gilt_50": "#FCEEDA", "gilt_100": "#F8D9A4", "gilt_200": "#F2BD72", "gilt_300": "#EAA046", "gilt_400": "#DC8B2A", "gilt_500": "#A66718", "gilt_600": "#7A4912", "signal_100": "#C9DDEB", "signal_200": "#9BBFD9", "signal_300": "#6FA0C2", "signal_400": "#4A82AA", "signal_500": "#36678C", "signal_600": "#244D6B", "threat_400": "#D44A3E", "threat_300": "#E07065", "threat_100": "#F8DAD5", "safe_400": "#3F8F6E", "safe_300": "#66AB8C", "safe_100": "#D4E8DD", } # Category colors stay within the brand families — no neon, no inventions. 
CATEGORY_COLORS = {
    "benign": AB["safe_400"],
    "direct_injection": AB["threat_400"],
    "jailbreak": AB["gilt_400"],
    "system_extraction": AB["gilt_600"],
    "encoding_obfuscation": AB["signal_500"],
    "persona_replacement": AB["gilt_300"],
    "indirect_injection": AB["threat_300"],
    "token_smuggling": AB["signal_600"],
    "many_shot": AB["signal_400"],
    "crescendo": AB["signal_200"],
    "context_overflow": AB["ink_600"],
    "prompt_leaking": AB["gilt_500"],
    "unknown": AB["ink_400"],
}

# Human-readable label for every category key.
CATEGORY_LABELS = {
    "benign": "Benign",
    "direct_injection": "Direct Injection",
    "jailbreak": "Jailbreak",
    "system_extraction": "System Extraction",
    "encoding_obfuscation": "Encoding / Obfuscation",
    "persona_replacement": "Persona Replacement",
    "indirect_injection": "Indirect Injection",
    "token_smuggling": "Token Smuggling",
    "many_shot": "Many-Shot",
    "crescendo": "Crescendo",
    "context_overflow": "Context Overflow",
    "prompt_leaking": "Prompt Leaking",
    "unknown": "Unknown",
}

# Reverse lookup (display label -> key); the legend sync receives trace names,
# which are display labels.
LABEL_TO_KEY = {v: k for k, v in CATEGORY_LABELS.items()}

# ---------------------------------------------------------------------------
# Lazy-loaded risk classifier (Llama Prompt Guard 2)
# ---------------------------------------------------------------------------
MODEL_ID = "meta-llama/Llama-Prompt-Guard-2-86M"
LABELS = ["Benign", "Malicious"]
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

# Module-level cache filled on the first get_classifier() call.
_classifier = {"tokenizer": None, "model": None, "device": None}


def get_classifier():
    """Return ``(tokenizer, model, device)``, loading the model on first use.

    The ``transformers`` import and the weight download are deferred to the
    first call. ``HF_TOKEN`` is forwarded to ``from_pretrained`` only when it
    is set.
    """
    if _classifier["model"] is None:
        logger.info("Lazy-loading Llama Prompt Guard 2...")
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
        tok = AutoTokenizer.from_pretrained(MODEL_ID, **kwargs)
        mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, **kwargs)
        mdl.eval()
        dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mdl.to(dev)
        # "model" is written last: it is the key the guard above checks, so
        # the other two entries are already in place once it is set.
        _classifier["tokenizer"] = tok
        _classifier["device"] = dev
        _classifier["model"] = mdl
        logger.info("Classifier loaded on %s", dev)
    return _classifier["tokenizer"], _classifier["model"], _classifier["device"]


# ---------------------------------------------------------------------------
# Load precomputed t-SNE data
# ---------------------------------------------------------------------------
CACHE_DIR = Path(__file__).parent / "cache"
CACHE_FILE = CACHE_DIR / "embeddings_tsne.npz"
META_FILE = CACHE_DIR / "metadata.json"

logger.info("Loading precomputed t-SNE cache from %s", CACHE_DIR)
if not CACHE_FILE.exists() or not META_FILE.exists():
    raise RuntimeError(
        f"Cache files not found in {CACHE_DIR}. Run precompute.py first."
    )

# The archive handle is closed as soon as the array has been read.
with np.load(CACHE_FILE) as _npz:
    TSNE_COORDS = _npz["tsne_2d"]

with open(META_FILE, "r", encoding="utf-8") as f:
    METADATA = json.load(f)
logger.info("Loaded %d points for visualization", len(METADATA))

# Column-wise views of the metadata records, index-aligned with METADATA.
ALL_TEXTS = [m["text"] for m in METADATA]
ALL_CATEGORIES = [m["category"] for m in METADATA]
ALL_SEVERITIES = [m["severity"] for m in METADATA]
ALL_LABELS_DS = [m["label"] for m in METADATA]
UNIQUE_CATEGORIES = sorted(set(ALL_CATEGORIES))


def _preview(text, limit):
    """Return the first ``limit`` characters of ``text`` on one line.

    Newlines in the kept part become spaces and "..." is appended when the
    full text is longer than ``limit``.
    """
    clipped = text[:limit].replace("\n", " ")
    return clipped + "..." if len(text) > limit else clipped


# Dropdown entries are "<index> | <category> | <preview>"; on_dropdown_select
# parses the index back out of the first field.
DROPDOWN_CHOICES = [
    f"{i} | {m['category']} | {_preview(m['text'], 70)}"
    for i, m in enumerate(METADATA)
]


# ---------------------------------------------------------------------------
# Analysis function
# ---------------------------------------------------------------------------
def analyze_prompt(text):
    """Classify ``text`` and return ``(prob_dict, safety)``.

    ``prob_dict`` maps each entry of ``LABELS`` to its softmax probability and
    ``safety`` is the probability at index 0. Empty or whitespace-only input
    returns ``({}, 0.0)`` without loading the model.
    """
    if not text or not text.strip():
        return {}, 0.0
    tokenizer, model, device = get_classifier()
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
    prob_dict = {label: float(probs[i]) for i, label in enumerate(LABELS)}
    safety = float(probs[0])
    return prob_dict, safety


# ---------------------------------------------------------------------------
# Plotly figure — parchment surface, ink axes, restrained palette
# ---------------------------------------------------------------------------
def build_tsne_figure(selected_categories=None):
    """Build the t-SNE scatter figure, one trace per category.

    Args:
        selected_categories: Category keys to draw. ``None`` draws every
            category; an empty collection draws none.

    Returns:
        A ``plotly.graph_objects.Figure``. Each point carries its dataset
        index (as a string) in ``customdata`` for the click bridge.
    """
    fig = go.Figure()

    for cat in UNIQUE_CATEGORIES:
        # Decide once per category rather than once per point.
        if selected_categories is not None and cat not in selected_categories:
            continue
        indices = [i for i, c in enumerate(ALL_CATEGORIES) if c == cat]
        if not indices:
            continue

        label = CATEGORY_LABELS.get(cat, cat)
        color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
        # NOTE(review): the <br> separators are a reconstruction of the
        # original hover layout — confirm.
        hover_texts = [
            f"{label}<br>"
            f"Severity — {ALL_SEVERITIES[i] or 'benign'}<br>"
            f"Index — {i}<br>"
            f"{_preview(ALL_TEXTS[i], 80)}"
            for i in indices
        ]

        fig.add_trace(go.Scatter(
            x=TSNE_COORDS[indices, 0].tolist(),
            y=TSNE_COORDS[indices, 1].tolist(),
            mode="markers",
            name=label,
            marker=dict(
                # Smaller markers for large categories.
                size=5 if len(indices) > 500 else 7,
                color=color,
                opacity=0.78,
                line=dict(width=0.5, color="rgba(17,32,58,0.20)"),
            ),
            text=hover_texts,
            hoverinfo="text",
            customdata=[str(i) for i in indices],
        ))

    fig.update_layout(
        template="plotly_white",
        paper_bgcolor=AB["parchment_100"],
        plot_bgcolor=AB["parchment_50"],
        font=dict(family="Geist, Inter, system-ui, sans-serif", color=AB["ink_700"]),
        title=dict(
            text="t-SNE — Prompt Security Landscape",
            font=dict(color=AB["ink_900"]),
            x=0.5,
            xanchor="center",
        ),
        legend=dict(
            title=dict(text="Category", font=dict(color=AB["ink_700"], size=11)),
            bgcolor="rgba(252,250,242,0.88)",
            bordercolor="rgba(17,32,58,0.12)",
            borderwidth=1,
            font=dict(color=AB["ink_800"], size=10),
            itemsizing="constant",
            itemclick="toggleothers",
            itemdoubleclick="toggle",
        ),
        xaxis=dict(
            title=dict(text="t-SNE 1", font=dict(color=AB["ink_500"], size=11)),
            showgrid=True,
            gridcolor="rgba(17,32,58,0.06)",
            zeroline=False,
            color=AB["ink_500"],
        ),
        yaxis=dict(
            title=dict(text="t-SNE 2", font=dict(color=AB["ink_500"], size=11)),
            showgrid=True,
            gridcolor="rgba(17,32,58,0.06)",
            zeroline=False,
            color=AB["ink_500"],
        ),
        margin=dict(l=44, r=44, t=56, b=44),
        height=620,
        dragmode="pan",
        hoverlabel=dict(
            bgcolor=AB["parchment_50"],
            bordercolor="rgba(17,32,58,0.12)",
            font=dict(family="Geist, sans-serif", color=AB["ink_900"], size=12),
        ),
    )
    return fig


# ---------------------------------------------------------------------------
# Callbacks
# ---------------------------------------------------------------------------
def on_filter_change(categories):
    """Rebuild the chart for the checked categories (none checked -> all)."""
    sel = categories if categories else None
    return build_tsne_figure(sel)


def select_all_categories():
    """Check every category and redraw the full chart."""
    return gr.update(value=UNIQUE_CATEGORIES), build_tsne_figure(UNIQUE_CATEGORIES)


def deselect_all_categories():
    """Uncheck every category and draw an empty chart."""
    return gr.update(value=[]), build_tsne_figure([])


def on_legend_sync(payload):
    """Plotly legend click → sync the checkbox filter + rebuild the chart.

    ``payload`` is the JSON string written by the JS bridge,
    ``{"visible": [<trace name>, ...]}``. Names that do not map to a known
    category are dropped. On a blank or malformed payload both outputs are
    left unchanged.
    """
    if not payload or not payload.strip():
        return gr.update(), gr.update()
    try:
        data = json.loads(payload)
        visible_keys = [
            key
            for key in (LABEL_TO_KEY.get(lbl, lbl) for lbl in data.get("visible", []))
            if key in UNIQUE_CATEGORIES
        ]
        return gr.update(value=visible_keys), build_tsne_figure(visible_keys)
    except Exception as e:
        # UI boundary: never let a bad payload break the page; keep the trace.
        logger.exception("legend sync error: %s", e)
        return gr.update(), gr.update()


def _dataset_meta_block(category, severity, ground_truth):
    """Return the Markdown bullet list describing a dataset entry."""
    return (
        f"\n\nDataset metadata\n"
        f"- Category — **{CATEGORY_LABELS.get(category, category)}**\n"
        f"- Severity — **{severity}**\n"
        f"- Ground truth — **{ground_truth}**\n"
    )


def _run_analysis(text):
    """Classify ``text`` and return ``(result_html, risk_markdown)``.

    Text the classifier refuses (empty / whitespace-only) yields the idle card
    and an empty assessment instead of failing on an empty probability dict.
    """
    prob_dict, _ = analyze_prompt(text)
    if not prob_dict:
        return empty_analysis_html(), ""
    pred_label = max(prob_dict, key=prob_dict.get)
    confidence = prob_dict[pred_label]
    return (
        build_result_html(pred_label, confidence, prob_dict, text),
        build_risk_assessment(pred_label, confidence, prob_dict),
    )


def _analyze_dataset_entry(idx):
    """Analyze dataset entry ``idx``; return ``(result_html, risk_md, text)``.

    Out-of-range indices (including negatives, which would otherwise wrap
    around) produce an "Invalid index" message.
    """
    if not 0 <= idx < len(ALL_TEXTS):
        return empty_analysis_html(), f"Invalid index — {idx}", ""
    text = ALL_TEXTS[idx]
    severity = ALL_SEVERITIES[idx] or "N/A"
    ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
    result_html, risk_text = _run_analysis(text)
    risk_text += _dataset_meta_block(ALL_CATEGORIES[idx], severity, ground_truth)
    return result_html, risk_text, text


def on_dropdown_select(choice):
    """Analyze the dataset entry picked in the dropdown.

    ``choice`` has the ``DROPDOWN_CHOICES`` format; the index is the first
    " | "-separated field.
    """
    if not choice:
        return empty_analysis_html(), "*Select a prompt to begin.*", ""
    try:
        return _analyze_dataset_entry(int(choice.split(" | ")[0]))
    except Exception as e:
        logger.exception("Error: %s", e)
        return empty_analysis_html(), f"Error — {e}", ""


def on_index_input(idx_str):
    """Analyze the dataset entry whose index the chart click bridge pushed."""
    if not idx_str or not idx_str.strip():
        return empty_analysis_html(), "*Click a point on the chart.*", ""
    try:
        return _analyze_dataset_entry(int(idx_str.strip()))
    except Exception as e:
        logger.exception("Error: %s", e)
        return empty_analysis_html(), f"Error — {e}", ""


def on_manual_analyze(text):
    """Analyze free text typed by the user; blank input resets the card."""
    if not text or not text.strip():
        return empty_analysis_html(), ""
    return _run_analysis(text)


# ---------------------------------------------------------------------------
# UI builders — editorial, parchment surface, ink type, no emoji
# ---------------------------------------------------------------------------
# NOTE(review): the markup in this section is rebuilt around the ab-* classes
# declared in ALEPH_BETH_CSS; element nesting should be confirmed against the
# design system before release.
def empty_analysis_html():
    """Return the idle card shown before any prompt has been analyzed."""
    return """
<div class="ab-card ab-card--quiet">
  <div class="ab-eyebrow">Idle</div>
  <div class="ab-prose">
    Click a point on the chart, pick a prompt from the list, or paste your own below.
    The classifier runs on demand.
  </div>
</div>
"""


def build_result_html(label, confidence, probs, text):
    """Render the verdict card for one analyzed prompt.

    Args:
        label: Predicted class, one of ``LABELS``.
        confidence: Probability of ``label`` in ``[0, 1]``.
        probs: Mapping from each entry of ``LABELS`` to its probability.
        text: The analyzed prompt; its first 180 characters are quoted.

    Returns:
        An HTML string. The quoted prompt is HTML-escaped because it can be
        arbitrary user input.
    """
    import html

    is_safe = label == "Benign"
    accent = AB["safe_400"] if is_safe else AB["threat_400"]
    marker = "●"  # geometric primitive instead of emoji
    pct = confidence * 100
    safety_score = probs["Benign"] * 100

    if safety_score >= 70:
        safety_color = AB["safe_400"]
    elif safety_score >= 40:
        safety_color = AB["gilt_400"]
    else:
        safety_color = AB["threat_400"]

    def _bar(lbl):
        share = probs[lbl] * 100
        fill = AB["safe_400"] if lbl == "Benign" else AB["threat_400"]
        return f"""
<div class="ab-bar">
  <div class="ab-bar__row">
    <span class="ab-bar__label">{lbl}</span>
    <span class="ab-bar__value">{share:.1f}%</span>
  </div>
  <div class="ab-bar__track">
    <div class="ab-bar__fill" style="width:{share:.1f}%;background:{fill};"></div>
  </div>
</div>
"""

    bars_html = "".join(_bar(lbl) for lbl in LABELS)

    # Truncate first, then escape, so an entity is never cut in half.
    preview = html.escape(text[:180])
    if len(text) > 180:
        preview += "…"

    return f"""
<div class="ab-card">
  <div class="ab-result__head">
    <div class="ab-result__marker" style="color:{accent};">{marker}</div>
    <div>
      <div class="ab-eyebrow">Verdict</div>
      <div class="ab-result__label" style="color:{accent};">{label}</div>
      <div class="ab-caption">Confidence — {pct:.1f}%</div>
    </div>
  </div>
  <div class="ab-divider"></div>
  <div class="ab-eyebrow">Safety score</div>
  <div class="ab-score">
    <div class="ab-score__value" style="color:{safety_color};">{safety_score:.0f}<span>/100</span></div>
    <div class="ab-score__track">
      <div class="ab-score__fill" style="width:{safety_score:.0f}%;"></div>
    </div>
  </div>
  <div class="ab-divider"></div>
  <div class="ab-eyebrow">Class probabilities</div>
  <div class="ab-bars">{bars_html}</div>
  <div class="ab-quote">
    <div class="ab-eyebrow">Analyzed prompt</div>
    <blockquote>“{preview}”</blockquote>
  </div>
</div>
"""


def build_risk_assessment(label, confidence, probs):
    """Return the Markdown risk summary for a prediction.

    The level is Low / Moderate for a "Benign" verdict and Critical / High for
    any other verdict, with 0.85 confidence as the dividing line.
    """
    safety_score = probs["Benign"] * 100
    malicious_score = probs["Malicious"] * 100

    if label == "Benign" and confidence > 0.85:
        level = "Low"
        desc = "The request appears **safe**. No injection or jailbreak patterns were detected."
    elif label == "Benign":
        level = "Moderate"
        desc = "Likely benign, with moderate confidence. The wording may be ambiguous."
    elif confidence > 0.85:
        level = "Critical"
        desc = "**Malicious request detected** with high confidence. Likely injection or jailbreak."
    else:
        level = "High"
        desc = "**Malicious request detected.** Possible injection or jailbreak — review recommended."

    return (
        f"Risk level — {level}\n\n{desc}\n\n"
        f"- Safety score — **{safety_score:.0f}/100**\n"
        f"- Predicted class — **{label}** ({confidence*100:.1f}%)\n"
        f"- P(Benign) — {probs['Benign']*100:.1f}%  ·  P(Malicious) — {malicious_score:.1f}%\n"
    )


def build_stats_html():
    """Render the dataset composition card (totals plus per-category rows).

    Categories are listed from most to least frequent; ties keep the order in
    which the categories first appear in ``METADATA``.
    """
    from collections import Counter

    total = len(METADATA)
    n_benign = sum(1 for m in METADATA if m["label"] == 0)
    n_malicious = total - n_benign

    cat_counts = Counter(m["category"] for m in METADATA)
    # An empty dataset yields no rows, so the division below never sees 0.
    cats_html = "".join(
        f'<div class="ab-stats__row">'
        f'<span class="ab-stats__dot" style="background:'
        f'{CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])};"></span>'
        f'<span class="ab-stats__name">{CATEGORY_LABELS.get(cat, cat)}</span>'
        f'<span class="ab-stats__count">{count:,} <em>({count / total * 100:.1f}%)</em></span>'
        f'</div>'
        for cat, count in cat_counts.most_common()
    )

    return f"""
<div class="ab-card">
  <div class="ab-eyebrow">Dataset</div>
  <h3 class="ab-h3">Composition</h3>
  <div class="ab-kpi-row">
    <div class="ab-kpi">
      <div class="ab-kpi__label">Total</div>
      <div class="ab-kpi__value">{total:,}</div>
    </div>
    <div class="ab-kpi">
      <div class="ab-kpi__label">Benign</div>
      <div class="ab-kpi__value">{n_benign:,}</div>
    </div>
    <div class="ab-kpi">
      <div class="ab-kpi__label">Malicious</div>
      <div class="ab-kpi__value">{n_malicious:,}</div>
    </div>
  </div>
  <div class="ab-stats">{cats_html}</div>
</div>
"""


def _section_heading(eyebrow, title):
    """Return the eyebrow + heading markup that opens each UI section."""
    import html

    return (
        f"<div class='ab-eyebrow'>{html.escape(eyebrow)}</div>"
        f"<h3 class='ab-h3'>{html.escape(title)}</h3>"
    )


# ---------------------------------------------------------------------------
# JavaScript bridge: Plotly clicks → Gradio hidden input
# ---------------------------------------------------------------------------
PLOTLY_CLICK_JS = """
() => {
    function pushToHidden(selector, value) {
        const el = document.querySelector(selector + ' textarea')
            || document.querySelector(selector + ' input');
        if (!el) return;
        const proto = el.tagName === 'TEXTAREA'
            ? window.HTMLTextAreaElement.prototype
            : window.HTMLInputElement.prototype;
        const setter = Object.getOwnPropertyDescriptor(proto, 'value').set;
        setter.call(el, String(value));
        el.dispatchEvent(new Event('input', { bubbles: true }));
        setTimeout(() => el.dispatchEvent(new Event('change', { bubbles: true })), 40);
    }

    // After Plotly has applied a legend toggle, read the visible trace names
    // and push them to #legend-sync-input as JSON {visible: [...]}.
    function syncLegend(plotEl) {
        setTimeout(() => {
            const visible = (plotEl.data || [])
                .filter(t => t.visible === undefined || t.visible === true)
                .map(t => t.name);
            pushToHidden('#legend-sync-input', JSON.stringify({visible: visible}));
        }, 60);
        return true;  // allow Plotly to process its default toggle
    }

    function attachHandlers(plotEl) {
        if (!plotEl || plotEl._abHandlersAttached) return;
        plotEl._abHandlersAttached = true;

        // Point click → push index to #click-index-input
        plotEl.on('plotly_click', function (data) {
            if (data && data.points && data.points.length > 0) {
                const idx = data.points[0].customdata;
                if (idx !== undefined && idx !== null) {
                    pushToHidden('#click-index-input', idx);
                }
            }
        });

        plotEl.on('plotly_legendclick', function (ed) { return syncLegend(plotEl); });
        plotEl.on('plotly_legenddoubleclick', function (ed) { return syncLegend(plotEl); });
    }

    function setup() {
        const plotEl = document.querySelector('#tsne-chart .js-plotly-plot');
        if (!plotEl) { setTimeout(setup, 500); return; }
        attachHandlers(plotEl);
        // Re-attach whenever the chart element is replaced.
        const root = document.querySelector('#tsne-chart') || document.body;
        const observer = new MutationObserver(() => {
            const newPlot = document.querySelector('#tsne-chart .js-plotly-plot');
            if (newPlot) attachHandlers(newPlot);
        });
        observer.observe(root, { childList: true, subtree: true });
    }

    setTimeout(setup, 1000);
}
"""

# ---------------------------------------------------------------------------
# Aleph Beth — global CSS
# ---------------------------------------------------------------------------
ALEPH_BETH_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600;700&family=Geist+Mono:wght@400;500;600&family=Frank+Ruhl+Libre:wght@400;500&family=Amiri:wght@400;700&display=swap');

:root, .gradio-container {
  --ab-ink-950:#0B1626; --ab-ink-900:#11203A; --ab-ink-800:#1B2F4E; --ab-ink-700:#2A4566;
  --ab-ink-600:#44607F; --ab-ink-500:#6B829D; --ab-ink-400:#95A6BB; --ab-ink-300:#BCC8D6;
  --ab-ink-200:#DAE1EA; --ab-ink-100:#ECF0F5; --ab-ink-50:#F6F8FB;
  --ab-parchment-50:#FCFAF2; --ab-parchment-100:#F8F3E6; --ab-parchment-200:#ECE5D2; --ab-parchment-300:#DDD3B9;
  --ab-gilt-300:#EAA046; --ab-gilt-400:#DC8B2A; --ab-gilt-500:#A66718; --ab-gilt-600:#7A4912;
  --ab-signal-300:#6FA0C2; --ab-signal-400:#4A82AA; --ab-signal-500:#36678C;
  --ab-threat-400:#D44A3E; --ab-safe-400:#3F8F6E;
  --ab-border: rgba(17,32,58,0.12);
  --ab-border-subtle: rgba(17,32,58,0.06);
  --ab-shadow-sm: 0 2px 6px rgba(17,32,58,0.07), 0 1px 2px rgba(17,32,58,0.04);
  --ab-shadow-md: 0 8px 20px rgba(17,32,58,0.08), 0 2px 4px rgba(17,32,58,0.05);
  --ab-ease: cubic-bezier(0.16, 1, 0.3, 1);
  --font-display: 'Instrument Serif', 'Cormorant Garamond', serif;
  --font-body: 'Geist', 'Inter', system-ui, sans-serif;
  --font-mono: 'Geist Mono', 'JetBrains Mono', ui-monospace, monospace;
}

/* ---------- Base canvas ---------- */
.gradio-container, body, html {
  background: var(--ab-parchment-100) !important;
  color: var(--ab-ink-900) !important;
  font-family: var(--font-body) !important;
  font-feature-settings: 'ss01', 'cv01';
}
.gradio-container { max-width: 1440px !important; margin: 0 auto !important; padding: 24px 32px !important; }

/* Remove Gradio gradient backgrounds */
.gradio-container *::before, .gradio-container *::after { background-image: none !important; }

/* ---------- Header / brand ---------- */
.ab-header {
  padding: 18px 4px 22px; border-bottom: 1px solid var(--ab-border); margin-bottom: 24px;
  display: flex; align-items: baseline; justify-content: space-between; gap: 24px; flex-wrap: wrap;
}
.ab-header__brand { display: flex; align-items: baseline; gap: 14px; }
.ab-header__mark {
  font-family: var(--font-display); font-size: 32px; line-height: 1;
  color: var(--ab-gilt-500); letter-spacing: -0.01em;
}
.ab-header__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
.ab-header__mark .ar { font-family: 'Amiri', serif; }
.ab-header__title {
  font-family: var(--font-display); font-size: 38px; line-height: 1.05;
  color: var(--ab-ink-900); letter-spacing: -0.01em; margin: 0;
}
.ab-header__title em { font-style: italic; color: var(--ab-gilt-600); }
.ab-header__sub {
  font-family: var(--font-body); color: var(--ab-ink-700); font-size: 14px;
  line-height: 1.5; max-width: 460px;
}
.ab-header__sub a { color: var(--ab-signal-500); text-decoration: underline; text-underline-offset: 3px; }

/* ---------- Eyebrow / labels / type ---------- */
.ab-eyebrow {
  display: inline-block; font-family: var(--font-body); font-size: 11px; font-weight: 500;
  text-transform: uppercase; letter-spacing: 0.16em; color: var(--ab-gilt-600); margin-bottom: 6px;
}
.ab-h3 {
  font-family: var(--font-display); font-size: 22px; line-height: 1.2;
  color: var(--ab-ink-900); margin: 0 0 12px 0; letter-spacing: -0.005em;
}
.ab-prose { font-family: var(--font-body); font-size: 14px; line-height: 1.55; color: var(--ab-ink-700); }
.ab-caption { font-family: var(--font-body); font-size: 12px; color: var(--ab-ink-500); letter-spacing: 0.02em; }
.ab-divider { height: 1px; background: var(--ab-border); margin: 16px 0; }

/* ---------- Cards ---------- */
.ab-card {
  background: var(--ab-parchment-50); border: 1px solid var(--ab-border); border-radius: 12px;
  padding: 20px 22px; box-shadow: var(--ab-shadow-sm); font-family: var(--font-body);
}
.ab-card--quiet { background: transparent; border-style: dashed; box-shadow: none; }

/* ---------- How-to (3-up) ---------- */
.ab-howto { display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; margin: 8px 0 20px; }
@media (max-width: 900px) { .ab-howto { grid-template-columns: 1fr; } }
.ab-howto__step {
  background: var(--ab-parchment-50); border: 1px solid var(--ab-border); border-radius: 12px;
  padding: 16px 18px;
  transition: transform var(--ab-ease) 220ms, box-shadow var(--ab-ease) 220ms;
}
.ab-howto__step:hover { transform: translateY(-1px); box-shadow: var(--ab-shadow-md); }
.ab-howto__num { font-family: var(--font-display); font-size: 28px; color: var(--ab-gilt-500); line-height: 1; }
.ab-howto__title {
  font-family: var(--font-body); font-size: 14px; font-weight: 600;
  color: var(--ab-ink-900); margin: 8px 0 6px;
}
.ab-howto__body { font-family: var(--font-body); font-size: 13px; line-height: 1.5; color: var(--ab-ink-700); }

/* ---------- Result card ---------- */
.ab-result__head { display: flex; align-items: center; gap: 14px; }
.ab-result__marker { font-size: 28px; line-height: 1; }
.ab-result__label {
  font-family: var(--font-display); font-size: 28px; line-height: 1.1;
  letter-spacing: -0.01em; margin-top: 2px;
}
.ab-score { display: flex; align-items: center; gap: 14px; margin: 6px 0 4px; }
.ab-score__value { font-family: var(--font-display); font-size: 44px; line-height: 1; letter-spacing: -0.02em; }
.ab-score__value span { font-size: 16px; color: var(--ab-ink-500); margin-left: 2px; }
.ab-score__track {
  flex: 1; height: 8px; background: var(--ab-parchment-200); border-radius: 999px; overflow: hidden;
}
.ab-score__fill {
  height: 100%;
  background: linear-gradient(90deg, var(--ab-threat-400), var(--ab-gilt-400) 50%, var(--ab-safe-400));
  border-radius: 999px; transition: width 380ms var(--ab-ease);
}
.ab-bars { display: flex; flex-direction: column; gap: 10px; margin-top: 4px; }
.ab-bar__row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 4px; }
.ab-bar__label { color: var(--ab-ink-800); font-weight: 500; }
.ab-bar__value { color: var(--ab-ink-700); font-family: var(--font-mono); font-size: 12px; }
.ab-bar__track { height: 8px; background: var(--ab-parchment-200); border-radius: 999px; overflow: hidden; }
.ab-bar__fill { height: 100%; border-radius: 999px; transition: width 380ms var(--ab-ease); }
.ab-quote {
  margin-top: 18px; padding: 14px 16px; background: var(--ab-parchment-100);
  border-left: 2px solid var(--ab-gilt-400); border-radius: 4px;
}
.ab-quote blockquote {
  font-family: var(--font-display); font-style: italic; font-size: 16px;
  color: var(--ab-ink-800); margin: 6px 0 0; padding: 0; line-height: 1.45;
}

/* ---------- Stats ---------- */
.ab-kpi-row { display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 4px 0 16px; }
.ab-kpi {
  background: var(--ab-parchment-100); border: 1px solid var(--ab-border-subtle);
  border-radius: 8px; padding: 10px 12px; text-align: center;
}
.ab-kpi__label {
  font-family: var(--font-body); font-size: 11px; text-transform: uppercase;
  letter-spacing: 0.12em; color: var(--ab-ink-500); margin-bottom: 4px;
}
.ab-kpi__value {
  font-family: var(--font-display); font-size: 26px; line-height: 1;
  color: var(--ab-ink-900); letter-spacing: -0.01em;
}
.ab-stats { display: flex; flex-direction: column; }
.ab-stats__row {
  display: flex; align-items: center; gap: 10px; padding: 6px 0;
  border-bottom: 1px solid var(--ab-border-subtle); font-size: 13px;
}
.ab-stats__row:last-child { border-bottom: 0; }
.ab-stats__dot { width: 8px; height: 8px; border-radius: 999px; flex-shrink: 0; }
.ab-stats__name { color: var(--ab-ink-800); flex: 1; }
.ab-stats__count { color: var(--ab-ink-600); font-family: var(--font-mono); font-size: 12px; }
.ab-stats__count em { color: var(--ab-ink-500); font-style: normal; }

/* ---------- Gradio component overrides ---------- */
.gradio-container .block, .gradio-container .form, .gradio-container .panel {
  background: transparent !important; border: none !important;
}
.gradio-container .gr-box, .gradio-container .gr-panel, .gradio-container .gr-form,
.gradio-container [data-testid="block"] {
  background: transparent !important; border: none !important; box-shadow: none !important;
}

/* Plot wrapper — paper card */
#tsne-chart {
  background: var(--ab-parchment-50) !important; border: 1px solid var(--ab-border) !important;
  border-radius: 12px !important; padding: 8px !important; box-shadow: var(--ab-shadow-sm) !important;
}

/* Buttons */
.gradio-container button {
  font-family: var(--font-body) !important; font-weight: 500 !important;
  letter-spacing: 0 !important; border-radius: 8px !important;
  transition: transform 80ms var(--ab-ease), background-color 220ms var(--ab-ease) !important;
}
.gradio-container button:active { transform: scale(0.98) !important; }
.gradio-container button.primary, .gradio-container button[variant="primary"] {
  background: var(--ab-ink-900) !important; color: var(--ab-parchment-50) !important;
  border: 1px solid var(--ab-ink-900) !important;
}
.gradio-container button.primary:hover { background: var(--ab-ink-800) !important; }
.gradio-container button.secondary {
  background: var(--ab-parchment-50) !important; color: var(--ab-ink-900) !important;
  border: 1px solid var(--ab-border) !important;
}
.gradio-container button.secondary:hover { background: var(--ab-parchment-200) !important; }

/* Text inputs / textareas */
.gradio-container input[type="text"], .gradio-container textarea,
.gradio-container .gr-input, .gradio-container .gr-textbox textarea {
  background: var(--ab-parchment-50) !important; color: var(--ab-ink-900) !important;
  border: 1px solid var(--ab-border) !important; border-radius: 8px !important;
  font-family: var(--font-body) !important; font-size: 14px !important;
  box-shadow: inset 0 1px 2px rgba(17,32,58,0.04);
}
.gradio-container input[type="text"]:focus, .gradio-container textarea:focus,
.gradio-container .gr-textbox textarea:focus {
  outline: none !important; border-color: var(--ab-gilt-400) !important;
  box-shadow: 0 0 0 3px rgba(220,139,42,0.18) !important;
}

/* Labels */
.gradio-container label, .gradio-container .label-wrap {
  color: var(--ab-ink-700) !important; font-family: var(--font-body) !important;
  font-size: 13px !important; font-weight: 500 !important; letter-spacing: 0.01em !important;
}

/* Dropdowns */
.gradio-container .gr-dropdown, .gradio-container [data-testid="dropdown"] select,
.gradio-container .wrap.svelte-1cl284s {
  background: var(--ab-parchment-50) !important; border: 1px solid var(--ab-border) !important;
  border-radius: 8px !important; color: var(--ab-ink-900) !important;
}

/* Checkbox group filter */
.gradio-container .gr-check-radio, .gradio-container fieldset[data-testid="checkbox-group"] {
  background: var(--ab-parchment-50) !important; border: 1px solid var(--ab-border) !important;
  border-radius: 12px !important; padding: 12px 14px !important;
}
.gradio-container fieldset[data-testid="checkbox-group"] label {
  background: var(--ab-parchment-100) !important; border: 1px solid var(--ab-border-subtle) !important;
  border-radius: 999px !important; padding: 4px 10px !important; margin: 3px !important;
  font-size: 12px !important;
}
.gradio-container fieldset[data-testid="checkbox-group"] label:hover {
  background: var(--ab-parchment-200) !important;
}
.gradio-container input[type="checkbox"]:checked + * { color: var(--ab-ink-900) !important; }
.gradio-container input[type="checkbox"] { accent-color: var(--ab-gilt-400) !important; }

/* Markdown */
.gradio-container .markdown, .gradio-container .prose {
  color: var(--ab-ink-800) !important; font-family: var(--font-body) !important;
}
.gradio-container .markdown h1, .gradio-container .markdown h2,
.gradio-container .prose h1, .gradio-container .prose h2 {
  font-family: var(--font-display) !important; color: var(--ab-ink-900) !important;
  font-weight: 400 !important; letter-spacing: -0.01em !important;
}
.gradio-container .markdown h3, .gradio-container .prose h3 {
  font-family: var(--font-body) !important; font-weight: 600 !important;
  color: var(--ab-ink-900) !important; font-size: 16px !important; margin-bottom: 8px !important;
}
.gradio-container .markdown strong { color: var(--ab-ink-900) !important; font-weight: 600 !important; }
.gradio-container .markdown a { color: var(--ab-signal-500) !important; }
.gradio-container .markdown hr {
  border: none !important; border-top: 1px solid var(--ab-border) !important; margin: 18px 0 !important;
}

/* Hidden bridges from Plotly DOM → Gradio state */
#click-index-input, #legend-sync-input {
  position: absolute !important; width: 1px !important; height: 1px !important;
  overflow: hidden !important; opacity: 0 !important; pointer-events: none !important;
}

/* Footer */
.ab-footer { border-top: 1px solid var(--ab-border); margin-top: 36px; padding-top: 18px; text-align: center; }
.ab-footer__line {
  font-family: var(--font-body); color: var(--ab-ink-500); font-size: 12px; letter-spacing: 0.02em;
}
.ab-footer__line a { color: var(--ab-signal-500); }
.ab-footer__mark {
  font-family: var(--font-display); color: var(--ab-gilt-500); font-size: 14px;
  letter-spacing: 0.04em; margin-bottom: 6px;
}
.ab-footer__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
.ab-footer__mark .ar { font-family: 'Amiri', serif; }
"""

# ---------------------------------------------------------------------------
# Header / How-to / Footer markup
# ---------------------------------------------------------------------------
HEADER_HTML = """
<div class="ab-header">
  <div class="ab-header__brand">
    <span class="ab-header__mark"><span class="heb">א-ב</span> · <span class="ar">أب</span></span>
    <h1 class="ab-header__title">GuardLLM <em>Prompt Security Visualizer</em></h1>
  </div>
  <div class="ab-header__sub">
    Editorial inspection of the prompt attack surface.
    Powered by Llama Prompt Guard 2 (86M) on the neuralchemy corpus.
  </div>
</div>
"""

HOW_TO_HTML = """
<div class="ab-howto">
  <div class="ab-howto__step">
    <div class="ab-howto__num">01</div>
    <div class="ab-eyebrow">Map</div>
    <div class="ab-howto__title">Explore the landscape</div>
    <div class="ab-howto__body">
      Each point is a prompt placed by semantic similarity. Color encodes the attack class.
      Hover to preview, scroll to zoom, drag to pan.
    </div>
  </div>
  <div class="ab-howto__step">
    <div class="ab-howto__num">02</div>
    <div class="ab-eyebrow">Inspect</div>
    <div class="ab-howto__title">Click to analyze</div>
    <div class="ab-howto__body">
      Selecting a point runs the classifier and returns a verdict, a safety score,
      and the full class probability breakdown.
    </div>
  </div>
  <div class="ab-howto__step">
    <div class="ab-howto__num">03</div>
    <div class="ab-eyebrow">Probe</div>
    <div class="ab-howto__title">Try your own prompt</div>
    <div class="ab-howto__body">
      Paste any text into the custom field below to see whether the model would flag it
      as injection or jailbreak.
    </div>
  </div>
</div>
"""

# NOTE(review): the footer's original wording is not available; this minimal
# version reuses the header's brand mark and the app title — replace with the
# approved copy.
FOOTER_HTML = """
<footer class="ab-footer">
  <div class="ab-footer__mark"><span class="heb">א-ב</span> · <span class="ar">أب</span></div>
  <div class="ab-footer__line">GuardLLM — Prompt Security Visualizer</div>
</footer>
"""

# ---------------------------------------------------------------------------
# Gradio theme (parchment / ink)
# ---------------------------------------------------------------------------
ab_theme = gr.themes.Base(
    primary_hue=gr.themes.Color(
        c50=AB["parchment_50"],
        c100=AB["parchment_100"],
        c200=AB["parchment_200"],
        c300=AB["parchment_300"],
        c400=AB["gilt_300"],
        c500=AB["gilt_400"],
        c600=AB["gilt_500"],
        c700=AB["gilt_600"],
        c800=AB["ink_800"],
        c900=AB["ink_900"],
        c950=AB["ink_950"],
    ),
    neutral_hue=gr.themes.Color(
        c50=AB["parchment_50"],
        c100=AB["parchment_100"],
        c200=AB["parchment_200"],
        c300=AB["ink_200"],
        c400=AB["ink_300"],
        c500=AB["ink_500"],
        c600=AB["ink_600"],
        c700=AB["ink_700"],
        c800=AB["ink_800"],
        c900=AB["ink_900"],
        c950=AB["ink_950"],
    ),
    font=[gr.themes.GoogleFont("Geist"), "Inter", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("Geist Mono"), "JetBrains Mono", "monospace"],
).set(
    body_background_fill=AB["parchment_100"],
    body_text_color=AB["ink_900"],
    background_fill_primary=AB["parchment_50"],
    background_fill_secondary=AB["parchment_100"],
    border_color_primary="rgba(17,32,58,0.12)",
    block_background_fill=AB["parchment_50"],
    block_border_color="rgba(17,32,58,0.12)",
    block_label_text_color=AB["ink_700"],
    block_title_text_color=AB["ink_900"],
    input_background_fill=AB["parchment_50"],
    input_border_color="rgba(17,32,58,0.12)",
    input_border_color_focus=AB["gilt_400"],
    button_primary_background_fill=AB["ink_900"],
    button_primary_background_fill_hover=AB["ink_800"],
    button_primary_text_color=AB["parchment_50"],
    button_secondary_background_fill=AB["parchment_50"],
    button_secondary_background_fill_hover=AB["parchment_200"],
    button_secondary_text_color=AB["ink_900"],
)

# ---------------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="GuardLLM — Prompt Security Visualizer",
    theme=ab_theme,
    css=ALEPH_BETH_CSS,
) as demo:
    gr.HTML(HEADER_HTML)
    gr.HTML(HOW_TO_HTML)

    # Hidden bridges from Plotly DOM → Gradio state. They stay visible=True
    # and are hidden by the #click-index-input / #legend-sync-input CSS rule;
    # presumably so the JS bridge can find them in the DOM — confirm.
    click_index = gr.Textbox(value="", visible=True, elem_id="click-index-input")
    legend_sync = gr.Textbox(value="", visible=True, elem_id="legend-sync-input")

    with gr.Row():
        # ============================================================
        # LEFT — every way to pick a prompt
        # ============================================================
        with gr.Column(scale=3):
            gr.HTML(_section_heading("Map", "t-SNE — Prompt landscape"))
            tsne_plot = gr.Plot(
                value=build_tsne_figure(),
                label="t-SNE space",
                elem_id="tsne-chart",
                show_label=False,
            )
            gr.Markdown(
                "Click a point to inspect. "
                "Click a legend entry to isolate that category — click again to restore. "
                "Double-click a legend entry to toggle just that trace."
            )

            gr.HTML(_section_heading("Filter", "By category"))
            with gr.Row():
                select_all_btn = gr.Button("Select all", size="sm", scale=1)
                deselect_all_btn = gr.Button("Deselect all", size="sm", scale=1)
            category_filter = gr.CheckboxGroup(
                choices=UNIQUE_CATEGORIES,
                value=UNIQUE_CATEGORIES,
                label="Categories",
                show_label=False,
                interactive=True,
            )

            gr.HTML(_section_heading("Library", "Pick a prompt from the dataset"))
            prompt_dropdown = gr.Dropdown(
                choices=DROPDOWN_CHOICES,
                label="Search the dataset",
                show_label=False,
                filterable=True,
                interactive=True,
            )

            gr.HTML(_section_heading("Custom", "Analyze your own prompt"))
            manual_input = gr.Textbox(
                label="Prompt",
                show_label=False,
                placeholder="Type or paste a request to evaluate…",
                lines=3,
            )
            analyze_btn = gr.Button("Inspect", variant="primary")

        # ============================================================
        # RIGHT — the analysis only
        # ============================================================
        with gr.Column(scale=2):
            gr.HTML(_section_heading("Analysis", "Verdict & confidence"))
            result_html = gr.HTML(value=empty_analysis_html())
            risk_md = gr.Markdown(value="")
            full_prompt = gr.Textbox(
                label="Full prompt",
                lines=4,
                interactive=False,
                visible=True,
            )
            gr.Markdown("---")
            gr.HTML(build_stats_html())

    # ---- Events ----
    category_filter.change(
        fn=on_filter_change, inputs=[category_filter], outputs=[tsne_plot]
    )
    select_all_btn.click(
        fn=select_all_categories, inputs=[], outputs=[category_filter, tsne_plot]
    )
    deselect_all_btn.click(
        fn=deselect_all_categories, inputs=[], outputs=[category_filter, tsne_plot]
    )
    legend_sync.change(
        fn=on_legend_sync, inputs=[legend_sync], outputs=[category_filter, tsne_plot]
    )
    click_index.change(
        fn=on_index_input, inputs=[click_index],
        outputs=[result_html, risk_md, full_prompt],
    )
    prompt_dropdown.change(
        fn=on_dropdown_select, inputs=[prompt_dropdown],
        outputs=[result_html, risk_md, full_prompt],
    )
    analyze_btn.click(
        fn=on_manual_analyze, inputs=[manual_input], outputs=[result_html, risk_md]
    )
    manual_input.submit(
        fn=on_manual_analyze, inputs=[manual_input], outputs=[result_html, risk_md]
    )

    # Install the Plotly → hidden-textbox bridge once the page has loaded.
    demo.load(fn=None, inputs=None, outputs=None, js=PLOTLY_CLICK_JS)

    gr.HTML(FOOTER_HTML)

logger.info("Gradio app built. Ready to launch.")

if __name__ == "__main__":
    demo.launch()