ItzRoBeerT committed on
Commit
17de1f9
·
1 Parent(s): 9957dd2

Added prompt injection tool

Browse files
Files changed (2) hide show
  1. mcp_server.py +2 -2
  2. src/securty/prompt_injection.py +243 -0
mcp_server.py CHANGED
@@ -1,5 +1,5 @@
1
  from mcp.server.fastmcp import FastMCP
2
-
3
  mcp = FastMCP("Youtube Service")
4
 
5
  @mcp.tool()
@@ -15,7 +15,7 @@ def say_hello(name: str) -> str:
15
  """
16
  return f"Hello, {name}!"
17
 
18
-
19
 
20
  if __name__ == "__main__":
21
  mcp.run()
 
1
  from mcp.server.fastmcp import FastMCP
2
+ from src.securty.prompt_injection import check_prompt_injection
3
  mcp = FastMCP("Youtube Service")
4
 
5
  @mcp.tool()
 
15
  """
16
  return f"Hello, {name}!"
17
 
18
+ mcp.add_tool(check_prompt_injection)
19
 
20
  if __name__ == "__main__":
21
  mcp.run()
src/securty/prompt_injection.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List, Tuple
3
+
4
+ def check_prompt_injection(message: str) -> Dict[str, any]:
5
+ """
6
+ Checks if the message contains a prompt injection attempt.
7
+
8
+ Note: This function expects English text. If the model receives a message in another
9
+ language, it should translate it to English before calling this function.
10
+
11
+ Args:
12
+ message (str): The message to check (should be in English).
13
+
14
+ Returns:
15
+ Dict: A dictionary containing detection results with risk level and details.
16
+ """
17
+ if not message or not isinstance(message, str):
18
+ return {
19
+ "is_injection": False,
20
+ "risk_level": "none",
21
+ "confidence": 0.0,
22
+ "detected_patterns": [],
23
+ "risk_score": 0,
24
+ "message": "No valid input to analyze"
25
+ }
26
+
27
+ message_lower = message.lower().strip()
28
+
29
+ risk_score = 0
30
+ detected_patterns = []
31
+
32
+
33
+ suspicious_chars = {
34
+ '{}': 2,
35
+ '><': 1,
36
+ '&': 1,
37
+ '%': 1,
38
+ '$': 2,
39
+ '#': 1,
40
+ '|': 2,
41
+ ';': 3,
42
+ '`': 3,
43
+ '\\': 2,
44
+ }
45
+
46
+ char_score = 0
47
+ found_chars = []
48
+ for char, weight in suspicious_chars.items():
49
+ if char in message:
50
+ char_score += weight * message.count(char)
51
+ found_chars.append(char)
52
+
53
+ if char_score > 3:
54
+ risk_score += min(char_score, 10)
55
+ detected_patterns.append(f"Suspicious characters: {', '.join(found_chars)}")
56
+
57
+ # Prompt injection patterns
58
+ injection_patterns = [
59
+ # Direct instruction attempts
60
+ (r'\b(ignore|disregard|forget|skip)\s+(all\s+)?(previous|prior|above|earlier|past)\s+(instructions?|prompts?|rules?|commands?|directives?)', 9),
61
+ (r'\b(system|assistant|ai|bot):\s*', 6),
62
+ (r'\b(you\s+are\s+now|from\s+now\s+on|starting\s+now|new\s+instructions?)', 8),
63
+ (r'\b(act\s+as|pretend\s+to\s+be|roleplay\s+as|behave\s+like|simulate\s+being)', 6),
64
+
65
+ # System manipulation
66
+ (r'\[system\]|\[user\]|\[assistant\]|\[human\]', 8),
67
+ (r'<\s*(system|user|assistant|human)\s*>', 8),
68
+ (r'\b(override|bypass|circumvent|disable|turn\s+off)\s+(security|safety|guidelines|restrictions|filters)', 10),
69
+ (r'\b(jailbreak|break\s+free|escape\s+from|break\s+out)', 9),
70
+
71
+ # Data extraction attempts
72
+ (r'\b(repeat|echo|print|output|display|show|reveal)\s+(your|the)\s+(instructions?|prompts?|system\s+message|guidelines|rules)', 8),
73
+ (r'\b(what\s+(are\s+)?your|tell\s+me\s+your|give\s+me\s+your)\s+(instructions?|prompts?|guidelines|rules|system\s+message)', 7),
74
+ (r'\b(show\s+me|reveal|display|expose)\s+(your\s+)?(source|code|prompt|instructions?|system)', 7),
75
+
76
+ # Mode changes
77
+ (r'\b(developer|debug|admin|god|root|sudo)\s+mode', 8),
78
+ (r'\b(unrestricted|unlimited|uncensored|unfiltered)\s+(mode|access|version)', 8),
79
+ (r'\b(enable|activate|turn\s+on)\s+(developer|debug|admin)\s+mode', 9),
80
+
81
+ # Character breaking
82
+ (r'\b(break\s+character|step\s+out\s+of\s+character|ignore\s+your\s+role)', 7),
83
+ (r'\b(stop\s+being|quit\s+being|cease\s+being)\s+(an\s+)?(assistant|ai|bot)', 7),
84
+
85
+ # Social engineering
86
+ (r'\b(emergency|urgent|critical|important):\s*(ignore|override|bypass)', 6),
87
+ (r'\b(i\s+am\s+(your\s+)?(developer|creator|owner|admin|boss))', 5),
88
+ (r'\b(this\s+is\s+a\s+(test|drill|simulation|experiment))', 4),
89
+ (r'\b(for\s+research\s+purposes|educational\s+purposes)\s*(ignore|bypass)', 5),
90
+
91
+ # MCP/Tool specific
92
+ (r'\b(call|invoke|execute|use|run)\s+(tool|function|mcp|command)\s+(without|bypass|ignore|skip)\s+(safety|security|validation|checks?)', 9),
93
+ (r'\b(mcp|tool|function)\s+(override|bypass|ignore|disable)', 8),
94
+ (r'\b(raw|direct|unfiltered|unsafe)\s+(tool|mcp|function|command)\s+(access|call|execution)', 8),
95
+ (r'\b(disable|turn\s+off|deactivate|skip)\s+(safety|security|validation|filtering|checks?)', 9),
96
+
97
+ # Advanced manipulation
98
+ (r'\b(modify|change|alter|update)\s+(your\s+)?(behavior|instructions|guidelines|rules)', 7),
99
+ (r'\b(temporary|temporarily)\s+(ignore|disable|bypass)', 6),
100
+ (r'\b(just\s+this\s+once|only\s+this\s+time)\s+(ignore|bypass)', 5),
101
+ ]
102
+
103
+ pattern_matches = []
104
+ for pattern, weight in injection_patterns:
105
+ matches = re.findall(pattern, message_lower, re.IGNORECASE)
106
+ if matches:
107
+ risk_score += weight * len(matches)
108
+ pattern_matches.append(f"Injection pattern detected (weight: {weight})")
109
+
110
+ detected_patterns.extend(pattern_matches)
111
+
112
+ # Suspicious phrases
113
+ suspicious_phrases = {
114
+ 'ignore previous instructions': 10,
115
+ 'forget everything': 8,
116
+ 'new instructions': 7,
117
+ 'system prompt': 8,
118
+ 'you are now': 6,
119
+ 'act as if': 5,
120
+ 'pretend that': 5,
121
+ 'roleplay as': 5,
122
+ 'break character': 7,
123
+ 'step out of character': 7,
124
+ 'developer mode': 8,
125
+ 'admin mode': 8,
126
+ 'god mode': 7,
127
+ 'debug mode': 7,
128
+ 'unrestricted mode': 9,
129
+ 'jailbreak': 9,
130
+ 'prompt injection': 10,
131
+ 'ignore safety': 9,
132
+ 'bypass security': 9,
133
+ 'disable filters': 8,
134
+ 'raw access': 7,
135
+ 'direct access': 6,
136
+ 'unfiltered access': 8,
137
+ 'override safety': 9,
138
+ 'emergency override': 8,
139
+ 'for research purposes ignore': 6,
140
+ 'this is a test ignore': 5,
141
+ 'temporarily ignore': 6,
142
+ 'just this once ignore': 5,
143
+ 'modify your behavior': 7,
144
+ 'change your instructions': 8,
145
+ 'update your guidelines': 7,
146
+ 'alter your rules': 7,
147
+ }
148
+
149
+ phrase_matches = []
150
+ for phrase, weight in suspicious_phrases.items():
151
+ if phrase in message_lower:
152
+ risk_score += weight
153
+ phrase_matches.append(f"Suspicious phrase: '{phrase}'")
154
+
155
+ detected_patterns.extend(phrase_matches)
156
+
157
+ # Code injection patterns
158
+ code_patterns = [
159
+ (r'```\s*(python|javascript|bash|sh|cmd|powershell|sql|php)', 4),
160
+ (r'\b(eval|exec|system|subprocess|os\.|import\s+os|require\()', 6),
161
+ (r'<script|javascript:|vbscript:|data:|file://|ftp://', 7),
162
+ (r'\{\{.*\}\}', 5), # Template injection
163
+ (r'\$\{.*\}', 5), # Variable substitution
164
+ (r'<%.*%>', 5), # ASP/ERB style
165
+ (r'<\?.*\?>', 5), # PHP style
166
+ (r'\{\%.*\%\}', 5), # Jinja2/Django style
167
+ ]
168
+
169
+ for pattern, weight in code_patterns:
170
+ matches = re.findall(pattern, message_lower, re.IGNORECASE)
171
+ if matches:
172
+ risk_score += weight * len(matches)
173
+ detected_patterns.append(f"Code injection pattern detected")
174
+
175
+ # 5. Length and repetition analysis
176
+ if len(message) > 2000:
177
+ risk_score += 2
178
+ detected_patterns.append("Unusually long message")
179
+
180
+ # Check for repeated patterns (could indicate injection attempts)
181
+ words = message_lower.split()
182
+ if len(words) > 10:
183
+ word_freq = {}
184
+ for word in words:
185
+ if len(word) > 3:
186
+ word_freq[word] = word_freq.get(word, 0) + 1
187
+
188
+ repeated_words = [(word, count) for word, count in word_freq.items() if count > 3]
189
+ if repeated_words:
190
+ risk_score += min(len(repeated_words) * 2, 5)
191
+ detected_patterns.append(f"Excessive word repetition detected")
192
+
193
+ # Unicode/encoding tricks
194
+ suspicious_unicode = [
195
+ '\u200b', # Zero-width space
196
+ '\u200c', # Zero-width non-joiner
197
+ '\u200d', # Zero-width joiner
198
+ '\ufeff', # Byte order mark
199
+ ]
200
+
201
+ for char in suspicious_unicode:
202
+ if char in message:
203
+ risk_score += 3
204
+ detected_patterns.append("Suspicious Unicode characters detected")
205
+ break
206
+
207
+ # Multiple instruction attempts (layered attacks)
208
+ instruction_keywords = ['ignore', 'forget', 'disregard', 'override', 'bypass', 'disable']
209
+ instruction_count = sum(1 for keyword in instruction_keywords if keyword in message_lower)
210
+ if instruction_count >= 3:
211
+ risk_score += instruction_count * 2
212
+ detected_patterns.append(f"Multiple instruction manipulation attempts ({instruction_count})")
213
+
214
+ # Calculate risk level and confidence
215
+ if risk_score >= 15:
216
+ risk_level = "high"
217
+ confidence = min(0.9, 0.5 + (risk_score - 15) * 0.02)
218
+ elif risk_score >= 8:
219
+ risk_level = "medium"
220
+ confidence = min(0.8, 0.3 + (risk_score - 8) * 0.03)
221
+ elif risk_score >= 3:
222
+ risk_level = "low"
223
+ confidence = min(0.6, 0.1 + risk_score * 0.05)
224
+ else:
225
+ risk_level = "none"
226
+ confidence = 0.0
227
+
228
+ # Determine if it's likely an injection
229
+ is_injection = risk_score >= 8
230
+
231
+ if is_injection:
232
+ result_message = f"⚠️ Potential prompt injection detected (Risk: {risk_level}, Score: {risk_score})"
233
+ else:
234
+ result_message = f"✅ No significant prompt injection patterns detected (Score: {risk_score})"
235
+
236
+ return {
237
+ "is_injection": is_injection,
238
+ "risk_level": risk_level,
239
+ "risk_score": risk_score,
240
+ "confidence": round(confidence, 2),
241
+ "detected_patterns": detected_patterns,
242
+ "message": result_message
243
+ }