# test3 / tests / guardrails_tests / test_presidio_pii.py
# DesertWolf's picture
# Upload folder using huggingface_hub
# 447ebeb verified
import sys
import os
import io, asyncio
import pytest
import time
from litellm import mock_completion
from unittest.mock import MagicMock, AsyncMock, patch
sys.path.insert(0, os.path.abspath("../.."))
import litellm
from litellm.proxy.guardrails.guardrail_hooks.presidio import _OPTIONAL_PresidioPIIMasking, PresidioPerRequestConfig
from litellm.types.guardrails import PiiEntityType, PiiAction
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching.caching import DualCache
from litellm.exceptions import BlockedPiiEntityError
from litellm.types.utils import CallTypes as LitellmCallTypes
@pytest.mark.asyncio
async def test_presidio_with_entities_config():
    """Only the entity types listed in ``pii_entities_config`` are masked.

    Requires an actual Presidio API; the analyzer/anonymizer endpoints are
    read from ``PRESIDIO_ANALYZER_API_BASE`` / ``PRESIDIO_ANONYMIZER_API_BASE``.
    """
    litellm._turn_on_debug()

    # Credit cards and e-mail addresses are configured; phone numbers are not.
    masked_entities = {
        PiiEntityType.CREDIT_CARD: PiiAction.MASK,
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=masked_entities,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    sample = "My credit card number is 4111-1111-1111-1111, my email is test@example.com, and my phone is 555-123-4567"

    # Step 1: the analyze payload must carry exactly the configured entities.
    payload = guardrail._get_presidio_analyze_request_payload(
        text=sample,
        presidio_config=None,
        request_data={},
    )
    assert "entities" in payload
    assert set(payload["entities"]) == set(masked_entities)

    # Step 2: run the full check (this calls the Presidio service).
    redacted_text = await guardrail.check_pii(
        text=sample,
        output_parse_pii=True,
        presidio_config=None,
        request_data={},
    )

    # Configured entities must no longer appear verbatim in the output ...
    for secret in ("4111-1111-1111-1111", "test@example.com"):
        assert secret not in redacted_text
    # ... while the phone number (not in the config) must be left untouched.
    assert "555-123-4567" in redacted_text

    # The exact replacement tokens depend on Presidio, so just show them.
    print(f"Redacted text: {redacted_text}")
@pytest.mark.asyncio
async def test_presidio_apply_guardrail():
    """``apply_guardrail`` with an empty entity config masks card and e-mail.

    Requires an actual Presidio API (endpoints come from the environment).
    """
    litellm._turn_on_debug()

    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config={},
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    response = await guardrail.apply_guardrail(
        text="My credit card number is 4111-1111-1111-1111 and my email is test@example.com",
        language="en",
    )
    print("response from apply guardrail for presidio: ", response)

    # With no explicit entity config, both values are expected to be masked.
    for secret in ("4111-1111-1111-1111", "test@example.com"):
        assert secret not in response
@pytest.mark.asyncio
async def test_presidio_with_blocked_entities():
    """A ``BLOCK`` entity makes ``check_pii`` raise ``BlockedPiiEntityError``.

    Requires an actual Presidio API (endpoints come from the environment).
    """
    litellm._turn_on_debug()

    entity_actions = {
        PiiEntityType.CREDIT_CARD: PiiAction.BLOCK,  # must abort the request
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,  # would merely be masked
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=entity_actions,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    sample = "My credit card number is 4111-1111-1111-1111 and my email is test@example.com"

    # Both configured entity types must be sent to the analyzer.
    payload = guardrail._get_presidio_analyze_request_payload(
        text=sample,
        presidio_config=None,
        request_data={},
    )
    assert "entities" in payload
    assert set(payload["entities"]) == set(entity_actions)

    # Only the raising call lives inside the context manager; statements
    # placed after it inside the block would never execute.
    with pytest.raises(BlockedPiiEntityError) as excinfo:
        await guardrail.check_pii(
            text=sample,
            output_parse_pii=True,
            presidio_config=None,
            request_data={},
        )

    # The error must identify the blocked entity and the guardrail raising it.
    raised = excinfo.value
    assert raised.entity_type == PiiEntityType.CREDIT_CARD
    assert raised.guardrail_name == guardrail.guardrail_name
@pytest.mark.asyncio
async def test_presidio_pre_call_hook_with_blocked_entities():
    """The pre-call hook rejects a chat completion containing a blocked entity."""
    entity_actions = {
        PiiEntityType.CREDIT_CARD: PiiAction.BLOCK,  # must abort the request
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,  # would merely be masked
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=entity_actions,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    # Chat completion request whose user message contains a credit card number.
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": "My credit card is 4111-1111-1111-1111 and my email is test@example.com.",
    }
    data = {
        "messages": [system_message, user_message],
        "model": "gpt-3.5-turbo",
    }

    # Collaborators the hook signature requires.
    user_api_key_dict = UserAPIKeyAuth(api_key="test_key")
    cache = DualCache()

    with pytest.raises(BlockedPiiEntityError) as excinfo:
        await guardrail.async_pre_call_hook(
            user_api_key_dict=user_api_key_dict,
            cache=cache,
            data=data,
            call_type="completion",
        )
    print(f"got error: {excinfo}")

    # The error must identify the blocked entity and the guardrail raising it.
    raised = excinfo.value
    assert raised.entity_type == PiiEntityType.CREDIT_CARD
    assert raised.guardrail_name == guardrail.guardrail_name
@pytest.mark.asyncio
@pytest.mark.parametrize("call_type", ["completion", "acompletion"])
async def test_presidio_pre_call_hook_with_different_call_types(call_type):
    """The pre-call hook masks configured PII for both sync and async call types."""
    masked_entities = {
        PiiEntityType.CREDIT_CARD: PiiAction.MASK,
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=masked_entities,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    # Request with a card number, an e-mail address and a phone number.
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message_in = {
        "role": "user",
        "content": "My credit card is 4111-1111-1111-1111 and my email is test@example.com. My phone number is 555-123-4567",
    }
    data = {
        "messages": [system_message, user_message_in],
        "model": "gpt-3.5-turbo",
    }

    # Collaborators the hook signature requires.
    user_api_key_dict = UserAPIKeyAuth(api_key="test_key")
    cache = DualCache()

    modified_data = await guardrail.async_pre_call_hook(
        user_api_key_dict=user_api_key_dict,
        cache=cache,
        data=data,
        call_type=call_type,
    )
    returned_system, returned_user = modified_data["messages"][0], modified_data["messages"][1]

    # The system prompt carries no PII and must come back unchanged.
    assert returned_system["content"] == "You are a helpful assistant."

    user_message = returned_user["content"]
    for secret in ("4111-1111-1111-1111", "test@example.com"):
        assert secret not in user_message
    # Phone numbers are not in the config, so they must survive.
    assert "555-123-4567" in user_message

    print(f"Modified user message for call_type={call_type}: {user_message}")
@pytest.mark.parametrize(
    "base_url",
    [
        "presidio-analyzer-s3pa:10000",
        "https://presidio-analyzer-s3pa:10000",
        "http://presidio-analyzer-s3pa:10000",
    ],
)
def test_validate_environment_missing_http(base_url):
    """``validate_environment`` adds ``http://`` when no scheme is given.

    URLs that already start with ``http://`` or ``https://`` keep their scheme;
    in every case the stored endpoints are expected to end with a slash.
    """
    pii_masking = _OPTIONAL_PresidioPIIMasking(mock_testing=True)

    # What the guardrail should end up with after normalisation.
    if base_url.startswith(("https://", "http://")):
        expected_url = base_url
    else:
        expected_url = "http://" + base_url
    expected_anonymizer = f"{expected_url}/anonymize/"
    expected_analyzer = f"{expected_url}/analyze/"

    # patch.dict restores os.environ on exit, so the change stays test-local.
    patched_env = {
        "PRESIDIO_ANALYZER_API_BASE": f"{base_url}/analyze",
        "PRESIDIO_ANONYMIZER_API_BASE": f"{base_url}/anonymize",
    }
    with patch.dict(os.environ, patched_env):
        pii_masking.validate_environment()

        actual_anonymizer = pii_masking.presidio_anonymizer_api_base
        assert actual_anonymizer == expected_anonymizer, "Got={}, Expected={}".format(
            actual_anonymizer, expected_anonymizer
        )
        assert pii_masking.presidio_analyzer_api_base == expected_analyzer
@pytest.mark.asyncio
async def test_output_parsing():
    """Masked tokens in an LLM response are replaced by the original values.

    Flow:
    - pretend Presidio already masked the input message (``pii_tokens`` is
      seeded by hand with the token -> original value mapping)
    - make a mocked LLM completion whose reply echoes a masked token
    - run the post-call success hook
    - assert that the response content contains the original value instead of
      the masked token

    NOTE: this test sets ``litellm.set_verbose`` and ``litellm.output_parse_pii``
    on the module and does not restore them afterwards.
    """
    litellm.set_verbose = True
    litellm.output_parse_pii = True
    pii_masking = _OPTIONAL_PresidioPIIMasking(mock_testing=True)

    # Unmasked original, for reference:
    #   "hello world, my name is Jane Doe. My number is: 034453334"
    filtered_message = [
        {
            "role": "user",
            "content": "hello world, my name is <PERSON>. My number is: <PHONE_NUMBER>",
        }
    ]

    # Seed the token mapping the hook uses to restore the original values.
    pii_masking.pii_tokens = {"<PERSON>": "Jane Doe", "<PHONE_NUMBER>": "034453334"}

    response = mock_completion(
        model="gpt-3.5-turbo",
        messages=filtered_message,
        mock_response="Hello <PERSON>! How can I assist you today?",
    )

    new_response = await pii_masking.async_post_call_success_hook(
        user_api_key_dict=UserAPIKeyAuth(),
        data={
            "messages": [{"role": "system", "content": "You are an helpfull assistant"}]
        },
        response=response,
    )

    assert (
        new_response.choices[0].message.content
        == "Hello Jane Doe! How can I assist you today?"
    )
# asyncio.run(test_output_parsing())
### UNIT TESTS FOR PRESIDIO PII MASKING ###

# Canned anonymizer-style results passed as ``mock_redacted_text`` by the tests
# below. In each item, ``start``/``end`` are offsets into the redacted ``text``
# that delimit the placeholder given in the item's own ``text`` field.

# Input A: a person name and a phone number were replaced.
input_a_anonymizer_results = {
    "text": "hello world, my name is <PERSON>. My number is: <PHONE_NUMBER>",
    "items": [
        {
            "start": 48,
            "end": 62,
            "entity_type": "PHONE_NUMBER",
            "text": "<PHONE_NUMBER>",
            "operator": "replace",
        },
        {
            "start": 24,
            "end": 32,
            "entity_type": "PERSON",
            "text": "<PERSON>",
            "operator": "replace",
        },
    ],
}

# Input B: only a person name was replaced.
input_b_anonymizer_results = {
    "text": "My name is <PERSON>, who are you? Say my name in your response",
    "items": [
        {
            "start": 11,
            "end": 19,
            "entity_type": "PERSON",
            "text": "<PERSON>",
            "operator": "replace",
        }
    ],
}
# PII masking with the canned result for input A
@pytest.mark.asyncio
async def test_presidio_pii_masking_input_a():
    """
    The pre-call hook returns a message containing both placeholders from
    the mocked result A (person name and phone number).
    """
    pii_masking = _OPTIONAL_PresidioPIIMasking(
        mock_testing=True, mock_redacted_text=input_a_anonymizer_results
    )

    _api_key = "sk-12345"
    request = {
        "messages": [
            {
                "role": "user",
                "content": "hello world, my name is Jane Doe. My number is: 23r323r23r2wwkl",
            }
        ]
    }

    new_data = await pii_masking.async_pre_call_hook(
        user_api_key_dict=UserAPIKeyAuth(api_key=_api_key),
        cache=DualCache(),
        data=request,
        call_type="completion",
    )

    masked_content = new_data["messages"][0]["content"]
    assert "<PERSON>" in masked_content
    assert "<PHONE_NUMBER>" in masked_content
# PII masking with the canned result for input B (must differ from A's result)
@pytest.mark.asyncio
async def test_presidio_pii_masking_input_b():
    """
    The pre-call hook returns a message containing the person placeholder
    from the mocked result B and no phone-number placeholder.
    """
    pii_masking = _OPTIONAL_PresidioPIIMasking(
        mock_testing=True, mock_redacted_text=input_b_anonymizer_results
    )

    _api_key = "sk-12345"
    request = {
        "messages": [
            {
                "role": "user",
                "content": "My name is Jane Doe, who are you? Say my name in your response",
            }
        ]
    }

    new_data = await pii_masking.async_pre_call_hook(
        user_api_key_dict=UserAPIKeyAuth(api_key=_api_key),
        cache=DualCache(),
        data=request,
        call_type="completion",
    )

    masked_content = new_data["messages"][0]["content"]
    assert "<PERSON>" in masked_content
    assert "<PHONE_NUMBER>" not in masked_content
@pytest.mark.asyncio
async def test_presidio_pii_masking_logging_output_only_no_pre_api_hook():
    """A ``logging_only`` guardrail must not run for the ``pre_call`` event."""
    from litellm.types.guardrails import GuardrailEventHooks

    pii_masking = _OPTIONAL_PresidioPIIMasking(
        logging_only=True,
        mock_testing=True,
        mock_redacted_text=input_b_anonymizer_results,
    )

    test_messages = [
        {
            "role": "user",
            "content": "My name is Jane Doe, who are you? Say my name in your response",
        }
    ]

    # Only the routing decision is under test here; no hook is invoked, so no
    # API key object or cache is needed.
    assert (
        pii_masking.should_run_guardrail(
            data={"messages": test_messages},
            event_type=GuardrailEventHooks.pre_call,
        )
        is False
    )
@pytest.mark.asyncio
@patch.dict(os.environ, {
    "PRESIDIO_ANALYZER_API_BASE": "http://localhost:5002",
    "PRESIDIO_ANONYMIZER_API_BASE": "http://localhost:5001"
})
async def test_presidio_pii_masking_logging_output_only_logged_response_guardrails_config():
    """A ``logging_only`` guardrails config registers a logging-only Presidio callback.

    The Presidio endpoints are supplied through the ``patch.dict`` decorator
    rather than by writing to ``os.environ`` directly.

    NOTE: the test expects ``litellm.guardrail_name_config_map`` to be empty
    on entry and leaves the initialised guardrail in litellm's module state.
    """
    from typing import Dict, List, Optional

    import litellm
    from litellm.proxy.guardrails.init_guardrails import initialize_guardrails
    from litellm.types.guardrails import (
        GuardrailItem,
        GuardrailItemSpec,
        GuardrailEventHooks,
    )

    litellm.set_verbose = True

    guardrails_config: List[Dict[str, GuardrailItemSpec]] = [
        {
            "pii_masking": {
                "callbacks": ["presidio"],
                "default_on": True,
                "logging_only": True,
            }
        }
    ]
    litellm_settings = {"guardrails": guardrails_config}

    # Initialisation must add exactly one entry to the name -> config map.
    assert len(litellm.guardrail_name_config_map) == 0
    initialize_guardrails(
        guardrails_config=guardrails_config,
        premium_user=True,
        config_file_path="",
        litellm_settings=litellm_settings,
    )
    assert len(litellm.guardrail_name_config_map) == 1

    # Pick the Presidio guardrail out of the registered callbacks
    # (the last matching instance wins).
    pii_masking_obj: Optional[_OPTIONAL_PresidioPIIMasking] = None
    for registered in litellm.callbacks:
        print(f"CALLBACK: {registered}")
        if isinstance(registered, _OPTIONAL_PresidioPIIMasking):
            pii_masking_obj = registered

    assert pii_masking_obj is not None
    assert hasattr(pii_masking_obj, "logging_only")
    assert pii_masking_obj.event_hook == GuardrailEventHooks.logging_only
    assert pii_masking_obj.should_run_guardrail(
        data={}, event_type=GuardrailEventHooks.logging_only
    )
@pytest.mark.asyncio
async def test_presidio_language_configuration():
    """``presidio_language`` ends up in the analyze payload; the default is ``"en"``.

    ``mock_testing=True`` is used so that no Presidio API is contacted.
    """
    litellm._turn_on_debug()

    # (extra constructor kwargs, text to analyze, language expected in payload)
    scenarios = [
        # German
        ({"presidio_language": "de"}, "Meine Telefonnummer ist +49 30 12345678", "de"),
        # Spanish
        ({"presidio_language": "es"}, "Mi número de teléfono es +34 912 345 678", "es"),
        # No language given -> English
        ({}, "My phone number is +1 555-123-4567", "en"),
    ]

    for language_kwargs, sample_text, expected_language in scenarios:
        guardrail = _OPTIONAL_PresidioPIIMasking(
            pii_entities_config={},
            mock_testing=True,
            **language_kwargs,
        )

        payload = guardrail._get_presidio_analyze_request_payload(
            text=sample_text,
            presidio_config=None,
            request_data={},
        )

        # The language is propagated and the text is passed through unchanged.
        assert payload["language"] == expected_language
        assert payload["text"] == sample_text
@pytest.mark.asyncio
async def test_presidio_language_configuration_with_per_request_override():
    """A per-request language takes precedence over the guardrail's configured one."""
    litellm._turn_on_debug()

    # Guardrail configured with German as its language.
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config={},
        presidio_language="de",
        mock_testing=True,
    )
    sample_text = "Test text with PII"

    # Case 1: the request supplies French, which must win over German.
    overridden_payload = guardrail._get_presidio_analyze_request_payload(
        text=sample_text,
        presidio_config=PresidioPerRequestConfig(language="fr"),
        request_data={},
    )
    assert overridden_payload["language"] == "fr"
    assert overridden_payload["text"] == sample_text

    # Case 2: no per-request config, so the configured German applies.
    default_payload = guardrail._get_presidio_analyze_request_payload(
        text=sample_text,
        presidio_config=None,
        request_data={},
    )
    assert default_payload["language"] == "de"
    assert default_payload["text"] == sample_text