import re
import gradio as gr
from gliner import GLiNER
from cerberus import Validator
from transformers import AutoTokenizer

# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f.readlines()]

MAX_TOKENS = 512  # безопасный лимит токенов на один фрагмент

# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------

schema = {
    "text": {
        "type": "string",
        "required": True,  # without this, Cerberus accepts a payload with no "text" key
        "empty": False
    }
}

validator = Validator(schema)

def validate_input(data: dict) -> str:
    """Validate the incoming payload against the Cerberus schema and return the text."""
    if not validator.validate(data):
        raise ValueError(f"Invalid input data. Errors: {validator.errors}")
    return data["text"]

# ----------------------------------------------------------------------------
# Chunking + Anonymization logic
# ----------------------------------------------------------------------------

def split_text_into_chunks(text, max_tokens=MAX_TOKENS):
    """Greedily pack whitespace-separated words into chunks of at most
    max_tokens subword tokens. Returns (chunk_text, char_offset) pairs;
    offsets assume single-space joins, so they are approximate if the
    original text used other whitespace."""
    words = text.split()
    chunks = []
    chunk = []
    chunk_token_count = 0
    current_offset = 0

    for word in words:
        token_count = len(tokenizer.tokenize(word))
        # Only start a new chunk if the current one is non-empty; otherwise a
        # single over-long word would emit an empty chunk.
        if chunk and chunk_token_count + token_count > max_tokens:
            chunk_text = ' '.join(chunk)
            chunks.append((chunk_text, current_offset))
            current_offset += len(chunk_text) + 1
            chunk = [word]
            chunk_token_count = token_count
        else:
            chunk.append(word)
            chunk_token_count += token_count

    if chunk:
        chunk_text = ' '.join(chunk)
        chunks.append((chunk_text, current_offset))

    return chunks
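
# A minimal sketch of the chunker's behavior with a toy limit (real calls use
# MAX_TOKENS=512), assuming each word tokenizes to a single subword:
#
#   split_text_into_chunks("one two three four five", max_tokens=4)
#   # -> [("one two three four", 0), ("five", 19)]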

def anonymize_text_long(text):
    """Run GLiNER over each chunk and replace detected entities with stable
    <PII_LABEL_N> placeholders. Returns the anonymized text and a map from
    label to the list of original strings (placeholder N maps to index N-1)."""
    chunks = split_text_into_chunks(text)
    full_anonymized = ""
    global_entity_map = {}

    for chunk_text, _ in chunks:
        entities = model.predict_entities(chunk_text, labels=labels, threshold=0.2)
        entities.sort(key=lambda e: e['start'])

        anonymized_chunk = ""
        next_start = 0

        for entity in entities:
            label = entity['label'].replace(" ", "_").upper()
            original_text = entity['text']
            start_idx, end_idx = entity['start'], entity['end']

            # Skip spans that overlap an entity we have already replaced.
            if start_idx < next_start:
                continue

            # Reuse the index of a string seen earlier so repeated mentions map
            # to the same placeholder across chunks.
            if label not in global_entity_map:
                global_entity_map[label] = [original_text]
                idx = 1
            elif original_text in global_entity_map[label]:
                idx = global_entity_map[label].index(original_text) + 1
            else:
                global_entity_map[label].append(original_text)
                idx = len(global_entity_map[label])

            anonymized_chunk += chunk_text[next_start:start_idx]
            anonymized_chunk += f"<PII_{label}_{idx}>"
            next_start = end_idx

        anonymized_chunk += chunk_text[next_start:]
        full_anonymized += anonymized_chunk + " "

    return full_anonymized.strip(), global_entity_map
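
# Illustration of the expected placeholder format (hypothetical model output;
# actual labels come from labels.txt and actual spans from GLiNER):
#
#   anonymize_text_long("My name is John Smith, call me at 555-0123.")
#   # -> ("My name is <PII_FULL_NAME_1>, call me at <PII_PHONE_NUMBER_1>.",
#   #     {"FULL_NAME": ["John Smith"], "PHONE_NUMBER": ["555-0123"]})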

# ----------------------------------------------------------------------------
# De-anonymization logic
# ----------------------------------------------------------------------------

def deanonymize_text(anonymized_response, entity_map):
    """Replace <PII_LABEL_N> placeholders with the original strings recorded
    in entity_map; placeholders without a matching entry are left unchanged."""
    def replace_match(match):
        label = match.group(1)
        idx = int(match.group(2)) - 1  # placeholders are 1-based
        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        return match.group(0)

    pattern = r"<PII_(\w+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)
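
# Round-trip sketch using the hypothetical entity map from the example above:
#
#   deanonymize_text("Hello <PII_FULL_NAME_1>!", {"FULL_NAME": ["John Smith"]})
#   # -> "Hello John Smith!"
#
# Placeholders with no map entry (e.g. <PII_CITY_3>) are left intact.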

# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------

def anonymize_fn(original_text):
    data = {"text": original_text}
    try:
        user_text = validate_input(data)
    except ValueError as e:
        return "", {}, f"Validation error: {str(e)}"

    anonymized, entities = anonymize_text_long(user_text)
    return anonymized, entities, "Successfully anonymized!"

def deanonymize_fn(anonymized_llm_response, entity_map):
    if not anonymized_llm_response.strip():
        return "", "Вставьте анонимизированный текст."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."

    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "Успешно деанонимизировано!"

md_text = """# Анонимизатор психотерапевтических сессий

Вставьте текст в раздел \"Исходный текст\", чтобы анонимизировать сензитивные данные.
"""

with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        with gr.Column():
            original_text = gr.Textbox(
                lines=6, label="Source text (to anonymize)"
            )
            anonymized_text = gr.Textbox(
                lines=6, label="Anonymized text", interactive=False
            )
            button_anon = gr.Button("Anonymize")

            entity_map_state = gr.State()
            message_out = gr.Textbox(label="Status", interactive=False)

            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out]
            )

        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                lines=6, label="Anonymized session (paste here)"
            )
            deanonymized_text = gr.Textbox(
                lines=6, label="De-anonymized session", interactive=False
            )
            button_deanon = gr.Button("De-anonymize")

            message_out_de = gr.Textbox(label="Status", interactive=False)

            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de]
            )

if __name__ == "__main__":
    demo.launch()