anonymizing-sessions

Running

App Files Files

MentalTech committed on May 20

Commit

9c85f16

verified ·

1 Parent(s): ec93d27

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -61

app.py CHANGED Viewed

@@ -2,21 +2,24 @@ import re
 import gradio as gr
 from gliner import GLiNER
 from cerberus import Validator
 # ----------------------------------------------------------------------------
 # Load model + labels
 # ----------------------------------------------------------------------------
 model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
 with open("labels.txt", "r", encoding="utf-8") as f:
     labels = [line.strip() for line in f.readlines()]
 # ----------------------------------------------------------------------------
 # Simple Cerberus validation for incoming data
 # ----------------------------------------------------------------------------
-# We expect a dict with at least {"text": "<some string>"}
 schema = {
     "text": {
         "type": "string",
@@ -26,74 +29,88 @@ schema = {
 validator = Validator(schema)
 def validate_input(data: dict) -> str:
-    """Validate that data has a non-empty 'text' key."""
     if not validator.validate(data):
-        # If invalid, raise an exception. You could handle this more gracefully if you like.
         raise ValueError(f"Invalid input data. Errors: {validator.errors}")
     return data["text"]
 # ----------------------------------------------------------------------------
-# Core anonymize / de-anonymize logic (same as before)
 # ----------------------------------------------------------------------------
-def anonymize_text(text):
-    """
-    1) Detect PII using GLiNER,
-    2) Replace each entity with a placeholder (<PII_LABEL_INDEX>)
-    3) Return anonymized_text + entity_map
-    """
-    entities = model.predict_entities(text, labels=labels, threshold=0.2)
-    # Sort by start index to apply placeholders in correct order
-    entities.sort(key=lambda e: e['start'])
-    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
-    anonymized_text = ""
-    next_start = 0
-    for entity in entities:
-        label = entity['label'].replace(" ", "_").upper()
-        original_text = entity['text']
-        start_idx, end_idx = entity['start'], entity['end']
-        if label not in entity_map:
-            entity_map[label] = [original_text]
-            idx = 1
         else:
-            # If same exact string repeated, use the same index as before
-            if original_text in entity_map[label]:
-                idx = entity_map[label].index(original_text) + 1
             else:
-                entity_map[label].append(original_text)
-                idx = len(entity_map[label])
-        # Copy everything before this entity
-        anonymized_text += text[next_start:start_idx]
-        # Insert placeholder
-        anonymized_text += f"<PII_{label}_{idx}>"
-        next_start = end_idx
-    # Remainder of the text after last entity
-    anonymized_text += text[next_start:]
-    return anonymized_text, entity_map
-def deanonymize_text(anonymized_response, entity_map):
-    """
-    Replace <PII_LABEL_INDEX> placeholders in anonymized_response
-    with their original strings from entity_map.
-    """
     def replace_match(match):
-        label = match.group(1)  # e.g. "PERSON"
-        idx_str = match.group(2)  # e.g. "1"
-        idx = int(idx_str) - 1    # 1-based index -> 0-based list index
         if label in entity_map and 0 <= idx < len(entity_map[label]):
             return entity_map[label][idx]
-        return match.group(0)  # If something is off, return the placeholder as-is
     pattern = r"<PII_(\w+)_(\d+)>"
     return re.sub(pattern, replace_match, anonymized_response)
@@ -103,18 +120,15 @@ def deanonymize_text(anonymized_response, entity_map):
 # ----------------------------------------------------------------------------
 def anonymize_fn(original_text):
-    # We’ll do a simple dict so we can pass it to our Cerberus validator:
     data = {"text": original_text}
     try:
         user_text = validate_input(data)
     except ValueError as e:
-        # If invalid, show error in Gradio output
         return "", {}, f"Validation error: {str(e)}"
-    anonymized, entities = anonymize_text(user_text)
     return anonymized, entities, "Успешно анонимизировано!"
 def deanonymize_fn(anonymized_llm_response, entity_map):
     if not anonymized_llm_response.strip():
         return "", "Вставьте анонимизированный текст."
@@ -124,11 +138,9 @@ def deanonymize_fn(anonymized_llm_response, entity_map):
     result = deanonymize_text(anonymized_llm_response, entity_map)
     return result, "Успешно деанонимизировано!"
 md_text = """# Анонимизатор психотерапевтических сессий
-Вставьте текст в раздел "Исходный текст", чтобы анонимизировать сензитивные данные.
 """
 with gr.Blocks() as demo:
@@ -144,9 +156,7 @@ with gr.Blocks() as demo:
             )
             button_anon = gr.Button("Анонимизировать")
-            # Hidden state to store the entity map
             entity_map_state = gr.State()
             message_out = gr.Textbox(label="Status", interactive=False)
             button_anon.click(
@@ -173,4 +183,4 @@ with gr.Blocks() as demo:
             )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from gliner import GLiNER
 from cerberus import Validator
+from transformers import AutoTokenizer
 # ----------------------------------------------------------------------------
 # Load model + labels
 # ----------------------------------------------------------------------------
 model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
+tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
 with open("labels.txt", "r", encoding="utf-8") as f:
     labels = [line.strip() for line in f.readlines()]
+MAX_TOKENS = 512  # safe upper limit on tokens per chunk
 # ----------------------------------------------------------------------------
 # Simple Cerberus validation for incoming data
 # ----------------------------------------------------------------------------
 schema = {
     "text": {
         "type": "string",
 validator = Validator(schema)
 def validate_input(data: dict) -> str:
     """Return ``data["text"]`` after checking *data* against the schema.

     The check is delegated to the module-level Cerberus ``validator``.

     Raises:
         ValueError: If validation fails; the message carries
             ``validator.errors``.
     """
     if validator.validate(data):
         return data["text"]
     raise ValueError(f"Invalid input data. Errors: {validator.errors}")
 # ----------------------------------------------------------------------------
+# Chunking + Anonymization logic
 # ----------------------------------------------------------------------------
def split_text_into_chunks(text, max_tokens=MAX_TOKENS):
    """Split *text* into whitespace-delimited chunks that fit a token budget.

    Words come from ``text.split()`` and are packed greedily: a chunk is
    closed as soon as adding the next word would push its token count, as
    measured by the module-level ``tokenizer``, above ``max_tokens``.

    Args:
        text: Input string. Runs of whitespace (including newlines) are
            collapsed, because every chunk is rebuilt with single spaces.
        max_tokens: Maximum number of tokenizer tokens per chunk.

    Returns:
        A list of ``(chunk_text, offset)`` tuples. ``offset`` is the start of
        the chunk inside the whitespace-normalised text
        (``" ".join(text.split())``), not inside the original ``text``.
        A single word that alone exceeds ``max_tokens`` is emitted as its own
        oversized chunk; words are never split.
    """
    # NOTE(review): token counts come from the "xlm-roberta-base" tokenizer
    # while entities are predicted by the GLiNER model; confirm the two
    # tokenizations are close enough for MAX_TOKENS to be a safe budget.
    chunks = []
    current_words = []
    current_tokens = 0
    offset = 0
    for word in text.split():
        word_tokens = len(tokenizer.tokenize(word))
        # Close the chunk only if it already holds words; otherwise an
        # oversized first word would emit an empty chunk and shift every
        # later offset by one.
        if current_words and current_tokens + word_tokens > max_tokens:
            chunk_text = " ".join(current_words)
            chunks.append((chunk_text, offset))
            offset += len(chunk_text) + 1  # +1 for the joining space
            current_words = [word]
            current_tokens = word_tokens
        else:
            current_words.append(word)
            current_tokens += word_tokens
    if current_words:
        chunks.append((" ".join(current_words), offset))
    return chunks
def anonymize_text_long(text):
    """Anonymize *text* of any length by running PII detection per chunk.

    The text is cut with ``split_text_into_chunks``; each chunk is passed to
    ``model.predict_entities`` and every detected entity is replaced by a
    ``<PII_{LABEL}_{index}>`` placeholder. Indices are shared across chunks,
    so the same string under the same label always gets the same placeholder.

    Args:
        text: The text to anonymize.

    Returns:
        A ``(anonymized_text, entity_map)`` tuple. ``entity_map`` maps each
        upper-cased, underscore-joined label to the list of original strings;
        placeholder index ``n`` refers to element ``n - 1`` of that list.
        Chunks are re-joined with single spaces, so the original whitespace
        layout of ``text`` is not preserved.
    """
    anonymized_chunks = []
    global_entity_map = {}
    for chunk_text, _offset in split_text_into_chunks(text):
        if not chunk_text:
            # Nothing to detect; avoids calling the model on an empty string.
            continue
        entities = sorted(
            model.predict_entities(chunk_text, labels=labels, threshold=0.2),
            key=lambda e: e["start"],
        )
        pieces = []
        next_start = 0
        for entity in entities:
            start_idx, end_idx = entity["start"], entity["end"]
            if start_idx < next_start and end_idx <= next_start:
                # Span lies wholly inside text that was already replaced;
                # processing it would move next_start backwards and
                # duplicate part of the chunk.
                continue
            label = entity["label"].replace(" ", "_").upper()
            original_text = entity["text"]
            known_values = global_entity_map.setdefault(label, [])
            if original_text in known_values:
                idx = known_values.index(original_text) + 1
            else:
                known_values.append(original_text)
                idx = len(known_values)
            pieces.append(chunk_text[next_start:start_idx])
            pieces.append(f"<PII_{label}_{idx}>")
            next_start = end_idx
        pieces.append(chunk_text[next_start:])
        anonymized_chunks.append("".join(pieces))
    return " ".join(anonymized_chunks).strip(), global_entity_map
+# ----------------------------------------------------------------------------
+# De-anonymization logic
+# ----------------------------------------------------------------------------
# The label part accepts any character except angle brackets, so labels that
# contain an apostrophe or hyphen (e.g. DRIVER'S_LICENSE_NUMBER) still match.
# The greedy label group backtracks to the last "_<digits>>" suffix.
_PLACEHOLDER_PATTERN = re.compile(r"<PII_([^<>]+)_(\d+)>")


def deanonymize_text(anonymized_response, entity_map):
    """Replace ``<PII_LABEL_INDEX>`` placeholders with their original strings.

    Args:
        anonymized_response: Text that may contain placeholders.
        entity_map: Mapping of label to the list of original strings, where
            placeholder index ``n`` refers to element ``n - 1``. May be
            empty or ``None``.

    Returns:
        The text with every resolvable placeholder substituted. A placeholder
        whose label is unknown or whose index is out of range is left as-is.
    """
    if not entity_map:
        # Nothing to substitute; also avoids a TypeError on a None map.
        return anonymized_response

    def replace_match(match):
        label = match.group(1)
        idx = int(match.group(2)) - 1  # placeholders are 1-based
        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        return match.group(0)

    return _PLACEHOLDER_PATTERN.sub(replace_match, anonymized_response)
 # ----------------------------------------------------------------------------
 def anonymize_fn(original_text):
     """Validate *original_text*, anonymize it, and report a status message.

     Returns:
         A ``(anonymized_text, entity_map, status)`` tuple. When validation
         fails, the first two items are ``""`` and ``{}`` and the status
         carries the validation error.
     """
     try:
         user_text = validate_input({"text": original_text})
     except ValueError as err:
         return "", {}, f"Validation error: {err}"
     anonymized_text, entity_map = anonymize_text_long(user_text)
     return anonymized_text, entity_map, "Успешно анонимизировано!"
 def deanonymize_fn(anonymized_llm_response, entity_map):
     if not anonymized_llm_response.strip():
         return "", "Вставьте анонимизированный текст."
     result = deanonymize_text(anonymized_llm_response, entity_map)
     return result, "Успешно деанонимизировано!"
 md_text = """# Анонимизатор психотерапевтических сессий
+Вставьте текст в раздел \"Исходный текст\", чтобы анонимизировать сензитивные данные.
 """
 with gr.Blocks() as demo:
             )
             button_anon = gr.Button("Анонимизировать")
             entity_map_state = gr.State()
             message_out = gr.Textbox(label="Status", interactive=False)
             button_anon.click(
             )
 if __name__ == "__main__":
+    demo.launch()