File size: 1,037 Bytes
d1923ad
 
9d25ef8
f3df499
 
 
 
 
 
 
 
 
 
 
 
 
d1923ad
 
9d25ef8
 
 
 
 
f3df499
d1923ad
 
 
 
 
34abba7
 
 
d1923ad
 
 
 
 
34abba7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import chardet
import spacy
from spacy.cli import download

# ------------------------
# CONFIGURATION
# ------------------------
# Settings handed to the "gliner_spacy" pipeline component: which GLiNER
# checkpoint to use, the entity labels to look for, the score threshold,
# and the output style.
custom_spacy_config = dict(
    gliner_model="urchade/gliner_multi_pii-v1",
    labels=[
        "person",
        "organization",
        "company",
        "country",
        "medical condition",
        "credit card brand",
    ],
    threshold=0.39,
    style="ent",
)

# Build the shared pipeline: the large English spaCy model with the GLiNER
# component added to it. The model is downloaded first if loading fails.
def _load_base_model(model_name):
    """Return the spaCy pipeline *model_name*, downloading it on OSError."""
    try:
        return spacy.load(model_name)
    except OSError:
        # Loading failed; fetch the package and retry once.
        download(model_name)
        return spacy.load(model_name)


nlp = _load_base_model("en_core_web_lg")
nlp.add_pipe("gliner_spacy", config=custom_spacy_config)

def detect_encoding(file_bytes):
    """Guess the text encoding of *file_bytes* using chardet.

    Args:
        file_bytes: Raw bytes to inspect.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``"utf-8"``
        when chardet reports no encoding.
    """
    result = chardet.detect(file_bytes)
    # chardet includes the "encoding" key even when detection fails, with a
    # value of None, so a .get() default alone would never be used. Falling
    # back with `or` guarantees callers always receive a usable name.
    return result.get("encoding") or "utf-8"

def extract_entities_from_file(file_path):
    """Extract named entities from the file at *file_path*.

    The file is read as raw bytes, decoded with the encoding guessed by
    ``detect_encoding`` (undecodable bytes are dropped), and run through the
    module-level ``nlp`` pipeline.

    Args:
        file_path: Path of the file to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entity in
        the processed document's ``ents``.
    """
    with open(file_path, "rb") as f:
        file_bytes = f.read()

    # Detection can come back empty (e.g. for an empty file); decoding with
    # None would raise TypeError, so fall back to UTF-8.
    encoding = detect_encoding(file_bytes) or "utf-8"
    try:
        text = file_bytes.decode(encoding, errors="ignore")
    except LookupError:
        # The detected name may not correspond to a codec Python ships;
        # degrade to UTF-8 instead of aborting the extraction.
        text = file_bytes.decode("utf-8", errors="ignore")

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]