import chardet
import spacy
from spacy.cli import download

# ------------------------
# CONFIGURATION
# ------------------------
custom_spacy_config = {
    "gliner_model": "urchade/gliner_multi_pii-v1",
    "labels": [
        "person",
        "organization",
        "company",
        "country",
        "medical condition",
        "credit card brand",
    ],
    "threshold": 0.39,
    "style": "ent",
}

# Encoding used whenever chardet cannot determine one (or reports a name
# that Python has no codec for).
_FALLBACK_ENCODING = "utf-8"

# Load SpaCy and add GLiNER to the pipeline.
# spacy.load raises OSError when the model package is not installed; in that
# case download it once and retry.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

nlp.add_pipe("gliner_spacy", config=custom_spacy_config)


def detect_encoding(file_bytes: bytes) -> str:
    """Return the encoding name chardet guesses for *file_bytes*.

    Falls back to ``"utf-8"`` when chardet reports no encoding. chardet's
    result dict always carries an ``"encoding"`` key and sets it to ``None``
    when detection fails (e.g. for empty input), so a ``dict.get`` default
    alone would never apply; the ``or`` handles both a missing key and a
    ``None`` value.
    """
    result = chardet.detect(file_bytes)
    return result.get("encoding") or _FALLBACK_ENCODING


def extract_entities_from_file(file_path):
    """Read a file, decode it, and return the entities found by ``nlp``.

    Args:
        file_path: Path of the file to read; passed straight to ``open``.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in
        ``doc.ents`` of the processed document.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        file_bytes = f.read()

    encoding = detect_encoding(file_bytes)
    try:
        # errors="ignore" drops undecodable bytes instead of raising.
        text = file_bytes.decode(encoding, errors="ignore")
    except LookupError:
        # chardet reported an encoding name Python has no codec for.
        text = file_bytes.decode(_FALLBACK_ENCODING, errors="ignore")

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]