# src/ner.py — Named Entity Recognition (NER) using spaCy and CoNLL-2003
"""Train a spaCy NER pipeline on CoNLL-2003 and expose a prediction helper.

Run this file as a script to download the dataset, train a blank English
pipeline with a single ``ner`` component, and save it to ``models/ner``.
Importing the module has no side effects; ``predict_entities`` loads the
saved pipeline lazily on first use.
"""

import random
from functools import lru_cache
from pathlib import Path

import spacy
from spacy.training import Example
from spacy.util import minibatch
from datasets import load_dataset

MODEL_DIR = "models/ner"
LABELS = ("ORG", "PER", "LOC", "MISC")
N_ITERATIONS = 10
BATCH_SIZE = 8
DROPOUT = 0.3


def _entity_offsets(words, tags):
    """Convert per-token IOB tags into character-offset entity spans.

    Args:
        words: Token strings of one sentence.
        tags: One IOB tag string per token (``"O"``, ``"B-PER"``,
            ``"I-PER"``, ...), same length as ``words``.

    Returns:
        A list of ``(start_char, end_char, label)`` tuples whose offsets
        index into ``" ".join(words)``.
    """
    spans = []
    cursor = 0
    open_start = open_end = open_label = None
    for word, tag in zip(words, tags):
        start, end = cursor, cursor + len(word)
        cursor = end + 1  # skip the single space that joins tokens
        prefix, _, label = tag.partition("-")

        if prefix == "I" and label == open_label:
            # Same entity keeps going: just extend its right edge.
            open_end = end
            continue

        if open_label is not None:
            spans.append((open_start, open_end, open_label))
            open_label = None

        # An "I-" tag with no matching open entity is treated as a start,
        # so IOB1-style input does not silently drop entities.
        if prefix in ("B", "I") and label:
            open_start, open_end, open_label = start, end, label

    if open_label is not None:
        spans.append((open_start, open_end, open_label))
    return spans


def build_train_data(split):
    """Turn a CoNLL-2003 split into ``(text, annotations)`` training pairs.

    Assumes the Hugging Face ``conll2003`` schema: each item has ``tokens``
    (list of str) and ``ner_tags`` (list of int class ids), and the tag
    names are available as ``split.features["ner_tags"].feature.names``
    — TODO confirm against the installed dataset version.

    Args:
        split: One split of the loaded dataset, e.g. ``dataset["train"]``.

    Returns:
        A list of ``(text, annotations)`` tuples, where ``text`` is the
        space-joined tokens and ``annotations`` holds the gold ``words``,
        ``spaces`` and character-offset ``entities``.
    """
    tag_names = split.features["ner_tags"].feature.names
    train_data = []
    for item in split:
        words = item["tokens"]
        if not words:
            continue  # nothing to learn from an empty sentence
        tags = [tag_names[tag_id] for tag_id in item["ner_tags"]]
        annotations = {
            # Gold tokenization, so entity offsets always fall on token
            # boundaries even where spaCy's tokenizer splits differently.
            "words": words,
            # No trailing space after the last token, so the reference
            # text is exactly " ".join(words).
            "spaces": [True] * (len(words) - 1) + [False],
            "entities": _entity_offsets(words, tags),
        }
        train_data.append((" ".join(words), annotations))
    return train_data


def train_ner(train_data, n_iter=N_ITERATIONS, batch_size=BATCH_SIZE,
              drop=DROPOUT):
    """Train a blank English pipeline with a single NER component.

    Args:
        train_data: ``(text, annotations)`` pairs as produced by
            ``build_train_data``.
        n_iter: Number of passes over the training data.
        batch_size: Number of examples per ``nlp.update`` call.
        drop: Dropout rate passed to ``nlp.update``.

    Returns:
        The trained ``spacy.Language`` pipeline.
    """
    nlp = spacy.blank("en")

    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    for label in LABELS:
        ner.add_label(label)

    # Build the Example objects once instead of once per epoch.
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in train_data
    ]

    # Train only the NER component (a no-op for a blank pipeline, but it
    # keeps the intent explicit if more components are added later).
    with nlp.select_pipes(enable="ner"):
        optimizer = nlp.initialize(lambda: examples)
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for batch in minibatch(examples, size=batch_size):
                # One update per batch; updating example-by-example would
                # make the minibatching pointless.
                nlp.update(batch, drop=drop, losses=losses, sgd=optimizer)
            print(f"Losses at iteration {itn}: {losses}")
    return nlp


@lru_cache(maxsize=1)
def _load_model():
    """Load the saved pipeline from ``MODEL_DIR`` once and reuse it."""
    return spacy.load(MODEL_DIR)


def predict_entities(text):
    """Return the entities found in ``text`` by the saved NER model.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples.
    """
    doc = _load_model()(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


def main():
    """Download CoNLL-2003, train the NER model, and save it to disk."""
    print("Training NER model...")

    print("Loading CoNLL-2003 dataset...")
    dataset = load_dataset("conll2003")
    train_data = build_train_data(dataset["train"])

    nlp = train_ner(train_data)

    # Make sure the parent directory exists before handing the path to
    # spaCy, so a fresh checkout without "models/" does not fail on save.
    Path(MODEL_DIR).parent.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(MODEL_DIR)
    # Drop any pipeline cached from a previous save in this process.
    _load_model.cache_clear()
    print("NER model saved to models/ner")


if __name__ == "__main__":
    main()