# src/ner.py — Named Entity Recognition (NER) using spaCy and CoNLL-2003
import random
from pathlib import Path
import spacy
from spacy.training import Example
from spacy.util import minibatch
from datasets import load_dataset
# Load CoNLL-2003 dataset
print("Loading CoNLL-2003 dataset...")
dataset = load_dataset("conll2003")
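# Each record carries "tokens" (a list of strings) and "ner_tags" (integer-coded
# IOB2 labels over PER/ORG/LOC/MISC), e.g. ["EU", "rejects", "German", ...]
# tagged [B-ORG, O, B-MISC, ...]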
# Prepare training data in spaCy format: (text, {"entities": [(start, end, label)]}).
# Decode the integer ner_tags to IOB2 strings, then merge each B-/I- run into a
# single entity span with character offsets into the space-joined text.
tag_names = dataset["train"].features["ner_tags"].feature.names
train_data = []
for item in dataset["train"]:
    text, entities, offset, prev = " ".join(item["tokens"]), [], 0, "O"
    for word, tag_id in zip(item["tokens"], item["ner_tags"]):
        tag = tag_names[tag_id]
        start, end = offset, offset + len(word)
        if tag.startswith("B-"):
            entities.append([start, end, tag[2:]])
        elif tag.startswith("I-") and prev != "O":
            entities[-1][1] = end  # extend the open entity through this token
        offset, prev = end + 1, tag  # +1 for the joining space
    train_data.append((text, {"entities": [tuple(e) for e in entities]}))
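# For instance, the first training sentence becomes:
# ("EU rejects German call to boycott British lamb .",
#  {"entities": [(0, 2, "ORG"), (11, 17, "MISC"), (34, 41, "MISC")]})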
# Create a blank English pipeline (nothing pretrained; NER is trained from scratch)
nlp = spacy.blank("en")
# Create NER pipe
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner")
else:
ner = nlp.get_pipe("ner")
# Add labels
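# (the four CoNLL-2003 entity types: person, organization, location, miscellaneous)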
for label in ["ORG", "PER", "LOC", "MISC"]:
    ner.add_label(label)
# Disable other pipes to train only NER (a blank pipeline has none, but this
# guards the loop if more components are added later)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    print("Training NER model...")
    for itn in range(10):
        losses = {}
        random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            # Build one Example per (text, annotations) pair and take a single
            # optimizer step per minibatch instead of per sentence
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.3, losses=losses, sgd=optimizer)
        print(f"Losses at iteration {itn}: {losses}")
# Save model (Language.to_disk creates the leaf directory but not its parents)
Path("models/ner").parent.mkdir(parents=True, exist_ok=True)
nlp.to_disk("models/ner")
print("NER model saved to models/ner")
# Prediction helper; note this reloads the model from disk on every call, so
# hoist the spacy.load() out if you call it in a loop
def predict_entities(text):
    nlp = spacy.load("models/ner")
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]
if __name__ == "__main__":
    # Quick smoke test of the freshly trained model
    print(predict_entities("EU rejects German call to boycott British lamb ."))