|
|
|
|
|
from pathlib import Path

import spacy
from spacy.training import Example
from spacy.util import minibatch

from datasets import load_dataset
|
|
|
|
|
|
print("Loading CoNLL-2003 dataset...")
dataset = load_dataset("conll2003")

# Map integer tag ids to their BIO string names (e.g. 3 -> "B-ORG").
tag_names = dataset["train"].features["ner_tags"].feature.names


def _bio_to_spans(words, tags):
    """Convert parallel token / BIO-tag lists into character-offset spans.

    Returns [(start, end, label), ...] measured over the space-joined
    text of *words*. Handles both IOB2 (entities start with "B-") and
    IOB1 (entities may start with "I-"), which the raw CoNLL-2003 tags use.
    """
    spans = []
    offset = 0
    current = None  # (start, end, label) of the entity being built
    for word, tag in zip(words, tags):
        start, end = offset, offset + len(word)
        offset = end + 1  # +1 for the joining space
        if tag.startswith("B-"):
            if current:
                spans.append(current)
            current = (start, end, tag[2:])
        elif tag.startswith("I-"):
            if current and current[2] == tag[2:]:
                # Continuation of the current entity.
                current = (current[0], end, current[2])
            else:
                # IOB1-style start (or type change): begin a new entity.
                if current:
                    spans.append(current)
                current = (start, end, tag[2:])
        else:  # "O" tag ends any open entity
            if current:
                spans.append(current)
            current = None
    if current:
        spans.append(current)
    return spans


train_data = []
for item in dataset["train"]:
    words = item["tokens"]
    tags = [tag_names[t] for t in item["ner_tags"]]
    # BUG FIX: annotations were previously an empty dict ({}), so the NER
    # model had no gold entities to learn from. Supply character-offset
    # entity spans in the format Example.from_dict expects.
    train_data.append((" ".join(words), {"entities": _bio_to_spans(words, tags)}))
|
|
|
|
|
|
# Start from a blank English pipeline and make sure it has an NER component.
nlp = spacy.blank("en")

ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

# Register the four CoNLL-2003 entity types with the component.
for entity_label in ("ORG", "PER", "LOC", "MISC"):
    ner.add_label(entity_label)

# Every pipe except NER will be disabled while training.
other_pipes = [name for name in nlp.pipe_names if name != "ner"]
|
|
|
|
# Train only the NER component; all other pipes stay disabled for the
# duration of the loop. select_pipes() is the spaCy v3 replacement for
# the deprecated disable_pipes().
with nlp.select_pipes(disable=other_pipes):
    # spaCy v3: initialize() replaces the deprecated begin_training().
    optimizer = nlp.initialize()
    for itn in range(10):
        losses = {}
        for batch in minibatch(train_data, size=8):
            # Build one Example per (text, annotations) pair and update on
            # the whole batch at once -- the previous code called
            # nlp.update() once per example, which defeated minibatching.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.3, losses=losses, sgd=optimizer)
        print(f"Losses at iteration {itn}: {losses}")

# Language.to_disk() only creates the leaf directory, so make sure the
# parent "models" directory exists first.
Path("models").mkdir(parents=True, exist_ok=True)
nlp.to_disk("models/ner")
print("NER model saved to models/ner")
|
|
|
|
|
|
|
|
def predict_entities(text):
    """Run the saved NER model over *text*.

    Returns a list of (entity_text, entity_label) tuples. Note that the
    model is re-loaded from disk on every call.
    """
    model = spacy.load("models/ner")
    analyzed = model(text)
    entities = []
    for ent in analyzed.ents:
        entities.append((ent.text, ent.label_))
    return entities
|
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): this guard is vestigial -- all of the dataset loading
    # and training above executes at import time, not under this guard;
    # only this message is guarded. Consider moving the training steps
    # into a main() function called from here.
    print("Training NER model...")