from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.malt import MaltParser
import os
import tempfile
from .stemmer import FindStems
from .postagger import POSTagger
from .tokenizer import Tokenizer
from .normalizer import Normalizer
class MyMaltParser(MaltParser):
def __init__(self, parser_dirname, model_filename, tagger, stemmer):
"""
An interface for parsing with the Malt Parser.
:param parser_dirname: The path to the maltparser directory that
contains the maltparser-1.x.jar
:type parser_dirname: str
:param model_filename: The name of the pre-trained model with .mco file
extension. If provided, training will not be required.
(see http://www.maltparser.org/mco/mco.html and
see http://www.patful.com/chalk/node/185)
:type model_filename: str
:param tagger: The tagger used to POS tag the raw string before
formatting to CONLL format. It should behave like `nltk.pos_tag`
:type tagger: function
        :param stemmer: a stemmer object whose ``convert_to_stem`` method
            returns the stem of a word (e.g. ``FindStems``)
        :type stemmer: object
"""
self.working_dir = parser_dirname
self.mco = model_filename
self.pos_tagger = tagger
self._malt_bin = os.path.join(parser_dirname, 'maltparser-1.9.2.jar')
        # Fall back to the '_' placeholder for the lemma column when no
        # stemmer is supplied.
        self.stemmer = stemmer.convert_to_stem if stemmer else (lambda w, t: '_')
def parse_tagged_sent(self, sentences, verbose=False, top_relation_label='null'):
        tmp_file_address = tempfile.gettempdir()
        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll', dir=tmp_file_address, delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll', dir=tmp_file_address, delete=False)
        # Write each tagged sentence as a block of 10-column CoNLL lines,
        # with a blank line between sentences.
        for sentence in sentences:
            for i, (word, tag) in enumerate(sentence, start=1):
                word = word.strip()
                if not word:
                    word = '_'
                fields = [str(i), word.replace(' ', '_'),
                          self.stemmer(word, tag).replace(' ', '_'),
                          tag, tag, '_', '0', 'ROOT', '_', '_']
                input_file.write(('\t'.join(fields) + '\n').encode('utf8'))
            input_file.write('\n'.encode('utf8'))
        input_file.close()
        # Run MaltParser in parse mode over the temporary CoNLL file.
        cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir, '-c', self.mco,
               '-i', input_file.name, '-o', output_file.name, '-m', 'parse']
        if self._execute(cmd, verbose) != 0:
            raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd)))
        # Read the parsed output back in; blank lines separate sentences.
        dependency_graphs = []
        with open(output_file.name, encoding='utf-8') as infile:
            content = infile.read().strip().split('\n\n')
        for sent in content:
            dependency_graphs.append(DependencyGraph(sent))
        output_file.close()
        os.remove(input_file.name)
        os.remove(output_file.name)
        return dependency_graphs
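
    # Hypothetical usage sketch for MyMaltParser (the paths, model name, and
    # tagged example below are illustrative assumptions, not fixtures of the
    # library):
    #
    #   parser = MyMaltParser(parser_dirname='resource/dependency_parser',
    #                         model_filename='total_dep_parser.mco',
    #                         tagger=POSTagger(tagging_model="wapiti").parse,
    #                         stemmer=FindStems())
    #   graphs = parser.parse_tagged_sent([[('من', 'PRO'), ('رفتم', 'V')]])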
class DependencyParser:
def __init__(self, _normalizer=None, _tokenizer=None, _stemmer=None, _tagger=None):
self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
if _normalizer is None:
self.my_normalizer = Normalizer()
else:
self.my_normalizer = _normalizer
if _tokenizer is None:
self.my_tokenizer = Tokenizer()
else:
self.my_tokenizer = _tokenizer
if _stemmer is None:
self.my_stemmer = FindStems()
else:
self.my_stemmer = _stemmer
if _tagger is None:
self.my_tagger = POSTagger(tagging_model="wapiti").parse
else:
self.my_tagger = _tagger
self.parser = MyMaltParser(parser_dirname=self.dir_path + 'resource/dependency_parser',
model_filename='total_dep_parser.mco',
tagger=self.my_tagger,
stemmer=self.my_stemmer)
    def make_trainable_corpus(self, in_file, out_file):
        """Re-tag a CoNLL-formatted corpus with this parser's POS tagger so
        that it can be used to train a MaltParser model."""
        tagger = self.my_tagger
        with open(in_file, 'r', encoding='utf-8') as infile:
            content = infile.read().strip().split('\n\n')
        for i, sent in enumerate(content):
            if len(sent) == 0:
                continue
            lines = sent.split('\n')
            sent_tokens = [x.split('\t')[1] for x in lines]
            tagged_sent = tagger(sent_tokens)
            tags = [x[1] for x in tagged_sent]
            # Overwrite the coarse (CPOSTAG) and fine (POSTAG) columns.
            for j, line in enumerate(lines):
                line = line.split('\t')
                line[3] = tags[j]
                line[4] = tags[j]
                lines[j] = '\t'.join(line)
            content[i] = '\n'.join(lines)
        content = '\n\n'.join(content)
        with open(out_file, 'w', encoding='utf-8') as outfile:
            outfile.write(content)
        return content
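
    # Hypothetical usage (file names are placeholders): re-tag an existing
    # CoNLL training file before training a new .mco model with MaltParser:
    #
    #   dp = DependencyParser()
    #   dp.make_trainable_corpus('train.conll', 'train_retagged.conll')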
    def parse_sents(self, sents, verbose=False):
        # Tokenize and POS tag each raw sentence, then parse the tagged
        # sentences with the wrapped MaltParser.
        tagger = self.my_tagger
        tagged_sents = [tagger(self.my_tokenizer.tokenize_words(sent)) for sent in sents]
        return self.parser.parse_tagged_sent(tagged_sents, verbose)
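
# A minimal, hypothetical smoke test (not part of the library's documented
# API). It assumes Java is on the PATH and that maltparser-1.9.2.jar plus the
# pre-trained total_dep_parser.mco model live under resource/dependency_parser
# next to this module.
if __name__ == '__main__':
    my_parser = DependencyParser()
    # "This is a test sentence." in Persian.
    graphs = my_parser.parse_sents(['این یک جمله آزمایشی است.'])
    for graph in graphs:
        print(graph.to_conll(10))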