File size: 5,258 Bytes
eb57aa1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.malt import MaltParser
import os
import tempfile
from .stemmer import FindStems
from .postagger import POSTagger
from .tokenizer import Tokenizer
from .normalizer import Normalizer


class MyMaltParser(MaltParser):
    """Thin interface to the Malt dependency parser using a pre-trained model.

    Unlike ``nltk.parse.malt.MaltParser`` this wrapper accepts already
    POS-tagged sentences and handles the CONLL temp-file round trip itself.
    """

    def __init__(self, parser_dirname, model_filename, tagger, stemmer):
        """
        :param parser_dirname: path to the maltparser directory containing
            ``maltparser-1.9.2.jar``
        :type parser_dirname: str
        :param model_filename: name of the pre-trained model with .mco file
            extension (see http://www.maltparser.org/mco/mco.html and
            http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: the tagger used to POS tag a raw string before
            formatting to CONLL format; should behave like ``nltk.pos_tag``
        :type tagger: function
        :param stemmer: object exposing ``convert_to_stem(word, tag)``; when
            falsy, the lemma column is always written as ``'_'``
        """
        # NOTE: deliberately does not call MaltParser.__init__ -- only the
        # attributes used by parse_tagged_sent() and _execute() are set up.
        self.working_dir = parser_dirname
        self.mco = model_filename
        self.pos_tagger = tagger
        self._malt_bin = os.path.join(parser_dirname, 'maltparser-1.9.2.jar')
        self.stemmer = stemmer.convert_to_stem if stemmer else lambda w, t: '_'

    def parse_tagged_sent(self, sentences, verbose=False, top_relation_label='null'):
        """Dependency-parse already POS-tagged sentences.

        :param sentences: iterable of sentences, each a list of
            ``(word, tag)`` pairs
        :param verbose: forwarded to ``MaltParser._execute``
        :param top_relation_label: unused; kept for interface compatibility
        :return: list of ``DependencyGraph``, one per input sentence
        :raises Exception: if the external maltparser process exits non-zero
        """
        tmp_file_address = tempfile.gettempdir()
        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll', dir=tmp_file_address, delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll', dir=tmp_file_address, delete=False)
        # Only the output file's *name* is passed to java; close our handle
        # right away so the external process can open it on any OS.
        output_file.close()

        try:
            for sentence in sentences:
                for i, (word, tag) in enumerate(sentence, start=1):
                    word = word.strip()
                    if not word:
                        word = '_'  # CONLL convention for empty fields
                    input_file.write(('\t'.join([str(i), word.replace(' ', '_'), self.stemmer(word, tag).replace(' ', '_'), tag, tag, '_', '0', 'ROOT', '_', '_', '\n'])).encode('utf8'))
                input_file.write('\n'.encode('utf8'))  # blank line terminates a sentence
            input_file.close()

            cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir, '-c', self.mco, '-i', input_file.name, '-o', output_file.name, '-m', 'parse']
            if self._execute(cmd, verbose) != 0:
                raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd)))

            dependency_graph = []
            with open(output_file.name, encoding='utf-8') as infile:
                # Parsed sentences in the output are separated by blank lines.
                for sent in infile.read().strip().split('\n\n'):
                    dependency_graph.append(DependencyGraph(sent))
            return dependency_graph
        finally:
            # Always remove the temp files -- the original leaked them on any
            # failure and closed input_file twice on success.
            input_file.close()
            os.remove(input_file.name)
            os.remove(output_file.name)


class DependencyParser:
    """High-level dependency parser.

    Wires together a normalizer, tokenizer, stemmer and POS tagger and feeds
    tagged sentences to a pre-trained MaltParser model shipped under
    ``resource/dependency_parser``.
    """

    def __init__(self, _normalizer=None, _tokenizer=None, _stemmer=None, _tagger=None):
        """
        :param _normalizer: text normalizer; defaults to ``Normalizer()``
        :param _tokenizer: word tokenizer; defaults to ``Tokenizer()``
        :param _stemmer: stemmer; defaults to ``FindStems()``
        :param _tagger: tagging function taking a token list and returning
            ``(word, tag)`` pairs; defaults to the wapiti ``POSTagger.parse``
        """
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        self.my_normalizer = Normalizer() if _normalizer is None else _normalizer
        self.my_tokenizer = Tokenizer() if _tokenizer is None else _tokenizer
        self.my_stemmer = FindStems() if _stemmer is None else _stemmer

        if _tagger is None:
            self.my_tagger = POSTagger(tagging_model="wapiti").parse
        else:
            self.my_tagger = _tagger

        self.parser = MyMaltParser(parser_dirname=self.dir_path + 'resource/dependency_parser',
                                   model_filename='total_dep_parser.mco',
                                   tagger=self.my_tagger,
                                   stemmer=self.my_stemmer)

    def make_trainable_corpus(self, in_file, out_file):
        """Re-tag a CONLL corpus with this parser's POS tagger.

        Reads *in_file* (CONLL format, sentences separated by blank lines),
        replaces columns 3 and 4 (coarse/fine POS) of every token line with
        the tag produced by ``self.my_tagger``, writes the result to
        *out_file* and also returns it.

        :param in_file: path of the input CONLL corpus
        :param out_file: path the re-tagged corpus is written to
        :return: the re-tagged corpus as a single string
        """
        tagger = self.my_tagger
        # Read/write explicitly as UTF-8: the original relied on the platform
        # default encoding, which breaks Persian corpora on e.g. Windows.
        with open(in_file, 'r', encoding='utf-8') as infile:
            content = infile.read().strip().split('\n\n')
            for i, sent in enumerate(content):
                if not sent:
                    continue
                lines = sent.split('\n')
                sent_tokens = [line.split('\t')[1] for line in lines]  # column 1 = word form
                tags = [tagged[1] for tagged in tagger(sent_tokens)]
                for j, line in enumerate(lines):
                    fields = line.split('\t')
                    fields[3] = tags[j]  # coarse-grained POS column
                    fields[4] = tags[j]  # fine-grained POS column
                    lines[j] = '\t'.join(fields)
                content[i] = '\n'.join(lines)
        content = '\n\n'.join(content)
        with open(out_file, 'w', encoding='utf-8') as outfile:
            outfile.write(content)
        return content

    def parse_sents(self, sents, verbose=False):
        """Tokenize, POS-tag and dependency-parse raw sentences.

        :param sents: iterable of raw sentence strings
        :param verbose: forwarded to the underlying MaltParser call
        :return: list of ``DependencyGraph``, one per sentence
        """
        tagger = self.my_tagger
        tagged_sents = [tagger(self.my_tokenizer.tokenize_words(sent)) for sent in sents]
        return self.parser.parse_tagged_sent(tagged_sents, verbose)