# mana-tts/Parsivar/stemmer.py
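"""Lexicon-based Persian (Farsi) stemmer.

FindStems strips verb conjugation affixes and noun plural, possessive and
derivational affixes, validating each candidate against pickled stem
lexicons. Verb stems are returned as a past and present pair joined by "&".
"""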
import os
from .data_helper import DataHelper
from .normalizer import Normalizer
class FindStems:
def __init__(self):
self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
self.noun_lex_path = self.dir_path + "resource/stemmer/stem_lex.pckl"
self.verb_lex_path = self.dir_path + "resource/stemmer/verbStemDict.pckl"
self.verb_tense_map_path = self.dir_path + "resource/stemmer/stem_verbMap.pckl"
self.irregular_nouns_path = self.dir_path + "resource/stemmer/stem_irregularNounDict.pckl"
self.prefix_list_path = self.dir_path + "resource/stemmer/pishvand.txt"
self.postfix_list_path = self.dir_path + "resource/stemmer/pasvand.txt"
self.verb_tense_file_path = self.dir_path + "resource/stemmer/verb_tense.txt"
self.mokasar_noun_path = self.dir_path + "resource/stemmer/mokasar.txt"
self.data_helper = DataHelper()
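        # Load the cached pickled lexicons if they exist; otherwise build them
        # from the raw resource files and cache the result for later runs.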
        if (os.path.isfile(self.noun_lex_path) and os.path.isfile(self.verb_lex_path)
                and os.path.isfile(self.verb_tense_map_path) and os.path.isfile(self.irregular_nouns_path)):
self.noun_lexicon = self.data_helper.load_var(self.noun_lex_path)
self.verb_lexicon = self.data_helper.load_var(self.verb_lex_path)
self.verb_tense_map = self.data_helper.load_var(self.verb_tense_map_path)
self.irregular_nouns = self.data_helper.load_var(self.irregular_nouns_path)
self.verb_p2f_map, self.verb_f2p_map = self.verb_tense_map[0], self.verb_tense_map[1]
else:
self.mynormalizer = Normalizer()
self.noun_lexicon, self.verb_lexicon, \
self.verb_tense_map, self.irregular_nouns =\
self.data_helper.build_stem_dictionary(self.mynormalizer,
self.verb_tense_file_path,
self.mokasar_noun_path)
self.data_helper.save_var(save_path=self.noun_lex_path, variable=self.noun_lexicon)
self.data_helper.save_var(save_path=self.verb_lex_path, variable=self.verb_lexicon)
self.data_helper.save_var(save_path=self.verb_tense_map_path, variable=self.verb_tense_map)
self.data_helper.save_var(save_path=self.irregular_nouns_path, variable=self.irregular_nouns)
self.verb_p2f_map, self.verb_f2p_map = self.verb_tense_map[0], self.verb_tense_map[1]
        # read the prefix (pishvand) and postfix (pasvand) lists, one affix per line
        self.prefix_list = set()
        with open(self.prefix_list_path, "r", encoding='utf-8') as pishvand_input_file:
            for line in pishvand_input_file:
                self.prefix_list.add(line.strip())

        self.postfix_list = set()
        with open(self.postfix_list_path, "r", encoding='utf-8') as pasvand_input_file:
            for line in pasvand_input_file:
                self.postfix_list.add(line.strip())
    def select_candidate(self, candidate_list, lexicon_set=None):
        """Pick the shortest candidate; with a lexicon_set, the longest candidate it contains."""
        length = 1000  # sentinel: nothing selected yet
        selected = ""
        for tmp_candidate in candidate_list:
            if lexicon_set is None and len(tmp_candidate) < length:
                selected = tmp_candidate
                length = len(tmp_candidate)
            elif lexicon_set is not None and tmp_candidate in lexicon_set:
                if length == 1000:
                    selected = tmp_candidate
                    length = len(tmp_candidate)
                elif len(tmp_candidate) > length:
                    selected = tmp_candidate
                    length = len(tmp_candidate)
        return selected
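    # \u200c is the zero-width non-joiner (ZWNJ) that separates Persian affixes
    # from their host word; it is stripped before matching affixes.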
def is_prefix(self, word, prefix):
word = word.strip("\u200c")
return word.startswith(prefix)
def is_postfix(self, word, post):
word = word.strip("\u200c")
return word.endswith(post)
    def remove_prefixes(self, word, prefix):
        """Return every stem candidate produced by stripping one matching prefix."""
        word = word.strip("\u200c")
        candidate_stems = set()
        for el in prefix:
            if word.startswith(el):
                if len(el) > 0:
                    tmp = word[len(el):].strip().strip('\u200c')
                else:
                    tmp = word  # the empty prefix keeps the word itself
                candidate_stems.add(tmp)
        return candidate_stems

    def remove_postfixes(self, word, postfix):
        """Return every stem candidate produced by stripping one matching postfix."""
        word = word.strip("\u200c")
        candidate_stems = set()
        for el in postfix:
            if word.endswith(el):
                if len(el) > 0:
                    tmp = word[:-len(el)].strip().strip('\u200c')
                else:
                    tmp = word  # the empty postfix keeps the word itself
                candidate_stems.add(tmp)
        return candidate_stems
    def map_irregular_noun(self, word):
        """Map a broken-plural (mokasar) noun to its singular form, if known."""
        return self.irregular_nouns.get(word, word)
    def convert_to_stem(self, word, word_pos=None):
        """Return the stem of word; verb stems are returned as "past&present"."""
        # direct lexicon hits need no affix stripping
        if word in self.noun_lexicon:
            if word_pos is None or word_pos == 'N':
                return self.map_irregular_noun(word)
        elif word in self.verb_lexicon:
            if word_pos is None or word_pos == 'V':
                if word in self.verb_f2p_map:
                    stem = self.verb_f2p_map[word] + "&" + word
                elif word in self.verb_p2f_map:
                    stem = word + "&" + self.verb_p2f_map[word]
                else:
                    stem = word
                return stem
        # verb handling: try each tense's affix pattern in turn
if word_pos is None or word_pos == "V":
            # past continuous (ماضی مستمر)
candidate_list = self.remove_prefixes(word, ["داشتم", "داشتی", "داشت",
"داشتیم", "داشتید", "داشتند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["می"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # present continuous (مضارع مستمر)
candidate_list = self.remove_prefixes(word, ["دارم", "داری", "دارد",
"داریم", "دارید", "دارند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["می"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", "د"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_f2p_map:
stem = self.verb_f2p_map[new_word] + "&" + new_word
return stem
            # present indicative (مضارع اخباری)
candidate_list = self.remove_prefixes(word, ["می", "نمی", "همی"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", "د"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_f2p_map:
stem = self.verb_f2p_map[new_word] + "&" + new_word
return stem
            # past imperfect (ماضی استمراری)
candidate_list = self.remove_prefixes(word, ["می", "نمی", "همی"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # past perfect and past subjunctive (ماضی بعید و التزامی)
candidate_list = self.remove_postfixes(word, ["یم", "ید", "ند",
"م", "ی", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["بود", "باشد", "باش"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # present perfect (ماضی نقلی)
candidate_list = self.remove_postfixes(word, ["ام", "ای", "است",
"ایم", "اید", "اند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # remote past perfect (ماضی ابعد)
candidate_list = self.remove_postfixes(word, ["ام", "ای", "است",
"ایم", "اید", "اند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["بود"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
                                new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # future (آینده)
candidate_list = self.remove_prefixes(word, ["خواه", "نخواه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
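                # in the future tense the person ending follows خواه and
                # precedes the past stem, so it is stripped here as a prefix
                # of the remainder, not as a postfix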
candidate_list = self.remove_prefixes(new_word, ["یم", "ید", "ند",
"م", "ی", "د"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # present subjunctive and imperative (مضارع التزامی و امر)
candidate_list = self.remove_prefixes(word, ["ب", "ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند", "م",
"ی", "د", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
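                # undo orthographic changes at the stem boundary, e.g. a
                # stem-initial آ that surfaces as یا after the ب/ن prefix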
if (self.is_prefix(new_word, "یا")) and (new_word not in self.verb_lexicon):
candidate_list = self.remove_prefixes(new_word, ["یا"])
new_word = self.select_candidate(candidate_list)
new_word = "آ" + new_word
if self.is_postfix(new_word, "آی") or self.is_postfix(new_word, "ای"):
if new_word not in self.verb_lexicon:
candidate_list = self.remove_postfixes(new_word, ["ی"])
new_word = self.select_candidate(candidate_list)
if self.is_prefix(new_word, "ی"):
candidate_list = self.remove_prefixes(new_word, ["ی"])
tmp_word = self.select_candidate(candidate_list)
if tmp_word and ("ا" + tmp_word) in self.verb_lexicon:
new_word = "ا" + tmp_word
if new_word and new_word in self.verb_lexicon:
if new_word in self.verb_f2p_map:
stem = self.verb_f2p_map[new_word] + "&" + new_word
return stem
            # simple past (ماضی ساده)
candidate_list = self.remove_postfixes(word, ["م", "ی", "",
"یم", "ید", "ند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # infinitive (حالت مصدر)
candidate_list = self.remove_postfixes(word, ["ن"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
if word_pos is None or word_pos == "N":
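            # strip noun affixes in cascades (possessives, plurals, derivational
            # postfixes, then prefixes), checking the lexicon after each pass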
            # possessive suffixes (پسوندهای مالکیت)
stem_candidate = word
candidate_list = self.remove_postfixes(word, ["م", "ت", "ش", "یم", "یت", "یش",
"یتان", "یشان", "یمان",
"مان", "تان", "شان", "ان"])
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
candidate_list = self.remove_postfixes(new_word, ["ها", "ات", "های",
"ان", "هایی", "ین"])
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
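            # a stem-final گ usually comes from a dropped ه (e.g. بچگان -> بچگ -> بچه)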
candidate_list = self.remove_postfixes(new_word, ["گ"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
new_word = new_word + "ه"
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
candidate_list = self.remove_postfixes(new_word, self.postfix_list)
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
candidate_list = self.remove_prefixes(new_word, self.prefix_list)
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
        # prefixed verbs (افعال پیشوندی)
candidate_list = self.remove_prefixes(word, ['در', 'بر', 'پر', 'باز',
'ور', 'فرو', 'فرا', 'وا'])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
if new_word:
tmp_pr = word[:-len(new_word)].strip().strip('\u200c')
new_word = self.convert_to_stem(new_word, word_pos='V')
if new_word and new_word in self.verb_lexicon:
return tmp_pr + new_word
return word
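
# Minimal usage sketch (illustrative, not part of the module): the import path
# and example words are assumptions and depend on how the package is installed.
#
#     from parsivar import FindStems
#     stemmer = FindStems()
#     stemmer.convert_to_stem("کتاب‌ها")   # a plural noun, expected stem: کتاب
#     stemmer.convert_to_stem("می‌رفتم")   # an inflected verb, expected: رفت&رو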