# mana-tts/Parsivar/stemmer.py
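"""Lexicon-based Persian (Farsi) stemmer.

FindStems strips verb conjugation affixes and noun plural, possessive and
derivational affixes, validating each candidate against pickled stem
lexicons. Verb stems are returned as a past and present pair joined by "&".
"""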
import os
from .data_helper import DataHelper
from .normalizer import Normalizer
class FindStems:
def __init__(self):
self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
self.noun_lex_path = self.dir_path + "resource/stemmer/stem_lex.pckl"
self.verb_lex_path = self.dir_path + "resource/stemmer/verbStemDict.pckl"
self.verb_tense_map_path = self.dir_path + "resource/stemmer/stem_verbMap.pckl"
self.irregular_nouns_path = self.dir_path + "resource/stemmer/stem_irregularNounDict.pckl"
self.prefix_list_path = self.dir_path + "resource/stemmer/pishvand.txt"
self.postfix_list_path = self.dir_path + "resource/stemmer/pasvand.txt"
self.verb_tense_file_path = self.dir_path + "resource/stemmer/verb_tense.txt"
self.mokasar_noun_path = self.dir_path + "resource/stemmer/mokasar.txt"
self.data_helper = DataHelper()
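        # Load the cached pickled lexicons if they exist; otherwise build them
        # from the raw resource files and cache the result for later runs.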
        if (os.path.isfile(self.noun_lex_path) and os.path.isfile(self.verb_lex_path)
                and os.path.isfile(self.verb_tense_map_path) and os.path.isfile(self.irregular_nouns_path)):
self.noun_lexicon = self.data_helper.load_var(self.noun_lex_path)
self.verb_lexicon = self.data_helper.load_var(self.verb_lex_path)
self.verb_tense_map = self.data_helper.load_var(self.verb_tense_map_path)
self.irregular_nouns = self.data_helper.load_var(self.irregular_nouns_path)
self.verb_p2f_map, self.verb_f2p_map = self.verb_tense_map[0], self.verb_tense_map[1]
else:
self.mynormalizer = Normalizer()
self.noun_lexicon, self.verb_lexicon, \
self.verb_tense_map, self.irregular_nouns =\
self.data_helper.build_stem_dictionary(self.mynormalizer,
self.verb_tense_file_path,
self.mokasar_noun_path)
self.data_helper.save_var(save_path=self.noun_lex_path, variable=self.noun_lexicon)
self.data_helper.save_var(save_path=self.verb_lex_path, variable=self.verb_lexicon)
self.data_helper.save_var(save_path=self.verb_tense_map_path, variable=self.verb_tense_map)
self.data_helper.save_var(save_path=self.irregular_nouns_path, variable=self.irregular_nouns)
self.verb_p2f_map, self.verb_f2p_map = self.verb_tense_map[0], self.verb_tense_map[1]
        # read the prefix (pishvand) and postfix (pasvand) lists, one affix per line
        self.prefix_list = set()
        with open(self.prefix_list_path, "r", encoding='utf-8') as pishvand_input_file:
            for line in pishvand_input_file:
                self.prefix_list.add(line.strip())

        self.postfix_list = set()
        with open(self.postfix_list_path, "r", encoding='utf-8') as pasvand_input_file:
            for line in pasvand_input_file:
                self.postfix_list.add(line.strip())
    def select_candidate(self, candidate_list, lexicon_set=None):
        """Pick the shortest candidate; with a lexicon_set, the longest candidate it contains."""
        length = 1000  # sentinel: nothing selected yet
        selected = ""
        for tmp_candidate in candidate_list:
            if lexicon_set is None and len(tmp_candidate) < length:
                selected = tmp_candidate
                length = len(tmp_candidate)
            elif lexicon_set is not None and tmp_candidate in lexicon_set:
                if length == 1000:
                    selected = tmp_candidate
                    length = len(tmp_candidate)
                elif len(tmp_candidate) > length:
                    selected = tmp_candidate
                    length = len(tmp_candidate)
        return selected
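    # \u200c is the zero-width non-joiner (ZWNJ) that separates Persian affixes
    # from their host word; it is stripped before matching affixes.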
def is_prefix(self, word, prefix):
word = word.strip("\u200c")
return word.startswith(prefix)
def is_postfix(self, word, post):
word = word.strip("\u200c")
return word.endswith(post)
    def remove_prefixes(self, word, prefix):
        """Return every stem candidate produced by stripping one matching prefix."""
        word = word.strip("\u200c")
        candidate_stems = set()
        for el in prefix:
            if word.startswith(el):
                if len(el) > 0:
                    tmp = word[len(el):].strip().strip('\u200c')
                else:
                    tmp = word  # the empty prefix keeps the word itself
                candidate_stems.add(tmp)
        return candidate_stems

    def remove_postfixes(self, word, postfix):
        """Return every stem candidate produced by stripping one matching postfix."""
        word = word.strip("\u200c")
        candidate_stems = set()
        for el in postfix:
            if word.endswith(el):
                if len(el) > 0:
                    tmp = word[:-len(el)].strip().strip('\u200c')
                else:
                    tmp = word  # the empty postfix keeps the word itself
                candidate_stems.add(tmp)
        return candidate_stems
    def map_irregular_noun(self, word):
        """Map a broken-plural (mokasar) noun to its singular form, if known."""
        return self.irregular_nouns.get(word, word)
    def convert_to_stem(self, word, word_pos=None):
        """Return the stem of word; verb stems are returned as "past&present"."""
        # direct lexicon hits need no affix stripping
        if word in self.noun_lexicon:
            if word_pos is None or word_pos == 'N':
                return self.map_irregular_noun(word)
        elif word in self.verb_lexicon:
            if word_pos is None or word_pos == 'V':
                if word in self.verb_f2p_map:
                    stem = self.verb_f2p_map[word] + "&" + word
                elif word in self.verb_p2f_map:
                    stem = word + "&" + self.verb_p2f_map[word]
                else:
                    stem = word
                return stem
        # verb handling: try each tense's affix pattern in turn
if word_pos is None or word_pos == "V":
            # past continuous (ماضی مستمر)
candidate_list = self.remove_prefixes(word, ["داشتم", "داشتی", "داشت",
"داشتیم", "داشتید", "داشتند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["می"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # present continuous (مضارع مستمر)
candidate_list = self.remove_prefixes(word, ["دارم", "داری", "دارد",
"داریم", "دارید", "دارند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["می"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", "د"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_f2p_map:
stem = self.verb_f2p_map[new_word] + "&" + new_word
return stem
            # present indicative (مضارع اخباری)
candidate_list = self.remove_prefixes(word, ["می", "نمی", "همی"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", "د"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_f2p_map:
stem = self.verb_f2p_map[new_word] + "&" + new_word
return stem
            # past imperfect (ماضی استمراری)
candidate_list = self.remove_prefixes(word, ["می", "نمی", "همی"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند",
"م", "ی", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # past perfect and past subjunctive (ماضی بعید و التزامی)
candidate_list = self.remove_postfixes(word, ["یم", "ید", "ند",
"م", "ی", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["بود", "باشد", "باش"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # present perfect (ماضی نقلی)
candidate_list = self.remove_postfixes(word, ["ام", "ای", "است",
"ایم", "اید", "اند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # remote past perfect (ماضی ابعد)
candidate_list = self.remove_postfixes(word, ["ام", "ای", "است",
"ایم", "اید", "اند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["بود"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["ه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
                                new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # future (آینده)
candidate_list = self.remove_prefixes(word, ["خواه", "نخواه"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
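                # in the future tense the person ending follows خواه and
                # precedes the past stem, so it is stripped here as a prefix
                # of the remainder, not as a postfix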
candidate_list = self.remove_prefixes(new_word, ["یم", "ید", "ند",
"م", "ی", "د"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # present subjunctive and imperative (مضارع التزامی و امر)
candidate_list = self.remove_prefixes(word, ["ب", "ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_postfixes(new_word, ["یم", "ید", "ند", "م",
"ی", "د", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
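                # undo orthographic changes at the stem boundary, e.g. a
                # stem-initial آ that surfaces as یا after the ب/ن prefix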
if (self.is_prefix(new_word, "یا")) and (new_word not in self.verb_lexicon):
candidate_list = self.remove_prefixes(new_word, ["یا"])
new_word = self.select_candidate(candidate_list)
new_word = "آ" + new_word
if self.is_postfix(new_word, "آی") or self.is_postfix(new_word, "ای"):
if new_word not in self.verb_lexicon:
candidate_list = self.remove_postfixes(new_word, ["ی"])
new_word = self.select_candidate(candidate_list)
if self.is_prefix(new_word, "ی"):
candidate_list = self.remove_prefixes(new_word, ["ی"])
tmp_word = self.select_candidate(candidate_list)
if tmp_word and ("ا" + tmp_word) in self.verb_lexicon:
new_word = "ا" + tmp_word
if new_word and new_word in self.verb_lexicon:
if new_word in self.verb_f2p_map:
stem = self.verb_f2p_map[new_word] + "&" + new_word
return stem
            # simple past (ماضی ساده)
candidate_list = self.remove_postfixes(word, ["م", "ی", "",
"یم", "ید", "ند"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
candidate_list = self.remove_prefixes(new_word, ["ن", ""])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
            # infinitive (حالت مصدر)
candidate_list = self.remove_postfixes(word, ["ن"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list, self.verb_lexicon)
if new_word:
if new_word in self.verb_p2f_map:
stem = new_word + "&" + self.verb_p2f_map[new_word]
return stem
if word_pos is None or word_pos == "N":
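            # strip noun affixes in cascades (possessives, plurals, derivational
            # postfixes, then prefixes), checking the lexicon after each pass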
            # possessive suffixes (پسوندهای مالکیت)
stem_candidate = word
candidate_list = self.remove_postfixes(word, ["م", "ت", "ش", "یم", "یت", "یش",
"یتان", "یشان", "یمان",
"مان", "تان", "شان", "ان"])
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
candidate_list = self.remove_postfixes(new_word, ["ها", "ات", "های",
"ان", "هایی", "ین"])
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
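            # a stem-final گ usually comes from a dropped ه (e.g. بچگان -> بچگ -> بچه)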
candidate_list = self.remove_postfixes(new_word, ["گ"])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
new_word = new_word + "ه"
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
candidate_list = self.remove_postfixes(new_word, self.postfix_list)
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
candidate_list = self.remove_prefixes(new_word, self.prefix_list)
if len(candidate_list) > 0:
stem_candidate = self.select_candidate(candidate_list, self.noun_lexicon)
if stem_candidate:
new_word = stem_candidate
else:
new_word = self.select_candidate(candidate_list)
stem_candidate = new_word
else:
new_word = stem_candidate
if new_word in self.noun_lexicon:
return self.map_irregular_noun(new_word)
        # prefixed verbs (افعال پیشوندی)
candidate_list = self.remove_prefixes(word, ['در', 'بر', 'پر', 'باز',
'ور', 'فرو', 'فرا', 'وا'])
if len(candidate_list) > 0:
new_word = self.select_candidate(candidate_list)
if new_word:
tmp_pr = word[:-len(new_word)].strip().strip('\u200c')
new_word = self.convert_to_stem(new_word, word_pos='V')
if new_word and new_word in self.verb_lexicon:
return tmp_pr + new_word
return word
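
# Minimal usage sketch (illustrative, not part of the module): the import path
# and example words are assumptions and depend on how the package is installed.
#
#     from parsivar import FindStems
#     stemmer = FindStems()
#     stemmer.convert_to_stem("کتاب‌ها")   # a plural noun, expected stem: کتاب
#     stemmer.convert_to_stem("می‌رفتم")   # an inflected verb, expected: رفت&رو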