# mana-tts / Parsivar / spell_checker.py
import pickle
import math
import os
from .normalizer import Normalizer
from .tokenizer import Tokenizer
from .data_helper import DataHelper
class SpellCheck:
def __init__(self):
self.normalizer = Normalizer()
self.tokenizer = Tokenizer()
self.data_helper = DataHelper()
self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
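        # Unigram and bigram language models: pickled (count table, total count) pairs.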
self.bigram_lm = self.data_helper.load_var(self.dir_path + "resource/spell/mybigram_lm.pckl")
self.onegram_lm = self.data_helper.load_var(self.dir_path + "resource/spell/onegram.pckl")
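        # Groups of Persian letters that represent the same or similar sounds;
        # substitutions inside a group are treated as more likely spelling errors.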
self.ingroup_chars = [{'ا', 'آ', 'ع'},
{'ت', 'ط'},
{'ث', 'س', 'ص'},
{'ح', 'ه'},
{'ذ', 'ز', 'ض', 'ظ'},
{'ق', 'غ'}]
def deletion(self, word):
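        # All strings obtained by deleting a single character; the '-' and '#'
        # markers themselves are never removed.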
p_list = []
for k in range(len(word)):
if word[k] == '-' or word[k] == '#':
continue
begin = word[:k]
end = word[k+1:]
tmp_string = begin + end
p_list.append(tmp_string)
return p_list
def splitting(self, word):
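        # All strings obtained by inserting a '-' split marker at each position
        # (any ZWNJ next to the split point is stripped); these represent the
        # possible ways of splitting the token into two words.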
p_list = set([])
        delimiter = '-'
for i, char in enumerate(word):
begin = word[:i].strip('\u200c')
end = word[i:].strip('\u200c')
            tmp_string = begin + delimiter + end
p_list.add(tmp_string)
return list(p_list)
def insertion(self, word):
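        # All strings obtained by inserting one Persian letter (or a ZWNJ) at
        # any position in the word.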
p_list = []
alphabet = ['ا', 'آ', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ',
'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط',
'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن',
'و', 'ه', 'ی', '‌']
for k in range(len(word)+1):
for char in alphabet:
begin = word[:k]
end = word[k:]
tmp_string = begin + char + end
p_list.append(tmp_string)
return p_list
def substitution(self, word):
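        # All strings obtained by replacing one character with a Persian letter;
        # the '-' and '#' markers are left untouched.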
p_list = []
alphabet = ['ا', 'آ', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ',
'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط',
'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن',
'و', 'ه', 'ی']
for i, char in enumerate(word):
if char == '-' or char == '#':
continue
for c in alphabet:
begin = word[:i]
end = word[i+1:]
tmp_string = begin + c + end
p_list.append(tmp_string)
return p_list
def transposition(self, word):
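        # All strings obtained by swapping two adjacent characters.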
p_list = []
word = list(word)
tmp_word = word[:]
for k1 in range(len(word)):
k2 = k1 + 1
if k2 == len(word):
break
tmp = tmp_word[k1]
tmp_word[k1] = tmp_word[k2]
tmp_word[k2] = tmp
tmp_string = ''.join(tmp_word)
p_list.append(tmp_string)
tmp_word = word[:]
return p_list
def build_similar_words(self, word_seq, index, zi, operation):
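        # Generate candidates for token zi under one edit family: "Spell"
        # (deletion/insertion/substitution/transposition), "Split" (insert a
        # '-' marker) or "Merg" (join zi with the next token via a '#' marker).
        # Returns the parallel lists of candidate strings and operation names.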
z_list = []
o_list = []
if operation == "Spell":
tmp = self.deletion(zi)
for i in tmp:
z_list.append(i)
o_list.append("Deletion")
tmp = self.insertion(zi)
for i in tmp:
z_list.append(i)
o_list.append("Insertion")
tmp = self.substitution(zi)
for i in tmp:
z_list.append(i)
o_list.append("Substitution")
tmp = self.transposition(zi)
for i in tmp:
z_list.append(i)
o_list.append("Transposition")
elif operation == "Split":
tmp = self.splitting(zi)
for i in tmp:
z_list.append(i)
o_list.append("Split")
elif operation == "Merg":
if index < len(word_seq)-1:
tmp = zi + '#' + word_seq[index+1]
z_list.append(tmp)
o_list.append("Merg")
return [z_list, o_list]
def bigram_markov_factor(self, yi_1, yi):
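        # Log2 relative frequency of the bigram (yi_1, yi) in the bigram model;
        # unseen bigrams get a flat floor of -28.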
bigram_counts, total_count = self.bigram_lm
tmp = (yi_1, yi)
        if tmp in bigram_counts:
x = bigram_counts[tmp]
x = float(x)/total_count
x = math.log2(x)
return x
else:
return -28
def get_word_probability(self, word):
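        # Log2 relative frequency of a word in the unigram model; unknown words
        # get a floor of -50.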
lex_dict = self.onegram_lm[0]
total_words = self.onegram_lm[1]
if word in lex_dict:
count = lex_dict[word]
logprob = math.log2(float(count)/total_words)
return logprob
else:
return -50.0
def isword(self, x):
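        # A candidate is accepted if both halves of a '-' split are known words,
        # if the word obtained by removing a '#' merge marker is known, or, for a
        # plain candidate, if the word itself is known (unigram log-probability
        # above the -50 OOV floor).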
if abs(x.find('#') - x.find('-')) == 1:
return False
dash_idx = x.find('-')
if dash_idx != -1:
            first = x[:dash_idx]       # text before the split marker
            second = x[dash_idx+1:]    # text after the split marker
            if self.get_word_probability(first) < -49:
                return False
            elif self.get_word_probability(second) < -49:
                return False
else:
return True
sharp_idx = x.find('#')
if sharp_idx != -1:
begin = x[:sharp_idx]
end = x[sharp_idx+1:]
tmp_str = begin + end
if self.get_word_probability(tmp_str) < -49:
return False
else:
return True
else:
if self.get_word_probability(x) < -49:
return False
else:
return True
def get_possible_words(self, word_seq, index):
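        # Collect plausible corrections for word_seq[index]: the word itself plus
        # every merge, split and single-edit candidate that passes isword().
        # Single-character tokens are returned unchanged.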
wi = word_seq[index]
possible_words = []
operation_list = []
possible_words.append(wi)
operation_list.append("Nothing")
if len(wi) == 1:
return possible_words, operation_list
        # generate the Merg, Split and Spell candidate families in turn
[c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Merg")
for i, c in enumerate(c_list):
if self.isword(c):
possible_words.append(c)
operation_list.append(o_list[i])
[c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Split")
for i, c in enumerate(c_list):
if self.isword(c):
possible_words.append(c)
operation_list.append(o_list[i])
[c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Spell")
for i, c in enumerate(c_list):
if self.isword(c):
possible_words.append(c)
operation_list.append(o_list[i])
return possible_words, operation_list
def select_n_best(self, c_list, o_list, n=3):
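        # Rank candidates by unigram probability (averaging the two halves of a
        # split, joining the halves of a merge) and keep the n best, always
        # retaining the original word c_list[0].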
my_dict = {}
map_dict = {}
for i, word in enumerate(c_list):
if o_list[i] == 'Merg':
tmp_word = word.replace("#", "")
prob = self.get_word_probability(tmp_word)
elif o_list[i] == 'Split':
begin = word.split('-')[0]
end = word.split('-')[1]
prob = float(self.get_word_probability(begin) + self.get_word_probability(end))/2
else:
prob = self.get_word_probability(word)
if word not in my_dict:
my_dict[word] = prob
map_dict[word] = o_list[i]
n_best = set(sorted(my_dict, key=my_dict.get, reverse=True)[:n])
n_best.add(c_list[0])
n_best = list(n_best)
n_best_op = [map_dict[key] for key in n_best]
return n_best, n_best_op
def is_ingroup_substitution(self, main_word, candidate_word):
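        # True when the character that differs between main_word and
        # candidate_word belongs to one of the sound-alike groups above.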
main_word = list(main_word)
candidate_word = list(candidate_word)
flag = False
for i, c in enumerate(main_word):
if c == candidate_word[i]:
continue
else:
flag = False
for l in self.ingroup_chars:
if c in l and candidate_word[i] in l:
flag = True
break
break
return flag
def select_correct_spell(self, candidate_list, next_candidates, next_next_candidates, prev_word, current_word):
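        # Score each candidate by its unigram probability, the bigram with the
        # previously chosen word, the best bigram with the next word's candidates,
        # and a small bonus depending on the edit operation; return the best
        # candidate together with its operation.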
best_candidate = None
best_operation = None
best_score = -1000
next_next_candidate_list = []
next_next_operation_list = []
candidate_list, operation_list = candidate_list
        if next_candidates is not None:
            next_candidate_list, next_operation_list = next_candidates
        else:
            next_candidate_list, next_operation_list = [None], ["Nothing"]
        if next_next_candidates is not None:
            next_next_candidate_list, next_next_operation_list = next_next_candidates
        else:
            next_next_candidate_list, next_next_operation_list = [None], ["Nothing"]
for i, candidate in enumerate(candidate_list):
operation = operation_list[i]
if operation == "Split":
begin = candidate[:candidate.find('-')]
end = candidate[candidate.find('-')+1:]
candidate = begin
next_word = end
onegram_score = self.get_word_probability(candidate)
bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
bigram_score_next = -1000
tmp_score_next = self.bigram_markov_factor(candidate, next_word)
for j, next_next_word in enumerate(next_candidate_list):
opt = next_operation_list[j]
if opt == 'Merg':
next_next_word = next_next_word.replace("#", "")
elif opt == 'Split':
next_next_word = next_next_word.split('-')[0]
tmp_score_next_next = self.bigram_markov_factor(next_word, next_next_word)
if tmp_score_next_next > bigram_score_next:
bigram_score_next = tmp_score_next_next
bigram_score_next = float(bigram_score_next + tmp_score_next)/2
elif operation == "Merg":
begin = candidate[:candidate.find('#')]
end = candidate[candidate.find('#')+1:]
candidate = begin + end
onegram_score = self.get_word_probability(candidate)
bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
bigram_score_next = -1000
for j, next_next_word in enumerate(next_next_candidate_list):
opt = next_next_operation_list[j]
if opt == 'Merg':
next_next_word = next_next_word.replace("#", "")
elif opt == 'Split':
next_next_word = next_next_word.split('-')[0]
tmp_score = self.bigram_markov_factor(candidate, next_next_word)
if tmp_score > bigram_score_next:
bigram_score_next = tmp_score
else:
onegram_score = self.get_word_probability(candidate)
bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
bigram_score_next = -1000
for j, next_word in enumerate(next_candidate_list):
opt = next_operation_list[j]
if opt == 'Merg':
next_word = next_word.replace("#", "")
elif opt == 'Split':
next_word = next_word.split('-')[0]
tmp_score = self.bigram_markov_factor(candidate, next_word)
if tmp_score > bigram_score_next:
bigram_score_next = tmp_score
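            # Heuristic bonuses: keeping the word unchanged or substituting within
            # a sound-alike group is favored most (+20); other edits get smaller boosts.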
if operation == 'Substitution':
if self.is_ingroup_substitution(current_word, candidate):
onegram_score += 20
else:
onegram_score += 10
elif operation == 'Deletion' or operation == 'Insertion':
onegram_score += 5
if '\u200c' in candidate and '\u200c' not in current_word:
onegram_score += 5
elif operation == 'Split' or operation == 'Merg':
onegram_score += 7
elif operation == 'Nothing':
onegram_score += 20
score = 1*onegram_score + 0.7*bigram_score_with_prev + 0.7*bigram_score_next
if score > best_score:
best_operation = operation
best_candidate = candidate_list[i]
best_score = score
return best_candidate, best_operation
def spell_corrector(self, doc_string):
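        # Normalize and tokenize the input, build an n-best candidate list for
        # every token, then pick the best-scoring correction for each token
        # (skipping a token that was merged into its predecessor) and join the
        # corrected tokens back into a string.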
words = self.tokenizer.tokenize_words(self.normalizer.normalize(doc_string))
best_o_list = []
best_candidates_list = []
yi_1 = None
merged_before = False
suggest_list = []
for i, word in enumerate(words):
[c_list, o_list] = self.get_possible_words(words, i)
n_best = self.select_n_best(c_list, o_list, n=15)
suggest_list.append(n_best)
for i, candidate_list in enumerate(suggest_list):
            if merged_before:
                # this token was already consumed by a merge with the previous one
                merged_before = False
                continue
if (i+2) < len(suggest_list):
next_candidates = suggest_list[i+1]
next_next_candidates = suggest_list[i+2]
elif (i+1) < len(suggest_list):
next_candidates = suggest_list[i+1]
next_next_candidates = None
else:
next_candidates = None
next_next_candidates = None
best_candidate, best_operation = self.select_correct_spell(candidate_list, next_candidates,
next_next_candidates, yi_1, words[i])
merged_before = False
if best_operation == "Split":
begin = best_candidate.split('-')[0]
end = best_candidate.split('-')[1]
best_candidate = [begin, end]
if best_operation == "Merg":
best_candidate = best_operation.replace("#", "")
merged_before = True
if type(best_candidate) == str:
best_candidate = [best_candidate]
best_o_list.append(best_operation)
best_candidates_list.extend(best_candidate)
yi_1 = best_candidate[-1]
res = " ".join(best_candidates_list)
ops = " ".join(best_o_list)
return res
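# Quick demo: because of the relative imports above, this module has to be run
# as part of the Parsivar package (e.g. via python -m) rather than as a standalone script.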
if __name__ == "__main__":
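    # Intentionally misspelled input; the intended sentence is "نمازگزاران وارد مصلی شدند."
    # ("The worshippers entered the prayer hall.")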
doc_string = "نمازگذاران وارد مسلی شدند."
myspell_checker = SpellCheck()
res = myspell_checker.spell_corrector(doc_string)
print(res)