# models/model1/lstm_preprocessor.py
import re
import string

import joblib
import numpy as np
import torch
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin


class TextPreprocessorWord2Vec(BaseEstimator, TransformerMixin):
    """Cleans Russian text and encodes it into fixed-length index sequences for an LSTM."""

    def __init__(self, seq_len: int = 64):
        self.seq_len = seq_len
        self.stop_words = set(stopwords.words('russian'))
        self.vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')

    def preprocess_text(self, text):
        # Lowercase the text
        text = text.lower()
        # Remove HTML tags
        text = re.sub('<.*?>', '', text)
        # Remove punctuation
        text = ''.join([c for c in text if c not in string.punctuation])
        # Remove stop words
        text = ' '.join([word for word in text.split() if word not in self.stop_words])
        # Remove digits
        text = ' '.join([word for word in text.split() if not word.isdigit()])
        return text

    @staticmethod
    def padding(review_int: list, seq_len: int) -> np.ndarray:
        """Left-pad each encoded review with zeros (or truncate it) to exactly seq_len tokens."""
        features = np.zeros((len(review_int), seq_len), dtype=int)
        for i, review in enumerate(review_int):
            if len(review) <= seq_len:
                zeros = list(np.zeros(seq_len - len(review)))
                new = zeros + review
            else:
                new = review[:seq_len]
            features[i, :] = np.array(new)
        return features

    @staticmethod
    def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
        verbose: bool = False
    ) -> torch.Tensor:
        """Clean a single string, map its words to vocabulary indices and return a padded tensor."""
        # Note: this creates a fresh preprocessor, which reloads the vocabulary pickle.
        preprocessed_string = TextPreprocessorWord2Vec().preprocess_text(input_string)
        result_list = []
        for word in preprocessed_string.split():
            try:
                result_list.append(vocab_to_int[word])
            except KeyError as e:
                # Silently skip out-of-vocabulary words unless verbose output is requested
                if verbose:
                    print(f'{e}: not in dictionary!')
        result_padded = TextPreprocessorWord2Vec.padding([result_list], seq_len)[0]
        return torch.tensor(result_padded)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.preprocess_single_string(X, self.seq_len, self.vocab_to_int)
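

# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of how the transformer above might be used, assuming
# the vocabulary pickle at models/model1/lstm_vocab_to_int.pkl is present and the
# NLTK Russian stopword list has been downloaded (nltk.download('stopwords')).
# The sample review text below is hypothetical.
if __name__ == '__main__':
    preprocessor = TextPreprocessorWord2Vec(seq_len=64)
    sample_review = 'Отличный фильм, всем советую посмотреть!'  # hypothetical Russian input
    encoded = preprocessor.transform(sample_review)
    print(encoded.shape)  # torch.Size([64]) -- left-padded with zeros
    print(encoded[-5:])   # the trailing positions hold the token ids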