# models/model1/lstm_preprocessor.py
import re
import string

import joblib
import numpy as np
import torch
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin


class TextPreprocessorWord2Vec(BaseEstimator, TransformerMixin):
    """Cleans Russian text and encodes it into fixed-length index sequences for an LSTM."""

    def __init__(self, seq_len: int = 64):
        self.seq_len = seq_len
        self.stop_words = set(stopwords.words('russian'))
        self.vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')

    def preprocess_text(self, text):
        # Lowercase the text
        text = text.lower()
        # Remove HTML tags
        text = re.sub('<.*?>', '', text)
        # Remove punctuation
        text = ''.join([c for c in text if c not in string.punctuation])
        # Remove stop words
        text = ' '.join([word for word in text.split() if word not in self.stop_words])
        # Remove digits
        text = ' '.join([word for word in text.split() if not word.isdigit()])
        return text

    @staticmethod
    def padding(review_int: list, seq_len: int) -> np.ndarray:
        """Left-pad each encoded review with zeros (or truncate it) to exactly seq_len tokens."""
        features = np.zeros((len(review_int), seq_len), dtype=int)
        for i, review in enumerate(review_int):
            if len(review) <= seq_len:
                zeros = list(np.zeros(seq_len - len(review)))
                new = zeros + review
            else:
                new = review[:seq_len]
            features[i, :] = np.array(new)
        return features

    @staticmethod
    def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
        verbose: bool = False
    ) -> torch.Tensor:
        """Clean a single string, map its words to vocabulary indices and return a padded tensor."""
        # Note: this creates a fresh preprocessor, which reloads the vocabulary pickle.
        preprocessed_string = TextPreprocessorWord2Vec().preprocess_text(input_string)
        result_list = []
        for word in preprocessed_string.split():
            try:
                result_list.append(vocab_to_int[word])
            except KeyError as e:
                # Silently skip out-of-vocabulary words unless verbose output is requested
                if verbose:
                    print(f'{e}: not in dictionary!')
        result_padded = TextPreprocessorWord2Vec.padding([result_list], seq_len)[0]
        return torch.tensor(result_padded)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.preprocess_single_string(X, self.seq_len, self.vocab_to_int)
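

# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of how the transformer above might be used, assuming
# the vocabulary pickle at models/model1/lstm_vocab_to_int.pkl is present and the
# NLTK Russian stopword list has been downloaded (nltk.download('stopwords')).
# The sample review text below is hypothetical.
if __name__ == '__main__':
    preprocessor = TextPreprocessorWord2Vec(seq_len=64)
    sample_review = 'Отличный фильм, всем советую посмотреть!'  # hypothetical Russian input
    encoded = preprocessor.transform(sample_review)
    print(encoded.shape)  # torch.Size([64]) -- left-padded with zeros
    print(encoded[-5:])   # the trailing positions hold the token ids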