from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd
import torch
from datasets import load_dataset, load_from_disk
from sklearn.model_selection import train_test_split

import src.data_utils.dataset_params as dataset_params
from src.data_utils.config import DatasetConfig, TextProcessorConfig
from src.data_utils.text_processor import TextProcessor


class DatasetGenerator:
    """
    Main dataset generator class.

    Provides methods to load raw data, build a vocabulary, and convert
    text datasets into tensors suitable for deep learning models.

    Args:
        dataset_name: Name of the dataset, from the DatasetName enum
        config: Configuration object with preprocessing parameters
            (a fresh DatasetConfig is created if none is given)
        device: Torch device to place tensors on (cpu/cuda)
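
    Example (illustrative; assumes the DatasetName enum exposes a member
    such as AG_NEWS in this repository's dataset_params):
        generator = DatasetGenerator(
            dataset_name=dataset_params.DatasetName.AG_NEWS,
            config=DatasetConfig(),
        )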
    """

    def __init__(
        self,
        dataset_name: dataset_params.DatasetName,
        config: Optional[DatasetConfig] = None,
        device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    ):
        # Use a fresh DatasetConfig per instance rather than a shared default
        # instance evaluated once at import time.
        self.config = config if config is not None else DatasetConfig()
        self.dataset_params = dataset_params.get_dataset_params_by_name(dataset_name=dataset_name)
        self.device = device
        # The processor starts without a vocabulary; build_vocabulary() assigns one later.
        self.text_processor = TextProcessor(
            vocab=None,
            config=TextProcessorConfig(
                max_seq_len=self.config.max_seq_len,
                lowercase=self.config.lowercase,
                remove_punct=self.config.remove_punct,
                pad_token=self.config.pad_token,
                unk_token=self.config.unk_token,
            ),
        )
        self.vocab = None
        self.id2word = None
        self.embedding_layer = None

    def load_raw_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load the raw dataset from local disk or from the Hugging Face Hub.

        Returns:
            Tuple of (train_df, val_df, test_df) DataFrames
        """
        if self.config.load_from_disk:
            dataset = load_from_disk(f"{self.config.path_to_data}/{self.dataset_params.local_path}")
        else:
            dataset = load_dataset(self.dataset_params.hugging_face_name)

        train_df = pd.DataFrame(dataset["train"])
        test_df = pd.DataFrame(dataset["test"])

        # Split the original test set in half into validation and test sets,
        # stratified by label so class proportions are preserved.
        val_df, test_df = train_test_split(
            test_df,
            test_size=0.5,
            random_state=self.config.random_state,
            stratify=test_df[self.dataset_params.label_col_name],
        )

        # Subsample each split to the configured size with a fixed seed.
        train_df = train_df.sample(n=self.config.train_size, random_state=self.config.random_state)
        val_df = val_df.sample(n=self.config.val_size, random_state=self.config.random_state)
        test_df = test_df.sample(n=self.config.test_size, random_state=self.config.random_state)

        return train_df, val_df, test_df

    def build_vocabulary(self, tokenized_texts: List[List[str]]) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Build a vocabulary from tokenized texts.

        Args:
            tokenized_texts: List of token lists, one per document

        Returns:
            Tuple of (word_to_id, id_to_word) mappings
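
        Example (illustrative; assumes min_word_freq=1 and the default
        special tokens):
            build_vocabulary([["hello", "world"], ["hello"]])
            # -> ({pad: 0, unk: 1, "hello": 2, "world": 3},
            #     {0: pad, 1: unk, 2: "hello", 3: "world"})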
        """
        all_tokens = [token for tokens in tokenized_texts for token in tokens]
        word_counts = Counter(all_tokens)

        # Keep only words that appear at least min_word_freq times.
        filtered_words = [
            word for word, count in word_counts.items()
            if count >= self.config.min_word_freq
        ]

        # Reserve ids 0 and 1 for the padding and unknown tokens.
        word_to_id = {self.config.pad_token: 0, self.config.unk_token: 1}
        id_to_word = {0: self.config.pad_token, 1: self.config.unk_token}

        for idx, word in enumerate(filtered_words, start=2):
            word_to_id[word] = idx
            id_to_word[idx] = word

        # Share the vocabulary with the text processor so it can encode texts.
        self.text_processor.vocab = word_to_id

        return word_to_id, id_to_word

    def generate_dataset(self) -> Tuple[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor]
    ]:
        """
        Generate the full dataset as feature/label tensors.

        Returns:
            Tuple containing:
            - (train_features, train_labels)
            - (val_features, val_labels)
            - (test_features, test_labels)
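
        Example (illustrative):
            (X_train, y_train), (X_val, y_val), (X_test, y_test) = generator.generate_dataset()
            # X_* are expected to have shape (num_samples, max_seq_len);
            # y_* have shape (num_samples,)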
        """
        train_df, val_df, test_df = self.load_raw_data()

        train_texts = train_df[self.dataset_params.content_col_name].tolist()

        if self.config.build_vocab:
            # Build the vocabulary from the training split only, so words unseen
            # in training map to the unknown token at validation/test time.
            train_tokens = [self.text_processor.preprocess_text(text) for text in train_texts]
            self.vocab, self.id2word = self.build_vocabulary(train_tokens)

        X_train = torch.stack([self.text_processor.text_to_tensor(text) for text in train_texts])

        val_texts = val_df[self.dataset_params.content_col_name].tolist()
        X_val = torch.stack([self.text_processor.text_to_tensor(text) for text in val_texts])

        test_texts = test_df[self.dataset_params.content_col_name].tolist()
        X_test = torch.stack([self.text_processor.text_to_tensor(text) for text in test_texts])

        y_train = torch.tensor(train_df[self.dataset_params.label_col_name].values, dtype=torch.long)
        y_val = torch.tensor(val_df[self.dataset_params.label_col_name].values, dtype=torch.long)
        y_test = torch.tensor(test_df[self.dataset_params.label_col_name].values, dtype=torch.long)

        # Place all tensors on the configured device, as documented in the class docstring.
        return (
            (X_train.to(self.device), y_train.to(self.device)),
            (X_val.to(self.device), y_val.to(self.device)),
            (X_test.to(self.device), y_test.to(self.device)),
        )

    def get_vocabulary(self) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Get the vocabulary mappings.

        Returns:
            Tuple of (word_to_id, id_to_word) dictionaries; both are None
            until build_vocabulary() has been called
        """
        return self.vocab, self.id2word

    def get_config(self) -> DatasetConfig:
        """
        Get the current configuration.

        Returns:
            DatasetConfig object
        """
        return self.config

    def get_text_processor(self) -> TextProcessor:
        """
        Get the text processor for inference usage.

        Returns:
            TextProcessor object
        """
        return self.text_processor
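

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative; assumes DatasetName.AG_NEWS exists
    # in dataset_params and the default DatasetConfig points at a reachable dataset).
    generator = DatasetGenerator(dataset_name=dataset_params.DatasetName.AG_NEWS)
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = generator.generate_dataset()
    print(X_train.shape, y_train.shape, X_val.shape, X_test.shape)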