monkey_coding_dl_project / src /data_utils /dataset_generator.py
litvinovmitch11's picture
Synced repo using 'sync_with_huggingface' Github Action
2a591a9 verified
from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd
import torch
from datasets import load_dataset, load_from_disk
from sklearn.model_selection import train_test_split

import src.data_utils.dataset_params as dataset_params
from src.data_utils.config import DatasetConfig, TextProcessorConfig
from src.data_utils.text_processor import TextProcessor
class DatasetGenerator:
    """
    Main dataset generator class

    Provides methods to load a text dataset, build a vocabulary from it and
    convert the texts and labels into tensors.

    Args:
        dataset_name: Name of dataset from DatasetName enum
        config: Configuration object with preprocessing parameters. When
            omitted, a fresh ``DatasetConfig()`` is created for this instance.
        device: Torch device (cpu/cuda). When omitted, cuda is selected if
            available, otherwise cpu. The device is only stored on the
            instance; no method of this class moves tensors to it.
    """

    def __init__(
        self,
        dataset_name: dataset_params.DatasetName,
        config: Optional[DatasetConfig] = None,
        device: Optional[torch.device] = None,
    ):
        # Defaults are resolved per call rather than in the signature, so
        # instances never share one config object and CUDA availability is
        # checked at construction time instead of import time.
        if config is None:
            config = DatasetConfig()
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.dataset_params = dataset_params.get_dataset_params_by_name(dataset_name=dataset_name)
        self.config = config
        self.device = device

        # The processor starts without a vocabulary; build_vocabulary() sets it.
        self.text_processor = TextProcessor(
            vocab=None,
            config=TextProcessorConfig(
                max_seq_len=self.config.max_seq_len,
                lowercase=self.config.lowercase,
                remove_punct=self.config.remove_punct,
                pad_token=self.config.pad_token,
                unk_token=self.config.unk_token,
            ),
        )

        self.vocab = None
        self.id2word = None
        self.embedding_layer = None

    def load_raw_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load raw dataset from source

        The dataset is read from disk or from the Hugging Face hub depending
        on ``config.load_from_disk``. Its "test" split is divided in half
        (stratified by label) into validation and test parts, then every part
        is sub-sampled to the configured size.

        Returns:
            Tuple of (train_df, val_df, test_df) DataFrames

        Raises:
            ValueError: Raised by pandas when a configured size is larger
                than the number of rows available in the corresponding split.
        """
        if self.config.load_from_disk:
            dataset = load_from_disk(f"{self.config.path_to_data}/{self.dataset_params.local_path}")
        else:
            dataset = load_dataset(self.dataset_params.hugging_face_name)

        train_df = pd.DataFrame(dataset["train"])
        test_df = pd.DataFrame(dataset["test"])

        # Only "train" and "test" are read from the source, so the validation
        # set is carved out of the test split.
        val_df, test_df = train_test_split(
            test_df,
            test_size=0.5,
            random_state=self.config.random_state,
            stratify=test_df[self.dataset_params.label_col_name],
        )

        # Sample configured sizes
        seed = self.config.random_state
        train_df = train_df.sample(n=self.config.train_size, random_state=seed)
        val_df = val_df.sample(n=self.config.val_size, random_state=seed)
        test_df = test_df.sample(n=self.config.test_size, random_state=seed)

        return train_df, val_df, test_df

    def build_vocabulary(self, tokenized_texts: List[List[str]]) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Build vocabulary from tokenized texts

        Ids 0 and 1 are reserved for the pad and unk tokens. Every other
        token that occurs at least ``config.min_word_freq`` times receives
        the next free id, in order of first appearance. The result is stored
        on the text processor and on this instance.

        Args:
            tokenized_texts: List of tokenized texts

        Returns:
            Tuple of (word_to_id, id_to_word) mappings
        """
        pad_token = self.config.pad_token
        unk_token = self.config.unk_token
        min_freq = self.config.min_word_freq

        word_counts = Counter(token for tokens in tokenized_texts for token in tokens)

        # A corpus token equal to a special token must not get a second id:
        # that would overwrite the reserved 0/1 entry in word_to_id.
        special_tokens = {pad_token, unk_token}
        kept_words = [
            word
            for word, count in word_counts.items()
            if count >= min_freq and word not in special_tokens
        ]

        word_to_id = {pad_token: 0, unk_token: 1}
        id_to_word = {0: pad_token, 1: unk_token}
        for idx, word in enumerate(kept_words, start=2):
            word_to_id[word] = idx
            id_to_word[idx] = word

        # Keep the processor and get_vocabulary() in sync with the new mapping.
        self.text_processor.vocab = word_to_id
        self.vocab = word_to_id
        self.id2word = id_to_word

        return word_to_id, id_to_word

    def _encode_split(self, df: pd.DataFrame) -> Tuple[torch.Tensor, torch.Tensor]:
        """Convert one split into a (features, labels) tensor pair."""
        texts = df[self.dataset_params.content_col_name].tolist()
        # torch.stack needs equally-shaped tensors, so text_to_tensor is
        # expected to return a fixed-size tensor per text.
        features = torch.stack([self.text_processor.text_to_tensor(text) for text in texts])
        labels = torch.tensor(df[self.dataset_params.label_col_name].values, dtype=torch.long)
        return features, labels

    def generate_dataset(self) -> Tuple[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor]
    ]:
        """
        Main method to generate the full dataset

        Loads the raw splits, builds the vocabulary from the training texts
        when ``config.build_vocab`` is set, then encodes every split.

        Returns:
            Tuple containing:
                - (train_features, train_labels)
                - (val_features, val_labels)
                - (test_features, test_labels)
        """
        train_df, val_df, test_df = self.load_raw_data()

        if self.config.build_vocab:
            # The vocabulary is built from the training split only; tokens
            # are needed for nothing else, so they are computed only here.
            train_texts = train_df[self.dataset_params.content_col_name].tolist()
            train_tokens = [self.text_processor.preprocess_text(text) for text in train_texts]
            self.vocab, self.id2word = self.build_vocabulary(train_tokens)

        train_pair = self._encode_split(train_df)
        val_pair = self._encode_split(val_df)
        test_pair = self._encode_split(test_df)

        return train_pair, val_pair, test_pair

    def get_vocabulary(self) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Get vocabulary mappings

        Returns:
            Tuple of (word_to_id, id_to_word) dictionaries; both are None
            until a vocabulary has been built.
        """
        return self.vocab, self.id2word

    def get_config(self) -> DatasetConfig:
        """
        Get current configuration

        Returns:
            DatasetConfig object
        """
        return self.config

    def get_text_processor(self) -> TextProcessor:
        """
        Get the text processor for inference usage

        Returns:
            TextProcessor object
        """
        return self.text_processor