# Hugging Face Space: litvinovmitch11/monkey_coding_dl_project (status: Running)
# File size: 6,170 bytes

from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd
import torch

from datasets import load_dataset, load_from_disk
from sklearn.model_selection import train_test_split

import src.data_utils.dataset_params as dataset_params

from src.data_utils.config import DatasetConfig, TextProcessorConfig
from src.data_utils.text_processor import TextProcessor


class DatasetGenerator:
    """
    Main dataset generator class

    Loads the raw splits of a text dataset, optionally builds a vocabulary
    from the training texts, and encodes every split into tensors through a
    ``TextProcessor``.

    Args:
        dataset_name: Name of dataset from DatasetName enum
        config: Configuration object with preprocessing parameters. When
            omitted, a fresh ``DatasetConfig()`` is created for this instance.
        device: Torch device requested by the caller. It is stored on the
            instance; the methods of this class do not move tensors to it.
    """

    def __init__(
        self,
        dataset_name: "dataset_params.DatasetName",
        config: Optional["DatasetConfig"] = None,
        device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    ):
        # Build the default config here, not in the signature: a default created
        # in the signature is one object shared by every generator instance.
        if config is None:
            config = DatasetConfig()

        self.dataset_params = dataset_params.get_dataset_params_by_name(dataset_name=dataset_name)
        self.config = config
        self.device = device
        self.text_processor = TextProcessor(
            vocab=None,
            config=TextProcessorConfig(
                max_seq_len=self.config.max_seq_len,
                lowercase=self.config.lowercase,
                remove_punct=self.config.remove_punct,
                pad_token=self.config.pad_token,
                unk_token=self.config.unk_token,
            ),
        )
        # Filled in by generate_dataset() when config.build_vocab is set.
        self.vocab = None
        self.id2word = None
        # Initialised for callers; never assigned by the methods of this class.
        self.embedding_layer = None

    @staticmethod
    def _sample_split(frame: pd.DataFrame, size: Optional[int], seed: Optional[int]) -> pd.DataFrame:
        """Draw ``size`` rows from ``frame`` without replacement, capped at its length."""
        # DataFrame.sample raises ValueError when asked for more rows than exist,
        # so an oversized configured size falls back to the whole (shuffled) split.
        if size is not None:
            size = min(size, len(frame))
        return frame.sample(n=size, random_state=seed)

    def load_raw_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load raw dataset from source

        The data is read from disk when ``config.load_from_disk`` is set,
        otherwise it is fetched by its Hugging Face name. The source "test"
        split is divided in half, stratified by label, into validation and
        test parts. Each split is then subsampled to the configured size; a
        size larger than the split is capped at the split length.

        Returns:
            Tuple of (train_df, val_df, test_df) DataFrames
        """
        if self.config.load_from_disk:
            dataset = load_from_disk(f"{self.config.path_to_data}/{self.dataset_params.local_path}")
        else:
            dataset = load_dataset(self.dataset_params.hugging_face_name)

        seed = self.config.random_state
        train_df = pd.DataFrame(dataset["train"])
        held_out_df = pd.DataFrame(dataset["test"])
        val_df, test_df = train_test_split(
            held_out_df,
            test_size=0.5,
            random_state=seed,
            stratify=held_out_df[self.dataset_params.label_col_name],
        )

        # Sample configured sizes
        return (
            self._sample_split(train_df, self.config.train_size, seed),
            self._sample_split(val_df, self.config.val_size, seed),
            self._sample_split(test_df, self.config.test_size, seed),
        )

    def build_vocabulary(self, tokenized_texts: List[List[str]]) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Build vocabulary from tokenized texts

        The pad and unk tokens are always assigned ids 0 and 1. Every other
        token seen at least ``config.min_word_freq`` times receives the next
        free id, in order of first appearance. The resulting word-to-id
        mapping is also installed on the text processor.

        Args:
            tokenized_texts: List of tokenized texts

        Returns:
            Tuple of (word_to_id, id_to_word) mappings
        """
        token_counts = Counter(token for tokens in tokenized_texts for token in tokens)

        pad_token = self.config.pad_token
        unk_token = self.config.unk_token
        min_freq = self.config.min_word_freq

        word_to_id = {pad_token: 0, unk_token: 1}
        id_to_word = {0: pad_token, 1: unk_token}

        # A special token that also appears in the corpus must keep its reserved
        # id; adding it again would overwrite that id and give it two entries in
        # id_to_word.
        frequent_words = (
            word
            for word, count in token_counts.items()
            if count >= min_freq and word not in word_to_id
        )
        for idx, word in enumerate(frequent_words, start=2):
            word_to_id[word] = idx
            id_to_word[idx] = word

        self.text_processor.vocab = word_to_id

        return word_to_id, id_to_word

    def _encode_split(self, frame: pd.DataFrame) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode one split into a (features, labels) tensor pair."""
        texts = frame[self.dataset_params.content_col_name].tolist()
        features = torch.stack([self.text_processor.text_to_tensor(text) for text in texts])
        labels = torch.tensor(frame[self.dataset_params.label_col_name].values, dtype=torch.long)
        return features, labels

    def generate_dataset(self) -> Tuple[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor]
    ]:
        """
        Main method to generate the full dataset

        When ``config.build_vocab`` is set, the vocabulary is built from the
        training texts and stored on the instance before any split is encoded.

        Returns:
            Tuple containing:
            - (train_features, train_labels)
            - (val_features, val_labels)
            - (test_features, test_labels)
        """
        train_df, val_df, test_df = self.load_raw_data()

        if self.config.build_vocab:
            # Tokens are only needed to count words, so tokenising is skipped
            # entirely when no vocabulary is built.
            train_texts = train_df[self.dataset_params.content_col_name].tolist()
            train_tokens = [self.text_processor.preprocess_text(text) for text in train_texts]
            self.vocab, self.id2word = self.build_vocabulary(train_tokens)

        return (
            self._encode_split(train_df),
            self._encode_split(val_df),
            self._encode_split(test_df),
        )

    def get_vocabulary(self) -> Tuple[Optional[Dict[str, int]], Optional[Dict[int, str]]]:
        """
        Get vocabulary mappings

        Returns:
            Tuple of (word_to_id, id_to_word) dictionaries; both are None
            until generate_dataset() has built a vocabulary.
        """
        return self.vocab, self.id2word

    def get_config(self) -> "DatasetConfig":
        """
        Get current configuration

        Returns:
            DatasetConfig object
        """
        return self.config

    def get_text_processor(self) -> "TextProcessor":
        """
        Get the text processor for inference usage

        Returns:
            TextProcessor object
        """
        return self.text_processor