from dataclasses import dataclass


@dataclass
class DatasetConfig:
    """
    Configuration class for dataset generation parameters.

    Attributes:
        embedding_dim: Dimension of the embedding layer output
        train_size: Number of samples in the training set
        val_size: Number of samples in the validation set
        test_size: Number of samples in the test set
        random_state: Random seed for reproducibility
        min_word_freq: Minimum word frequency to include in the vocabulary
        load_from_disk: Load the dataset from a local directory; if False, download it from Hugging Face
        path_to_data: Path to the local dataset data
        build_vocab: Whether to build the vocabulary
        max_seq_len: Maximum sequence length (sequences are padded/truncated to this)
        lowercase: Whether to convert text to lowercase
        remove_punct: Whether to remove punctuation
        pad_token: Padding token
        unk_token: Unknown token
    """

    embedding_dim: int = 64
    train_size: int = 10000
    val_size: int = 5000
    test_size: int = 5000
    random_state: int = 42
    min_word_freq: int = 1
    load_from_disk: bool = False
    path_to_data: str = "./datasets"
    build_vocab: bool = True
    max_seq_len: int = 300
    lowercase: bool = True
    remove_punct: bool = False
    pad_token: str = "<PAD>"
    unk_token: str = "<UNK>"


@dataclass
class TextProcessorConfig:
    """
    Configuration class for text processor parameters
    (these should match the corresponding fields of DatasetConfig).

    Attributes:
        max_seq_len: Maximum sequence length (sequences are padded/truncated to this)
        lowercase: Whether to convert text to lowercase
        remove_punct: Whether to remove punctuation
        pad_token: Padding token
        unk_token: Unknown token
    """

    max_seq_len: int = 300
    lowercase: bool = True
    remove_punct: bool = False
    pad_token: str = "<PAD>"
    unk_token: str = "<UNK>"
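

# Since TextProcessorConfig duplicates the text-handling fields of DatasetConfig
# (its docstring requires them to match), a minimal sketch of deriving one from
# the other so the shared fields never drift apart. The helper name
# `processor_config_from_dataset` is illustrative, not part of the codebase.
from dataclasses import fields


def processor_config_from_dataset(ds_cfg: DatasetConfig) -> TextProcessorConfig:
    """Build a TextProcessorConfig whose fields mirror the given DatasetConfig."""
    # Every field declared on TextProcessorConfig also exists on DatasetConfig,
    # so copy each one by name instead of restating the values by hand.
    shared = {f.name for f in fields(TextProcessorConfig)}
    return TextProcessorConfig(**{name: getattr(ds_cfg, name) for name in shared})


# Example usage: override dataset-level knobs once; the processor config follows.
#
#     ds_cfg = DatasetConfig(max_seq_len=256, lowercase=False)
#     tp_cfg = processor_config_from_dataset(ds_cfg)
#     assert tp_cfg.max_seq_len == 256 and tp_cfg.lowercase is False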