Spaces:

litvinovmitch11
/

monkey_coding_dl_project

Running

App Files Files Community

monkey_coding_dl_project / src /data_utils /config.py

litvinovmitch11

Synced repo using 'sync_with_huggingface' Github Action

2a591a9 verified 11 days ago

raw

history blame contribute delete

1.93 kB

	from dataclasses import dataclass


	@dataclass
	class DatasetConfig:
	    """Configuration for dataset generation parameters.

	    The text-processing fields at the end of this class (``max_seq_len``,
	    ``lowercase``, ``remove_punct``, ``pad_token``, ``unk_token``) are also
	    declared, with the same defaults, on ``TextProcessorConfig`` in this
	    module; the two are meant to be kept equal.

	    Attributes:
	        embedding_dim: Dimension of the embedding layer output.
	        train_size: Number of samples in the training set.
	        val_size: Number of samples in the validation set.
	        test_size: Number of samples in the test set.
	        random_state: Random seed for reproducibility.
	        min_word_freq: Minimum word frequency required for a word to be
	            included in the vocabulary.
	        load_from_disk: Load the dataset from a local directory. If false,
	            download it from Hugging Face.
	        path_to_data: Path to the local dataset data.
	        build_vocab: Whether the vocabulary needs to be built.
	        max_seq_len: Maximum sequence length (sequences are padded or
	            truncated to this length).
	        lowercase: Whether to convert text to lowercase.
	        remove_punct: Whether to remove punctuation.
	        pad_token: Padding token.
	        unk_token: Unknown token.
	    """

	    # Dataset / vocabulary options.
	    # NOTE: field order is part of the generated ``__init__`` signature;
	    # do not reorder.
	    embedding_dim: int = 64
	    train_size: int = 10000
	    val_size: int = 5000
	    test_size: int = 5000
	    random_state: int = 42
	    min_word_freq: int = 1
	    load_from_disk: bool = False
	    path_to_data: str = "./datasets"
	    build_vocab: bool = True

	    # Text-processing options (same names and defaults as TextProcessorConfig).
	    max_seq_len: int = 300
	    lowercase: bool = True
	    remove_punct: bool = False
	    pad_token: str = "<PAD>"
	    unk_token: str = "<UNK>"


	@dataclass
	class TextProcessorConfig:
	    """Configuration for text processor parameters.

	    These parameters should be equal to the corresponding fields of the
	    dataset config.

	    Attributes:
	        max_seq_len: Maximum sequence length (sequences are padded or
	            truncated to this length).
	        lowercase: Whether to convert text to lowercase.
	        remove_punct: Whether to remove punctuation.
	        pad_token: Padding token.
	        unk_token: Unknown token.
	    """

	    # NOTE: field order is part of the generated ``__init__`` signature;
	    # do not reorder.
	    max_seq_len: int = 300
	    lowercase: bool = True
	    remove_punct: bool = False
	    pad_token: str = "<PAD>"
	    unk_token: str = "<UNK>"