from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd
import torch
from datasets import load_dataset, load_from_disk
from sklearn.model_selection import train_test_split

import src.data_utils.dataset_params as dataset_params
from src.data_utils.config import DatasetConfig, TextProcessorConfig
from src.data_utils.text_processor import TextProcessor


class DatasetGenerator:
    """
    Main dataset generator class.

    Provides methods to load raw data, build a vocabulary, and convert
    text datasets into tensors suitable for deep learning models.

    Args:
        dataset_name: Name of the dataset, from the DatasetName enum
        config: Configuration object with preprocessing parameters
            (a fresh DatasetConfig is created if none is given)
        device: Torch device to place tensors on (cpu/cuda)
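
    Example (illustrative; assumes the DatasetName enum exposes a member
    such as AG_NEWS in this repository's dataset_params):
        generator = DatasetGenerator(
            dataset_name=dataset_params.DatasetName.AG_NEWS,
            config=DatasetConfig(),
        )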
    """

    def __init__(
        self,
        dataset_name: dataset_params.DatasetName,
        config: Optional[DatasetConfig] = None,
        device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    ):
        # Use a fresh DatasetConfig per instance rather than a shared default
        # instance evaluated once at import time.
        self.config = config if config is not None else DatasetConfig()
        self.dataset_params = dataset_params.get_dataset_params_by_name(dataset_name=dataset_name)
        self.device = device
        # The processor starts without a vocabulary; build_vocabulary() assigns one later.
        self.text_processor = TextProcessor(
            vocab=None,
            config=TextProcessorConfig(
                max_seq_len=self.config.max_seq_len,
                lowercase=self.config.lowercase,
                remove_punct=self.config.remove_punct,
                pad_token=self.config.pad_token,
                unk_token=self.config.unk_token,
            ),
        )
        self.vocab = None
        self.id2word = None
        self.embedding_layer = None

    def load_raw_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load the raw dataset from local disk or from the Hugging Face Hub.

        Returns:
            Tuple of (train_df, val_df, test_df) DataFrames
        """
        if self.config.load_from_disk:
            dataset = load_from_disk(f"{self.config.path_to_data}/{self.dataset_params.local_path}")
        else:
            dataset = load_dataset(self.dataset_params.hugging_face_name)

        train_df = pd.DataFrame(dataset["train"])
        test_df = pd.DataFrame(dataset["test"])

        # Split the original test set in half into validation and test sets,
        # stratified by label so class proportions are preserved.
        val_df, test_df = train_test_split(
            test_df,
            test_size=0.5,
            random_state=self.config.random_state,
            stratify=test_df[self.dataset_params.label_col_name],
        )

        # Subsample each split to the configured size with a fixed seed.
        train_df = train_df.sample(n=self.config.train_size, random_state=self.config.random_state)
        val_df = val_df.sample(n=self.config.val_size, random_state=self.config.random_state)
        test_df = test_df.sample(n=self.config.test_size, random_state=self.config.random_state)

        return train_df, val_df, test_df

    def build_vocabulary(self, tokenized_texts: List[List[str]]) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Build a vocabulary from tokenized texts.

        Args:
            tokenized_texts: List of token lists, one per document

        Returns:
            Tuple of (word_to_id, id_to_word) mappings
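
        Example (illustrative; assumes min_word_freq=1 and the default
        special tokens):
            build_vocabulary([["hello", "world"], ["hello"]])
            # -> ({pad: 0, unk: 1, "hello": 2, "world": 3},
            #     {0: pad, 1: unk, 2: "hello", 3: "world"})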
        """
        all_tokens = [token for tokens in tokenized_texts for token in tokens]
        word_counts = Counter(all_tokens)

        # Keep only words that appear at least min_word_freq times.
        filtered_words = [
            word for word, count in word_counts.items()
            if count >= self.config.min_word_freq
        ]

        # Reserve ids 0 and 1 for the padding and unknown tokens.
        word_to_id = {self.config.pad_token: 0, self.config.unk_token: 1}
        id_to_word = {0: self.config.pad_token, 1: self.config.unk_token}

        for idx, word in enumerate(filtered_words, start=2):
            word_to_id[word] = idx
            id_to_word[idx] = word

        # Share the vocabulary with the text processor so it can encode texts.
        self.text_processor.vocab = word_to_id

        return word_to_id, id_to_word

    def generate_dataset(self) -> Tuple[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor]
    ]:
        """
        Generate the full dataset as feature/label tensors.

        Returns:
            Tuple containing:
            - (train_features, train_labels)
            - (val_features, val_labels)
            - (test_features, test_labels)
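
        Example (illustrative):
            (X_train, y_train), (X_val, y_val), (X_test, y_test) = generator.generate_dataset()
            # X_* are expected to have shape (num_samples, max_seq_len);
            # y_* have shape (num_samples,)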
        """
        train_df, val_df, test_df = self.load_raw_data()

        train_texts = train_df[self.dataset_params.content_col_name].tolist()

        if self.config.build_vocab:
            # Build the vocabulary from the training split only, so words unseen
            # in training map to the unknown token at validation/test time.
            train_tokens = [self.text_processor.preprocess_text(text) for text in train_texts]
            self.vocab, self.id2word = self.build_vocabulary(train_tokens)

        X_train = torch.stack([self.text_processor.text_to_tensor(text) for text in train_texts])

        val_texts = val_df[self.dataset_params.content_col_name].tolist()
        X_val = torch.stack([self.text_processor.text_to_tensor(text) for text in val_texts])

        test_texts = test_df[self.dataset_params.content_col_name].tolist()
        X_test = torch.stack([self.text_processor.text_to_tensor(text) for text in test_texts])

        y_train = torch.tensor(train_df[self.dataset_params.label_col_name].values, dtype=torch.long)
        y_val = torch.tensor(val_df[self.dataset_params.label_col_name].values, dtype=torch.long)
        y_test = torch.tensor(test_df[self.dataset_params.label_col_name].values, dtype=torch.long)

        # Place all tensors on the configured device, as documented in the class docstring.
        return (
            (X_train.to(self.device), y_train.to(self.device)),
            (X_val.to(self.device), y_val.to(self.device)),
            (X_test.to(self.device), y_test.to(self.device)),
        )

    def get_vocabulary(self) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Get the vocabulary mappings.

        Returns:
            Tuple of (word_to_id, id_to_word) dictionaries; both are None
            until build_vocabulary() has been called
        """
        return self.vocab, self.id2word

    def get_config(self) -> DatasetConfig:
        """
        Get the current configuration.

        Returns:
            DatasetConfig object
        """
        return self.config

    def get_text_processor(self) -> TextProcessor:
        """
        Get the text processor for inference usage.

        Returns:
            TextProcessor object
        """
        return self.text_processor
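

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative; assumes DatasetName.AG_NEWS exists
    # in dataset_params and the default DatasetConfig points at a reachable dataset).
    generator = DatasetGenerator(dataset_name=dataset_params.DatasetName.AG_NEWS)
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = generator.generate_dataset()
    print(X_train.shape, y_train.shape, X_val.shape, X_test.shape)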