Math_gpt / dataset_utils.py
MalikAyaanAhmed1123's picture
Create dataset_utils.py
6980a29 verified
import json
import torch
from torch.utils.data import Dataset
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is derived from a corpus.

    Each distinct character of the corpus gets an integer id equal to its
    position in the sorted list of characters.
    """

    def __init__(self, text_list):
        """Build the vocabulary from an iterable of strings.

        Args:
            text_list: Iterable of strings. They are concatenated and the set
                of characters that occur becomes the vocabulary.
        """
        # sorted() accepts any iterable, so the set needs no list() wrapper.
        chars = sorted(set("".join(text_list)))
        # char2idx: character -> id, idx2char: id -> character.
        self.char2idx = {c: i for i, c in enumerate(chars)}
        self.idx2char = dict(enumerate(chars))
        self.vocab_size = len(chars)

    def encode(self, text):
        """Return the list of integer ids for the characters of *text*.

        Raises:
            KeyError: If *text* contains a character that is not part of the
                vocabulary built in the constructor.
        """
        try:
            return [self.char2idx[c] for c in text]
        except KeyError as err:
            # Same exception type as a plain dict miss, but say what went wrong.
            raise KeyError(
                f"character {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None

    def decode(self, indices):
        """Return the string spelled by the ids in *indices*.

        Args:
            indices: Iterable of ids. Every element is converted with
                ``int()``, so a 1-D integer tensor is accepted as well as a
                list of Python ints.

        Raises:
            KeyError: If an id is not part of the vocabulary.
        """
        # Iterating a tensor yields 0-d tensors, which do not match the int
        # keys of idx2char; normalise every element to a plain int first.
        return "".join([self.idx2char[int(i)] for i in indices])
class MathDataset(Dataset):
    """Dataset of tokenized input/output pairs read from a JSON file.

    The file is parsed once in the constructor. For every entry, the values
    under 'input' and 'output' are run through ``tokenizer.encode`` and kept
    in memory as ``torch.long`` tensors.
    """

    def __init__(self, file_path, tokenizer):
        """Load *file_path* and tokenize each entry with *tokenizer*.

        Args:
            file_path: Path of a UTF-8 JSON file. The parsed document is
                iterated and each entry is indexed by 'input' and 'output'.
            tokenizer: Object whose ``encode`` method returns a sequence that
                ``torch.tensor`` can turn into a ``torch.long`` tensor.
        """
        self.data = []
        self.tokenizer = tokenizer

        with open(file_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)

        def as_ids(value):
            # Encode one field and wrap the ids in a long tensor.
            return torch.tensor(tokenizer.encode(value), dtype=torch.long)

        self.data.extend(
            {"input": as_ids(record["input"]), "output": as_ids(record["output"])}
            for record in records
        )

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the dict with 'input' and 'output' tensors stored at *idx*."""
        return self.data[idx]