import json

import torch
from torch.utils.data import Dataset

from tokenizer import CharTokenizer


class MathDataset(Dataset):
    """Dataset of (question, solution) pairs loaded from a JSON file.

    Each item is the question and solution concatenated into one string,
    encoded with the tokenizer, and returned as a 1-D tensor of token ids.
    """

    def __init__(self, path, tokenizer=None, max_len=512):
        self.tokenizer = tokenizer or CharTokenizer()
        with open(path, "r", encoding="utf-8") as f:
            self.data = json.load(f)
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        q = item.get("question", "")
        s = item.get("solution", "")
        # Concatenate question and solution, then encode up to max_len tokens.
        text = q + " " + s
        ids = self.tokenizer.encode(text, self.max_len)
        return torch.tensor(ids)
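

# A minimal usage sketch, assuming the JSON file holds a list of objects with
# "question" and "solution" keys, and that CharTokenizer.encode(text, max_len)
# returns a list of integer token ids. The file path below is hypothetical.
#
#     from torch.utils.data import DataLoader
#     from torch.nn.utils.rnn import pad_sequence
#
#     dataset = MathDataset("data/train.json")
#     loader = DataLoader(
#         dataset,
#         batch_size=8,
#         shuffle=True,
#         # Pad variable-length sequences in each batch to a common length.
#         collate_fn=lambda batch: pad_sequence(batch, batch_first=True, padding_value=0),
#     )
#     for batch in loader:
#         print(batch.shape)  # (batch_size, longest_seq_len_in_batch)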