Spaces:
Build error
Build error
| import json | |
| import torch | |
| from torch.utils.data import Dataset | |
class CharTokenizer:
    """Character-level tokenizer built from a corpus of strings.

    Each distinct character is assigned a stable integer id in sorted
    order, so the same corpus always produces the same vocabulary.
    """

    def __init__(self, text_list):
        """Build the vocabulary from every character in *text_list*.

        Args:
            text_list: iterable of strings; the union of their characters
                becomes the vocabulary.
        """
        # sorted() accepts any iterable — no intermediate list() needed.
        chars = sorted(set("".join(text_list)))
        self.char2idx = {c: i for i, c in enumerate(chars)}
        self.idx2char = {i: c for i, c in enumerate(chars)}
        self.vocab_size = len(chars)

    def encode(self, text):
        """Map a string to a list of integer ids.

        Raises:
            KeyError: if *text* contains a character not in the vocabulary.
        """
        return [self.char2idx[c] for c in text]

    def decode(self, indices):
        """Map a sequence of integer ids back to a string."""
        return "".join(self.idx2char[i] for i in indices)
class MathDataset(Dataset):
    """Map-style dataset of pre-tokenized (input, output) tensor pairs.

    Loads the whole JSON file eagerly and encodes every example up
    front, trading memory for a constant-time ``__getitem__``.
    """

    def __init__(self, file_path, tokenizer):
        """Load and tokenize the dataset.

        Args:
            file_path: path to a JSON file holding a list of objects,
                each with string fields ``"input"`` and ``"output"``.
            tokenizer: object exposing ``encode(str) -> list[int]``
                (e.g. a CharTokenizer built on the same corpus).
        """
        self.tokenizer = tokenizer
        with open(file_path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        # Pre-encode once so __getitem__ is a plain list lookup.
        self.data = [
            {
                "input": torch.tensor(tokenizer.encode(item['input']), dtype=torch.long),
                "output": torch.tensor(tokenizer.encode(item['output']), dtype=torch.long),
            }
            for item in raw
        ]

    def __len__(self):
        """Number of examples loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pre-built ``{"input", "output"}`` tensor pair at *idx*."""
        return self.data[idx]