MalikAyaanAhmed1123 committed on
Commit
6980a29
·
verified ·
1 Parent(s): be7af3d

Create dataset_utils.py

Browse files
Files changed (1) hide show
  1. dataset_utils.py +24 -10
dataset_utils.py CHANGED
@@ -1,18 +1,32 @@
1
  import json
 
2
  from torch.utils.data import Dataset
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  class MathDataset(Dataset):
5
- def __init__(self, filepath):
6
  self.data = []
7
- if filepath.endswith(".json"):
8
- with open(filepath, "r", encoding="utf-8") as f:
9
- self.data = json.load(f)
10
- elif filepath.endswith(".txt"):
11
- with open(filepath, "r", encoding="utf-8") as f:
12
- lines = f.readlines()
13
- for line in lines:
14
- self.data.append({"input": line.strip(), "output": line.strip()})
15
-
16
    def __len__(self):
        # Number of examples loaded in __init__.
        return len(self.data)
18
 
 
1
  import json
2
+ import torch
3
  from torch.utils.data import Dataset
4
 
5
class CharTokenizer:
    """Character-level tokenizer built from a corpus of strings.

    The vocabulary is the sorted set of every character appearing in the
    corpus; each character maps to its index in that sorted order.
    """

    def __init__(self, text_list):
        """Build the char<->index tables from *text_list* (iterable of str)."""
        # sorted() accepts any iterable — the extra list() was redundant.
        chars = sorted(set("".join(text_list)))
        self.char2idx = {c: i for i, c in enumerate(chars)}
        self.idx2char = {i: c for i, c in enumerate(chars)}
        self.vocab_size = len(chars)

    def encode(self, text):
        """Return the list of vocabulary indices for *text*.

        Raises:
            KeyError: if *text* contains a character not in the vocabulary.
        """
        return [self.char2idx[c] for c in text]

    def decode(self, indices):
        """Return the string spelled by the given vocabulary *indices*."""
        # join() consumes a generator directly; no intermediate list needed.
        return "".join(self.idx2char[i] for i in indices)
17
+
18
  class MathDataset(Dataset):
19
+ def __init__(self, file_path, tokenizer):
20
  self.data = []
21
+ self.tokenizer = tokenizer
22
+ with open(file_path, "r", encoding="utf-8") as f:
23
+ dataset = json.load(f)
24
+ for item in dataset:
25
+ self.data.append({
26
+ "input": torch.tensor(tokenizer.encode(item['input']), dtype=torch.long),
27
+ "output": torch.tensor(tokenizer.encode(item['output']), dtype=torch.long)
28
+ })
29
+
30
    def __len__(self):
        # Number of pre-encoded examples built in __init__.
        return len(self.data)
32