chbsaikiran committed
Commit 561a912 · Parent: 3f819c5

Trained a new model, as the old model was not performing well.

Files changed (2):
  1. app.py (+19 -8)
  2. model.py (+31 -23)
app.py CHANGED
@@ -10,13 +10,24 @@ if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"
 
 # Initialize model with reduced parameters (135M config)
-model = LlamaForCausalLM(
-    vocab_size=tokenizer.vocab_size,
-    dim=576,
-    num_layers=30,
-    hidden_dim=1536,
-    num_heads=9
-)
+class Config:
+    pass
+
+config = Config()
+config.vocab_size = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer").vocab_size
+config.num_layers = 30
+config.hidden_size = 576
+config.num_attention_heads = 8
+config.rms_norm_eps = 1.0e-05
+config.max_position_embeddings = 2048
+config.rope_theta = 500000.0
+config.hidden_act = False
+config.intermediate_size = 1536
+config.rope_interleaved = False
+#config.rope_scaling = null
+config.rope_theta = 10000.0
+
+model = LlamaForCausalLM(config)
 device = "cpu"
 model_id = "chbsaikiran/smollm2_135M_model"
 checkpoint_path = hf_hub_download(repo_id=model_id, filename="model_bin.pt")
@@ -60,7 +71,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Textbox(label="Generated Text", lines=5),
     title="SmolLM2 Demo",
-    description="A 135M parameter language model trained on smollm-corpus"
+    description="A 135M parameter language model trained on Shakespeare's text"
 )
 
 if __name__ == "__main__":
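
As a quick sanity check of the new initialization path, the Config-based constructor can be exercised without the Gradio app or the checkpoint download. Below is a minimal sketch, not part of the commit: it assumes model.py (shown next) is importable as `model`, sets only the config fields that LlamaForCausalLM reads in this diff, and assumes the rest of model.py runs as the shown forward paths suggest.

# Hypothetical smoke test (not part of the commit): build the config the way app.py does
# and inspect the parameter count and output shape of the freshly initialized model.
import torch
from transformers import AutoTokenizer
from model import LlamaForCausalLM

class Config:
    pass

config = Config()
config.vocab_size = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer").vocab_size
config.num_layers = 30
config.hidden_size = 576
config.intermediate_size = 1536
config.num_attention_heads = 8
config.max_position_embeddings = 2048

model = LlamaForCausalLM(config)
print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")

# A [batch, seq_len] tensor of token ids should come back as [batch, seq_len, vocab_size] logits.
tokens = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    logits = model(tokens)
print(logits.shape)

Note that the updated LlamaModel keeps the original embed_tokens embedding alongside the new wte/wpe tables, so the printed count includes both embedding matrices.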
model.py CHANGED
@@ -42,7 +42,7 @@ class LlamaMLP(nn.Module):
         return self.down_proj(self.act_fn(gated * hidden)) # apply the activation function to the gated and hidden values and then apply the down projection
 
 class LlamaAttention(nn.Module):
-    def __init__(self, dim, num_heads=8):
+    def __init__(self, dim, num_heads=8, max_seq_len=2048):
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
@@ -51,6 +51,7 @@ class LlamaAttention(nn.Module):
         self.k_proj = nn.Linear(dim, dim, bias=False)
         self.v_proj = nn.Linear(dim, dim, bias=False)
         self.o_proj = nn.Linear(dim, dim, bias=False)
+        self.register_buffer("bias", torch.tril(torch.ones(max_seq_len, max_seq_len)).view(1, 1, max_seq_len, max_seq_len))
 
     def forward(self, x):
         batch_size, seq_len, dim = x.size() # [batch_size, seq_len, dim] -> [4, 128, 576]
@@ -66,6 +67,7 @@ class LlamaAttention(nn.Module):
 
         # Scaled dot-product attention
         scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+        scores = scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
         attention = torch.softmax(scores, dim=-1)
         context = torch.matmul(attention, v)
 
@@ -74,9 +76,9 @@ class LlamaAttention(nn.Module):
         return self.o_proj(context)
 
 class LlamaDecoderLayer(nn.Module):
-    def __init__(self, dim, hidden_dim, num_heads):
+    def __init__(self, dim, hidden_dim, num_heads, max_position_embeddings):
         super().__init__()
-        self.self_attn = LlamaAttention(dim, num_heads)
+        self.self_attn = LlamaAttention(dim, num_heads, max_position_embeddings)
         self.mlp = LlamaMLP(dim, hidden_dim)
         self.input_layernorm = LlamaRMSNorm(dim)
         self.post_attention_layernorm = LlamaRMSNorm(dim)
@@ -95,40 +97,46 @@ class LlamaDecoderLayer(nn.Module):
 
 
 class LlamaModel(nn.Module):
-    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
+    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads, max_position_embeddings):
         super().__init__()
         self.embed_tokens = nn.Embedding(vocab_size, dim)
         self.layers = nn.ModuleList([
-            LlamaDecoderLayer(dim, hidden_dim, num_heads) for _ in range(num_layers)
+            LlamaDecoderLayer(dim, hidden_dim, num_heads, max_position_embeddings) for _ in range(num_layers)
         ])
         self.norm = LlamaRMSNorm(dim)
         self.rotary_emb = LlamaRotaryEmbedding(dim)
+        self.vocab_size = vocab_size
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.wte = nn.Embedding(self.vocab_size, self.dim)
+        self.wpe = nn.Embedding(self.max_position_embeddings, self.dim)
+
+    def forward(self, tokens):
+        B, T = tokens.size()
+        assert T <= self.max_position_embeddings, f"Cannot forward sequence of length {T}, block size is only {self.max_position_embeddings}"
+
+        pos = torch.arange(0, T, dtype=torch.long, device=tokens.device) # shape (T)
+        pos_emb = self.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.wte(tokens) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
 
-    def forward(self, x):
-        x = self.embed_tokens(x)
         for layer in self.layers:
            x = layer(x)
         return self.norm(x)
 
 class LlamaForCausalLM(nn.Module):
-    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
+    def __init__(self, config):
         super().__init__()
-        self.model = LlamaModel(vocab_size, dim, num_layers, hidden_dim, num_heads)
+        vocab_size = config.vocab_size
+        dim = config.hidden_size
+        num_layers = config.num_layers
+        hidden_dim = config.intermediate_size
+        num_heads = config.num_attention_heads
+        max_position_embeddings = config.max_position_embeddings
+
+        self.model = LlamaModel(vocab_size, dim, num_layers, hidden_dim, num_heads, max_position_embeddings)
         self.lm_head = nn.Linear(dim, vocab_size, bias=False)
 
     def forward(self, x):
         x = self.model(x)
-        return self.lm_head(x)
-
-def get_model(tokenizer):
-    vocab_size = tokenizer.vocab_size # Use actual tokenizer vocab size
-    return LlamaForCausalLM(
-        vocab_size=vocab_size,
-        dim=576,
-        num_layers=30,
-        hidden_dim=1536,
-        num_heads=8
-    )
-
-# model = get_model()
-# print(model)
+        return self.lm_head(x)
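
The main behavioral change in LlamaAttention is the causal mask: a lower-triangular buffer is registered once at max_seq_len and sliced to the current sequence length, so each position attends only to itself and earlier positions. The following is a minimal standalone sketch of the same masking pattern on toy scores; it is illustrative only and not part of the commit.

# Illustrative toy example of the tril-mask pattern added to LlamaAttention.
import torch

max_seq_len, seq_len = 8, 4
bias = torch.tril(torch.ones(max_seq_len, max_seq_len)).view(1, 1, max_seq_len, max_seq_len)

scores = torch.randn(1, 1, seq_len, seq_len)  # [batch, heads, query_pos, key_pos]
scores = scores.masked_fill(bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
attention = torch.softmax(scores, dim=-1)

# Each row sums to 1 and puts zero weight on future (upper-triangle) positions.
print(attention[0, 0])

Without this mask, the previous version of the attention could attend to future tokens; adding it makes the model properly autoregressive.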