chbsaikiran committed
Commit 561a912 · Parent: 3f819c5

Trained a new model, as the old model was not performing well.

Files changed (2):
  1. app.py (+19 -8)
  2. model.py (+31 -23)
app.py CHANGED
@@ -10,13 +10,24 @@ if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"
 
 # Initialize model with reduced parameters (135M config)
-model = LlamaForCausalLM(
-    vocab_size=tokenizer.vocab_size,
-    dim=576,
-    num_layers=30,
-    hidden_dim=1536,
-    num_heads=9
-)
+class Config:
+    pass
+
+config = Config()
+config.vocab_size = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer").vocab_size
+config.num_layers = 30
+config.hidden_size = 576
+config.num_attention_heads = 8
+config.rms_norm_eps = 1.0e-05
+config.max_position_embeddings = 2048
+config.rope_theta = 500000.0
+config.hidden_act = False
+config.intermediate_size = 1536
+config.rope_interleaved = False
+#config.rope_scaling = null
+config.rope_theta = 10000.0
+
+model = LlamaForCausalLM(config)
 device = "cpu"
 model_id = "chbsaikiran/smollm2_135M_model"
 checkpoint_path = hf_hub_download(repo_id=model_id, filename="model_bin.pt")
@@ -60,7 +71,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Textbox(label="Generated Text", lines=5),
     title="SmolLM2 Demo",
-    description="A 135M parameter language model trained on smollm-corpus"
+    description="A 135M parameter language model trained on Shakespeare's text"
 )
 
 if __name__ == "__main__":
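
As a quick sanity check of the new initialization path, the Config-based constructor can be exercised without the Gradio app or the checkpoint download. Below is a minimal sketch, not part of the commit: it assumes model.py (shown next) is importable as `model`, sets only the config fields that LlamaForCausalLM reads in this diff, and assumes the rest of model.py runs as the shown forward paths suggest.

# Hypothetical smoke test (not part of the commit): build the config the way app.py does
# and inspect the parameter count and output shape of the freshly initialized model.
import torch
from transformers import AutoTokenizer
from model import LlamaForCausalLM

class Config:
    pass

config = Config()
config.vocab_size = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer").vocab_size
config.num_layers = 30
config.hidden_size = 576
config.intermediate_size = 1536
config.num_attention_heads = 8
config.max_position_embeddings = 2048

model = LlamaForCausalLM(config)
print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")

# A [batch, seq_len] tensor of token ids should come back as [batch, seq_len, vocab_size] logits.
tokens = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    logits = model(tokens)
print(logits.shape)

Note that the updated LlamaModel keeps the original embed_tokens embedding alongside the new wte/wpe tables, so the printed count includes both embedding matrices.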
model.py CHANGED
@@ -42,7 +42,7 @@ class LlamaMLP(nn.Module):
         return self.down_proj(self.act_fn(gated * hidden)) # apply the activation function to the gated and hidden values and then apply the down projection
 
 class LlamaAttention(nn.Module):
-    def __init__(self, dim, num_heads=8):
+    def __init__(self, dim, num_heads=8, max_seq_len=2048):
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
@@ -51,6 +51,7 @@ class LlamaAttention(nn.Module):
         self.k_proj = nn.Linear(dim, dim, bias=False)
         self.v_proj = nn.Linear(dim, dim, bias=False)
         self.o_proj = nn.Linear(dim, dim, bias=False)
+        self.register_buffer("bias", torch.tril(torch.ones(max_seq_len, max_seq_len)).view(1, 1, max_seq_len, max_seq_len))
 
     def forward(self, x):
         batch_size, seq_len, dim = x.size() # [batch_size, seq_len, dim] -> [4, 128, 576]
@@ -66,6 +67,7 @@ class LlamaAttention(nn.Module):
 
         # Scaled dot-product attention
         scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+        scores = scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
         attention = torch.softmax(scores, dim=-1)
         context = torch.matmul(attention, v)
 
@@ -74,9 +76,9 @@ class LlamaAttention(nn.Module):
         return self.o_proj(context)
 
 class LlamaDecoderLayer(nn.Module):
-    def __init__(self, dim, hidden_dim, num_heads):
+    def __init__(self, dim, hidden_dim, num_heads, max_position_embeddings):
         super().__init__()
-        self.self_attn = LlamaAttention(dim, num_heads)
+        self.self_attn = LlamaAttention(dim, num_heads, max_position_embeddings)
         self.mlp = LlamaMLP(dim, hidden_dim)
         self.input_layernorm = LlamaRMSNorm(dim)
         self.post_attention_layernorm = LlamaRMSNorm(dim)
@@ -95,40 +97,46 @@ class LlamaDecoderLayer(nn.Module):
 
 
 class LlamaModel(nn.Module):
-    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
+    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads, max_position_embeddings):
         super().__init__()
         self.embed_tokens = nn.Embedding(vocab_size, dim)
         self.layers = nn.ModuleList([
-            LlamaDecoderLayer(dim, hidden_dim, num_heads) for _ in range(num_layers)
+            LlamaDecoderLayer(dim, hidden_dim, num_heads, max_position_embeddings) for _ in range(num_layers)
         ])
         self.norm = LlamaRMSNorm(dim)
         self.rotary_emb = LlamaRotaryEmbedding(dim)
+        self.vocab_size = vocab_size
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.wte = nn.Embedding(self.vocab_size, self.dim)
+        self.wpe = nn.Embedding(self.max_position_embeddings, self.dim)
+
+    def forward(self, tokens):
+        B, T = tokens.size()
+        assert T <= self.max_position_embeddings, f"Cannot forward sequence of length {T}, block size is only {self.max_position_embeddings}"
+
+        pos = torch.arange(0, T, dtype=torch.long, device=tokens.device) # shape (T)
+        pos_emb = self.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.wte(tokens) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
 
-    def forward(self, x):
-        x = self.embed_tokens(x)
         for layer in self.layers:
            x = layer(x)
         return self.norm(x)
 
 class LlamaForCausalLM(nn.Module):
-    def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
+    def __init__(self, config):
         super().__init__()
-        self.model = LlamaModel(vocab_size, dim, num_layers, hidden_dim, num_heads)
+        vocab_size = config.vocab_size
+        dim = config.hidden_size
+        num_layers = config.num_layers
+        hidden_dim = config.intermediate_size
+        num_heads = config.num_attention_heads
+        max_position_embeddings = config.max_position_embeddings
+
+        self.model = LlamaModel(vocab_size, dim, num_layers, hidden_dim, num_heads, max_position_embeddings)
         self.lm_head = nn.Linear(dim, vocab_size, bias=False)
 
     def forward(self, x):
         x = self.model(x)
-        return self.lm_head(x)
-
-def get_model(tokenizer):
-    vocab_size = tokenizer.vocab_size # Use actual tokenizer vocab size
-    return LlamaForCausalLM(
-        vocab_size=vocab_size,
-        dim=576,
-        num_layers=30,
-        hidden_dim=1536,
-        num_heads=8
-    )
-
-# model = get_model()
-# print(model)
+        return self.lm_head(x)
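
The main behavioral change in LlamaAttention is the causal mask: a lower-triangular buffer is registered once at max_seq_len and sliced to the current sequence length, so each position attends only to itself and earlier positions. The following is a minimal standalone sketch of the same masking pattern on toy scores; it is illustrative only and not part of the commit.

# Illustrative toy example of the tril-mask pattern added to LlamaAttention.
import torch

max_seq_len, seq_len = 8, 4
bias = torch.tril(torch.ones(max_seq_len, max_seq_len)).view(1, 1, max_seq_len, max_seq_len)

scores = torch.randn(1, 1, seq_len, seq_len)  # [batch, heads, query_pos, key_pos]
scores = scores.masked_fill(bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
attention = torch.softmax(scores, dim=-1)

# Each row sums to 1 and puts zero weight on future (upper-triangle) positions.
print(attention[0, 0])

Without this mask, the previous version of the attention could attend to future tokens; adding it makes the model properly autoregressive.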