samith-a committed on
Commit 8a813f7 · 1 Parent(s): 14dbea3

unsloth no gpu error fix
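
The fix boils down to a guarded import: on hardware without a GPU, importing unsloth fails (it raises NotImplementedError when CUDA is unavailable), so the import is wrapped and a flag decides whether to load through Unsloth or fall back to plain Transformers + PEFT on CPU. A minimal sketch of that pattern, for illustration only — the actual change is in the diff below, and USE_UNSLOTH is just an illustrative name (app.py checks HAS_UNSLOTH together with self.device):

import torch

# Guarded import: unsloth raises NotImplementedError on machines without a CUDA GPU.
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (NotImplementedError, ImportError):
    HAS_UNSLOTH = False

# Take the Unsloth 4-bit path only when both the library and a GPU are available.
USE_UNSLOTH = HAS_UNSLOTH and torch.cuda.is_available()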

Files changed (1)
  1. app.py +63 -30
app.py CHANGED
@@ -2,17 +2,25 @@
 
 import gradio as gr
 import torch
-from unsloth import FastLanguageModel
 from peft import PeftModel
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Try to import Unsloth; if it fails, we’ll fallback
+try:
+    from unsloth import FastLanguageModel
+    HAS_UNSLOTH = True
+except NotImplementedError:
+    HAS_UNSLOTH = False
+except ImportError:
+    HAS_UNSLOTH = False
 
 class ModelManager:
     _instance = None
-
+
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model, self.tokenizer = self.load_model()
-
+
     @classmethod
     def get_instance(cls):
         if cls._instance is None:
@@ -20,28 +28,50 @@ class ModelManager:
         return cls._instance
 
     def load_model(self):
-        # Load base model
-        backbone, tokenizer = FastLanguageModel.from_pretrained(
-            "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
-            load_in_4bit=True,
-            dtype=torch.float16,
-            device_map=self.device,
-        )
+        if HAS_UNSLOTH and self.device != "cpu":
+            # GPU via Unsloth + LoRA
+            backbone, tokenizer = FastLanguageModel.from_pretrained(
+                "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
+                load_in_4bit=True,
+                dtype=torch.float16,
+                device_map="auto",
+            )
+            try:
+                model = PeftModel.from_pretrained(
+                    backbone,
+                    "samith-a/Django-orm-code-gen",
+                    torch_dtype=torch.float16,
+                    device_map="auto",
+                )
+                print("Loaded LoRA adapter via Unsloth.")
+            except Exception as e:
+                print(f"❗ Adapter load failed, using backbone only: {e}")
+                model = backbone
+            FastLanguageModel.for_inference(model)
+            return model, tokenizer
 
-        # Load your fine-tuned adapter
+        # --- Fallback: CPU-only via HF Transformers + PEFT ---
+        print("Falling back to CPU-only Transformers + PEFT")
+        base_name = "unsloth/Llama-3.2-1B-Instruct"  # non-4bit to run on CPU
+        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
+        base = AutoModelForCausalLM.from_pretrained(
+            base_name,
+            device_map={"": "cpu"},
+            torch_dtype=torch.float32,
+        )
         try:
             model = PeftModel.from_pretrained(
-                backbone,
+                base,
                 "samith-a/Django-orm-code-gen",
-                torch_dtype=torch.float16,
-                device_map=self.device,
+                device_map={"": "cpu"},
+                torch_dtype=torch.float32,
             )
-            print("Adapter weights loaded successfully")
+            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
-            print(f"Error loading adapter: {e}")
-            model = backbone
+            print(f"Adapter load failed, using base model: {e}")
+            model = base
 
-        FastLanguageModel.for_inference(model)
+        model.eval()
         return model, tokenizer
 
     def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
@@ -51,30 +81,33 @@ class ModelManager:
             "### Response:\n"
         )
         prompt = alpaca_template.format(instruction, input_text)
-
-        encoded = self.tokenizer([prompt], return_tensors="pt").to(self.device)
-        outputs = self.model.generate(**encoded, max_new_tokens=max_new_tokens)
-
+
+        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
+        outputs = self.model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=0.7
+        )
         raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         return raw.split("### Response:")[-1].strip()
 
-# Instantiate once
+# Initialize once
 manager = ModelManager.get_instance()
 
-def predict(instruction, context, max_tokens=128):
+def predict(instruction, context, max_tokens):
     return manager.generate(instruction, context, max_new_tokens=int(max_tokens))
 
-# Gradio UI / API
 demo = gr.Interface(
     fn=predict,
     inputs=[
-        gr.Textbox(lines=2, label="Instruction", placeholder="Describe what you want…"),
-        gr.Textbox(lines=5, label="Input (code/context)", placeholder="Optional context…"),
-        gr.Slider(minimum=16, maximum=512, step=16, label="Max new tokens", value=128),
+        gr.Textbox(lines=2, label="Instruction"),
+        gr.Textbox(lines=5, label="Context / Code"),
+        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
     ],
     outputs=gr.Textbox(label="Generated Code"),
     title="Django-ORM Code Generator",
-    description="Ask the LoRA-finetuned LLaMA3.2 model to generate or modify Django ORM code.",
+    description="LoRA-finetuned LLaMA3.2 for Django ORM code (CPU/GPU fallback)."
 )
 
 if __name__ == "__main__":
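
A quick way to exercise the CPU fallback after this change is to call predict() directly on a machine without CUDA. This is a hypothetical smoke test, assuming the Space's app.py is importable from the working directory; the example instruction is made up:

# Hypothetical smoke test for the CPU fallback (run where torch.cuda.is_available() is False).
from app import predict  # importing app builds the ModelManager singleton

result = predict(
    "Write a Django ORM query that returns all users created in the last 7 days.",
    "",    # optional code/context
    128,   # max new tokens
)
print(result)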