samith-a committed on
Commit 8a813f7 · 1 Parent(s): 14dbea3

unsloth no gpu error fix
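
The fix boils down to a guarded import: on hardware without a GPU, importing unsloth fails (it raises NotImplementedError when CUDA is unavailable), so the import is wrapped and a flag decides whether to load through Unsloth or fall back to plain Transformers + PEFT on CPU. A minimal sketch of that pattern, for illustration only — the actual change is in the diff below, and USE_UNSLOTH is just an illustrative name (app.py checks HAS_UNSLOTH together with self.device):

import torch

# Guarded import: unsloth raises NotImplementedError on machines without a CUDA GPU.
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (NotImplementedError, ImportError):
    HAS_UNSLOTH = False

# Take the Unsloth 4-bit path only when both the library and a GPU are available.
USE_UNSLOTH = HAS_UNSLOTH and torch.cuda.is_available()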

Files changed (1)
  1. app.py +63 -30
app.py CHANGED
@@ -2,17 +2,25 @@
 
 import gradio as gr
 import torch
-from unsloth import FastLanguageModel
 from peft import PeftModel
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Try to import Unsloth; if it fails, we’ll fallback
+try:
+    from unsloth import FastLanguageModel
+    HAS_UNSLOTH = True
+except NotImplementedError:
+    HAS_UNSLOTH = False
+except ImportError:
+    HAS_UNSLOTH = False
 
 class ModelManager:
     _instance = None
-
+
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model, self.tokenizer = self.load_model()
-
+
     @classmethod
     def get_instance(cls):
         if cls._instance is None:
@@ -20,28 +28,50 @@ class ModelManager:
         return cls._instance
 
     def load_model(self):
-        # Load base model
-        backbone, tokenizer = FastLanguageModel.from_pretrained(
-            "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
-            load_in_4bit=True,
-            dtype=torch.float16,
-            device_map=self.device,
-        )
+        if HAS_UNSLOTH and self.device != "cpu":
+            # GPU via Unsloth + LoRA
+            backbone, tokenizer = FastLanguageModel.from_pretrained(
+                "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
+                load_in_4bit=True,
+                dtype=torch.float16,
+                device_map="auto",
+            )
+            try:
+                model = PeftModel.from_pretrained(
+                    backbone,
+                    "samith-a/Django-orm-code-gen",
+                    torch_dtype=torch.float16,
+                    device_map="auto",
+                )
+                print("Loaded LoRA adapter via Unsloth.")
+            except Exception as e:
+                print(f"❗ Adapter load failed, using backbone only: {e}")
+                model = backbone
+            FastLanguageModel.for_inference(model)
+            return model, tokenizer
 
-        # Load your fine-tuned adapter
+        # --- Fallback: CPU-only via HF Transformers + PEFT ---
+        print("Falling back to CPU-only Transformers + PEFT")
+        base_name = "unsloth/Llama-3.2-1B-Instruct"  # non-4bit to run on CPU
+        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
+        base = AutoModelForCausalLM.from_pretrained(
+            base_name,
+            device_map={"": "cpu"},
+            torch_dtype=torch.float32,
+        )
         try:
             model = PeftModel.from_pretrained(
-                backbone,
+                base,
                 "samith-a/Django-orm-code-gen",
-                torch_dtype=torch.float16,
-                device_map=self.device,
+                device_map={"": "cpu"},
+                torch_dtype=torch.float32,
             )
-            print("Adapter weights loaded successfully")
+            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
-            print(f"Error loading adapter: {e}")
-            model = backbone
+            print(f"Adapter load failed, using base model: {e}")
+            model = base
 
-        FastLanguageModel.for_inference(model)
+        model.eval()
         return model, tokenizer
 
     def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
@@ -51,30 +81,33 @@ class ModelManager:
             "### Response:\n"
         )
         prompt = alpaca_template.format(instruction, input_text)
-
-        encoded = self.tokenizer([prompt], return_tensors="pt").to(self.device)
-        outputs = self.model.generate(**encoded, max_new_tokens=max_new_tokens)
-
+
+        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
+        outputs = self.model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=0.7
+        )
         raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         return raw.split("### Response:")[-1].strip()
 
-# Instantiate once
+# Initialize once
 manager = ModelManager.get_instance()
 
-def predict(instruction, context, max_tokens=128):
+def predict(instruction, context, max_tokens):
     return manager.generate(instruction, context, max_new_tokens=int(max_tokens))
 
-# Gradio UI / API
 demo = gr.Interface(
     fn=predict,
     inputs=[
-        gr.Textbox(lines=2, label="Instruction", placeholder="Describe what you want…"),
-        gr.Textbox(lines=5, label="Input (code/context)", placeholder="Optional context…"),
-        gr.Slider(minimum=16, maximum=512, step=16, label="Max new tokens", value=128),
+        gr.Textbox(lines=2, label="Instruction"),
+        gr.Textbox(lines=5, label="Context / Code"),
+        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
     ],
     outputs=gr.Textbox(label="Generated Code"),
     title="Django-ORM Code Generator",
-    description="Ask the LoRA-finetuned LLaMA3.2 model to generate or modify Django ORM code.",
+    description="LoRA-finetuned LLaMA3.2 for Django ORM code (CPU/GPU fallback)."
 )
 
 if __name__ == "__main__":
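
A quick way to exercise the CPU fallback after this change is to call predict() directly on a machine without CUDA. This is a hypothetical smoke test, assuming the Space's app.py is importable from the working directory; the example instruction is made up:

# Hypothetical smoke test for the CPU fallback (run where torch.cuda.is_available() is False).
from app import predict  # importing app builds the ModelManager singleton

result = predict(
    "Write a Django ORM query that returns all users created in the last 7 days.",
    "",    # optional code/context
    128,   # max new tokens
)
print(result)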