null and void committed on
Commit 212c5a9 · verified · 1 Parent(s): abaffb5

Update app.py

Files changed (1): app.py +22 -24
app.py CHANGED
@@ -3,6 +3,12 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import time
 
+print(f"CUDA is available: {torch.cuda.is_available()}")
+print(f"CUDA device count: {torch.cuda.device_count()}")
+if torch.cuda.is_available():
+    print(f"Current CUDA device: {torch.cuda.current_device()}")
+    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
+
 class ConversationManager:
     def __init__(self):
         self.models = {}
@@ -24,7 +30,12 @@ class ConversationManager:
         try:
            print(f"Attempting to load model: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
-           model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+           try:
+               # Try to load the model with GPU support
+               model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+           except RuntimeError as e:
+               print(f"GPU loading failed, falling back to CPU: {e}")
+               model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
            self.models[model_name] = (model, tokenizer)
            print(f"Successfully loaded model: {model_name}")
            return self.models[model_name]
@@ -36,28 +47,14 @@ class ConversationManager:
 
     def generate_response(self, model_name, prompt):
         model, tokenizer = self.load_model(model_name)
-
-        if "llama" in model_name.lower():
-            formatted_prompt = self.format_llama2_prompt(prompt)
-        else:
-            formatted_prompt = self.format_general_prompt(prompt)
-
+
+        formatted_prompt = f"Human: {prompt.strip()}\n\nAssistant:"
+
         inputs = tokenizer(formatted_prompt, return_tensors="pt", max_length=1024, truncation=True)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_length=200, num_return_sequences=1, do_sample=True)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    def format_llama2_prompt(self, prompt):
-        B_INST, E_INST = "[INST]", "[/INST]"
-        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
-        system_prompt = "You are a helpful AI assistant. Please provide a concise and relevant response."
-
-        formatted_prompt = f"{B_INST} {B_SYS}{system_prompt}{E_SYS}{prompt.strip()} {E_INST}"
-        return formatted_prompt
-
-    def format_general_prompt(self, prompt):
-        return f"Human: {prompt.strip()}\n\nAssistant:"
-
     def add_to_conversation(self, model_name, response):
         self.conversation.append((model_name, response))
         if "task complete?" in response.lower():
@@ -181,20 +178,21 @@ def rewind_and_insert(steps, inserted_response, history):
     return manager.get_conversation_history(), ""
 
 open_source_models = [
-    "meta-llama/Llama-2-7b-chat-hf",
-    "meta-llama/Llama-2-13b-chat-hf",
-    "meta-llama/Llama-2-70b-chat-hf",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "bigcode/starcoder2-15b",
     "bigcode/starcoder2-3b",
     "tiiuae/falcon-7b",
-    "tiiuae/falcon-40b",
     "EleutherAI/gpt-neox-20b",
     "google/flan-ul2",
     "stabilityai/stablelm-zephyr-3b",
     "HuggingFaceH4/zephyr-7b-beta",
     "microsoft/phi-2",
-    "google/gemma-7b-it"
+    "google/gemma-7b-it",
+    "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
+    "mosaicml/mpt-7b-chat",
+    "databricks/dolly-v2-12b",
+    "thebloke/Wizard-Vicuna-13B-Uncensored-HF",
+    "bigscience/bloom-560m"
 ]
 
 with gr.Blocks() as demo:
@@ -303,4 +301,4 @@ with gr.Blocks() as demo:
     delay_slider.change(lambda x: setattr(manager, 'delay', x), inputs=[delay_slider])
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
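For reference, below is a minimal standalone sketch of the GPU-load-with-CPU-fallback pattern this commit introduces, outside the ConversationManager class. It assumes torch, transformers, and (for 8-bit loading) bitsandbytes are installed; "bigscience/bloom-560m" is taken from the open_source_models list above purely as a small test model, and catching ImportError alongside RuntimeError is an assumption beyond what the diff itself handles.

# Sketch of the fallback loading pattern added in this commit (assumptions noted above).
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "bigscience/bloom-560m"  # small model from the list above, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_name)

try:
    # 8-bit loading needs a CUDA device plus the bitsandbytes package;
    # it fails at load time when either is missing.
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", load_in_8bit=True
    )
except (RuntimeError, ImportError) as e:  # the commit itself catches only RuntimeError
    print(f"GPU loading failed, falling back to CPU: {e}")
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")

# Same prompt format and generation settings as generate_response in the diff.
prompt = "Human: Say hello in one sentence.\n\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=200, num_return_sequences=1, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))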