Update app.py
null and void committed on
app.py CHANGED
@@ -3,6 +3,12 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import time
 
+print(f"CUDA is available: {torch.cuda.is_available()}")
+print(f"CUDA device count: {torch.cuda.device_count()}")
+if torch.cuda.is_available():
+    print(f"Current CUDA device: {torch.cuda.current_device()}")
+    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
+
 class ConversationManager:
     def __init__(self):
         self.models = {}
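Note on the added diagnostics: they only report what torch can see at import time. A minimal sketch of folding the same checks into a reusable device picker (the `report_cuda` helper is illustrative and not part of app.py):

import torch

def report_cuda() -> torch.device:
    """Illustrative helper (not in app.py): print the same CUDA diagnostics
    the commit adds, then return a device to use downstream."""
    print(f"CUDA is available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        print(f"Current CUDA device: {torch.cuda.current_device()}")
        print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
        return torch.device("cuda")
    return torch.device("cpu")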
@@ -24,7 +30,12 @@ class ConversationManager:
         try:
             print(f"Attempting to load model: {model_name}")
             tokenizer = AutoTokenizer.from_pretrained(model_name)
-
+            try:
+                # Try to load the model with GPU support
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+            except RuntimeError as e:
+                print(f"GPU loading failed, falling back to CPU: {e}")
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
             self.models[model_name] = (model, tokenizer)
             print(f"Successfully loaded model: {model_name}")
             return self.models[model_name]
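The 8-bit path added here depends on bitsandbytes and accelerate being available in the Space, and recent transformers releases usually express the same request through BitsAndBytesConfig. A hedged sketch of an equivalent loader (the `load_causal_lm` name is illustrative; the CPU fallback below simply omits `device_map` rather than passing `device_map="cpu"`):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_causal_lm(model_name: str):
    """Illustrative loader mirroring the commit's GPU-then-CPU strategy.
    Assumes bitsandbytes and accelerate are installed for the 8-bit path."""
    try:
        quant = BitsAndBytesConfig(load_in_8bit=True)
        return AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",          # let accelerate place layers across GPU/CPU
            quantization_config=quant,  # newer spelling of load_in_8bit=True
        )
    except (ImportError, RuntimeError) as err:
        print(f"GPU/8-bit loading failed, falling back to CPU: {err}")
        # Default from_pretrained keeps the full-precision weights on CPU.
        return AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)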
@@ -36,28 +47,14 @@ class ConversationManager:
 
     def generate_response(self, model_name, prompt):
         model, tokenizer = self.load_model(model_name)
-
-
-
-        else:
-            formatted_prompt = self.format_general_prompt(prompt)
-
+
+        formatted_prompt = f"Human: {prompt.strip()}\n\nAssistant:"
+
         inputs = tokenizer(formatted_prompt, return_tensors="pt", max_length=1024, truncation=True)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_length=200, num_return_sequences=1, do_sample=True)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    def format_llama2_prompt(self, prompt):
-        B_INST, E_INST = "[INST]", "[/INST]"
-        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
-        system_prompt = "You are a helpful AI assistant. Please provide a concise and relevant response."
-
-        formatted_prompt = f"{B_INST} {B_SYS}{system_prompt}{E_SYS}{prompt.strip()} {E_INST}"
-        return formatted_prompt
-
-    def format_general_prompt(self, prompt):
-        return f"Human: {prompt.strip()}\n\nAssistant:"
-
     def add_to_conversation(self, model_name, response):
         self.conversation.append((model_name, response))
         if "task complete?" in response.lower():
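The simplified generate_response tokenizes on the CPU; once a model is loaded with `device_map="auto"`, the input tensors may need to be moved onto the model's device before calling generate. A standalone sketch under that assumption (not the commit's code; the explicit `.to(model.device)` step is the addition):

import torch

def generate_response(model, tokenizer, prompt: str) -> str:
    """Standalone sketch of the simplified generation path (not app.py itself).
    Moving inputs to model.device is an assumption; it matters once the model
    is placed on GPU via device_map="auto"."""
    formatted_prompt = f"Human: {prompt.strip()}\n\nAssistant:"
    inputs = tokenizer(formatted_prompt, return_tensors="pt",
                       max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=200,
                                 num_return_sequences=1, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)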
@@ -181,20 +178,21 @@ def rewind_and_insert(steps, inserted_response, history):
     return manager.get_conversation_history(), ""
 
 open_source_models = [
-    "meta-llama/Llama-2-7b-chat-hf",
-    "meta-llama/Llama-2-13b-chat-hf",
-    "meta-llama/Llama-2-70b-chat-hf",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "bigcode/starcoder2-15b",
     "bigcode/starcoder2-3b",
     "tiiuae/falcon-7b",
-    "tiiuae/falcon-40b",
     "EleutherAI/gpt-neox-20b",
     "google/flan-ul2",
     "stabilityai/stablelm-zephyr-3b",
     "HuggingFaceH4/zephyr-7b-beta",
     "microsoft/phi-2",
-    "google/gemma-7b-it"
+    "google/gemma-7b-it",
+    "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
+    "mosaicml/mpt-7b-chat",
+    "databricks/dolly-v2-12b",
+    "thebloke/Wizard-Vicuna-13B-Uncensored-HF",
+    "bigscience/bloom-560m"
 ]
 
 with gr.Blocks() as demo:
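Several of the remaining and newly added entries are large or behind access gates, and the lowercase `thebloke/...` id differs from the usual `TheBloke` namespace and may or may not resolve. An optional sanity check with huggingface_hub (the `check_models` helper is illustrative, not part of app.py) can confirm each id resolves before the UI offers it:

from huggingface_hub import model_info

def check_models(model_ids):
    """Optional sanity check (not in app.py): confirm each listed repo id
    actually resolves on the Hub before it is offered in the dropdown."""
    for repo_id in model_ids:
        try:
            model_info(repo_id)
            print(f"ok       {repo_id}")
        except Exception as err:  # missing repo, auth requirement, or network error
            print(f"skipped  {repo_id}: {err}")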
@@ -303,4 +301,4 @@ with gr.Blocks() as demo:
     delay_slider.change(lambda x: setattr(manager, 'delay', x), inputs=[delay_slider])
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()