Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -41,18 +41,21 @@ def retrieve_context_faiss(query, top_k=3):
|
|
41 |
distances, indices = index.search(query_vec, top_k)
|
42 |
return "\n".join([menu_chunks[i] for i in indices[0]])
|
43 |
|
|
|
44 |
# === Generate LLM Response ===
|
45 |
-
@GPU
|
46 |
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
|
47 |
context = retrieve_context_faiss(message)
|
|
|
48 |
messages = [{"role": "system", "content": system_message}]
|
49 |
-
for
|
50 |
-
messages.append({"role": "user", "content":
|
51 |
-
messages.append({"role": "assistant", "content":
|
|
|
52 |
messages.append({"role": "user", "content": f"{message}\n\nRelevant info:\n{context}"})
|
53 |
-
|
54 |
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
55 |
-
inputs = tokenizer(prompt, return_tensors="pt").to(
|
56 |
|
57 |
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
58 |
generate_kwargs = dict(
|
@@ -71,7 +74,8 @@ def generate_response(message, history, system_message, max_tokens, temperature,
|
|
71 |
for token in streamer:
|
72 |
output += token
|
73 |
yield output
|
74 |
-
|
|
|
75 |
# === UI ===
|
76 |
demo = gr.ChatInterface(
|
77 |
fn=generate_response,
|
|
|
41 |
distances, indices = index.search(query_vec, top_k)
|
42 |
return "\n".join([menu_chunks[i] for i in indices[0]])
|
43 |
|
44 |
+
|
45 |
# === Generate LLM Response ===
|
46 |
+
@spaces.GPU # Only if you're using ZeroGPU
|
47 |
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
|
48 |
context = retrieve_context_faiss(message)
|
49 |
+
|
50 |
messages = [{"role": "system", "content": system_message}]
|
51 |
+
for user_msg, bot_msg in history:
|
52 |
+
messages.append({"role": "user", "content": user_msg})
|
53 |
+
messages.append({"role": "assistant", "content": bot_msg})
|
54 |
+
|
55 |
messages.append({"role": "user", "content": f"{message}\n\nRelevant info:\n{context}"})
|
56 |
+
|
57 |
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
58 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
59 |
|
60 |
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
61 |
generate_kwargs = dict(
|
|
|
74 |
for token in streamer:
|
75 |
output += token
|
76 |
yield output
|
77 |
+
|
78 |
+
print("Inputs received:", message, history, system_message, max_tokens, temperature, top_p)
|
79 |
# === UI ===
|
80 |
demo = gr.ChatInterface(
|
81 |
fn=generate_response,
|