# Cafe-Chatbot / app.py
# Café Eleven RAG assistant: phi-2 + FAISS over a Word-document menu, served via Gradio.
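# Dependencies (assumed from the imports below): torch, transformers,
# sentence-transformers, faiss-cpu (or faiss-gpu), python-docx, gradio.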
import os
import faiss
import torch
import threading
import gradio as gr
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
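
# Pipeline overview: load menu.docx -> embed its paragraphs with a sentence
# transformer -> index them in FAISS -> retrieve the closest chunks for each
# query -> feed them to phi-2 -> stream the reply into a Gradio chat UI.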
# === Configuration ===
MODEL_ID = "microsoft/phi-2"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SYSTEM_PROMPT = """You are a friendly café assistant. Help customers place orders, check ingredients, and provide warm service."""
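# The prompt above is only the default; the UI exposes it as an editable
# "System Prompt" textbox via additional_inputs below.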
# === Load LLM ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(DEVICE)
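# NOTE (optional): on a GPU you could pass torch_dtype=torch.float16 to
# from_pretrained to roughly halve memory use; kept full precision here.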
# === Load Embedder ===
embedder = SentenceTransformer(EMBED_MODEL)
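# all-MiniLM-L6-v2 yields 384-dimensional embeddings; the FAISS index
# dimension below is read from the array shape rather than hard-coded.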
# === Load Menu Text ===
def load_menu(docx_path):
    """Read a .docx file and return its non-empty paragraphs as text chunks."""
    doc = Document(docx_path)
    return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
menu_chunks = load_menu("menu.docx")
chunk_embeddings = embedder.encode(menu_chunks).astype("float32")  # FAISS expects float32 arrays
# === Build FAISS Index ===
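# IndexFlatL2 performs exact (brute-force) L2 search; fine for a menu-sized corpus.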
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)
# === Retrieval ===
def retrieve_context_faiss(query, top_k=3):
    """Embed the query and return the top_k nearest menu chunks joined as context."""
    query_vec = embedder.encode([query]).astype("float32")
    distances, indices = index.search(query_vec, top_k)
    return "\n".join(menu_chunks[i] for i in indices[0])
# === Generate LLM Response ===
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    # Retrieve menu context for the current question.
    context = retrieve_context_faiss(message)

    # Rebuild the conversation; `history` is assumed to be Gradio's
    # tuple-style [(user, bot), ...] format.
    messages = [{"role": "system", "content": system_message}]
    for user, bot in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": f"{message}\n\nRelevant info:\n{context}"})

    # phi-2 is a base model whose tokenizer may not ship a chat template;
    # fall back to a plain role-prefixed prompt if none is defined.
    if tokenizer.chat_template:
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    else:
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    # Generate on a background thread and stream tokens so the UI updates live.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # phi-2 defines no pad token
    )
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    output = ""
    for token in streamer:
        output += token
        yield output  # yield the running text so Gradio renders it incrementally
# === UI ===
demo = gr.ChatInterface(
    fn=generate_response,
    title="Café Eleven RAG Assistant",
    description="LLM + FAISS powered café chatbot with real-time Word document lookup.",
    examples=[
        ["Do you have vegetarian options?", SYSTEM_PROMPT, 512, 0.7, 0.9],
        ["What's in the turkey sandwich?", SYSTEM_PROMPT, 512, 0.7, 0.9],
    ],
    additional_inputs=[
        gr.Textbox(value=SYSTEM_PROMPT, label="System Prompt"),
        gr.Slider(1, 1024, value=512, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, label="Top-p"),
    ],
)
if __name__ == "__main__":
    demo.launch(share=True)
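    # NOTE: share=True requests a temporary public Gradio link in addition to
    # the local server; remove it for local-only use.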