import os
import faiss
import torch
import threading
import gradio as gr
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
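# Assumed dependencies (not pinned in this Space): torch, transformers,
# sentence-transformers, faiss-cpu (or faiss-gpu), gradio, python-docx.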
# === Configuration ===
MODEL_ID = "microsoft/phi-2"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SYSTEM_PROMPT = """You are a friendly café assistant. Help customers place orders, check ingredients, and provide warm service."""
# === Load LLM ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
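# phi-2's tokenizer ships without a chat template, so apply_chat_template below
# would fail. This minimal role-prefix template is an assumption, not phi-2's
# native prompt format; swap in your own if you have one.
if tokenizer.chat_template is None:
    tokenizer.chat_template = (
        "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
        "{% if add_generation_prompt %}assistant:{% endif %}"
    )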
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(DEVICE)
# === Load Embedder ===
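# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings.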
embedder = SentenceTransformer(EMBED_MODEL)
# === Load Menu Text ===
def load_menu(docx_path):
    """Extract non-empty paragraphs from the Word menu as retrieval chunks."""
    doc = Document(docx_path)
    return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
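# Fail fast if the menu document is missing; the script assumes menu.docx
# sits alongside this file.
if not os.path.exists("menu.docx"):
    raise FileNotFoundError("menu.docx not found next to this script.")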
menu_chunks = load_menu("menu.docx")
# FAISS expects float32 numpy arrays; encode straight to numpy.
chunk_embeddings = embedder.encode(menu_chunks, convert_to_numpy=True).astype("float32")
# === Build FAISS Index ===
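# IndexFlatL2 performs exact (brute-force) L2 search: no training step needed,
# and plenty fast for a menu-sized corpus.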
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)
# === Retrieval ===
def retrieve_context_faiss(query, top_k=3):
    """Return the top_k menu chunks closest to the query in embedding space."""
    query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(query_vec, top_k)
    return "\n".join(menu_chunks[i] for i in indices[0])
# === Generate LLM Response ===
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    # Ground the reply: fetch the menu chunks most relevant to this message.
    context = retrieve_context_faiss(message)
    messages = [{"role": "system", "content": system_message}]
    for user, bot in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": bot})
    # Append the retrieved context to the latest user turn.
    messages.append({"role": "user", "content": f"{message}\n\nRelevant info:\n{context}"})
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # phi-2 has no pad token; silences the warning
    )
    # Run generation on a background thread so tokens stream to the UI as they arrive.
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
    output = ""
    for token in streamer:
        output += token
        yield output  # yield the growing reply for Gradio's streaming display
# === UI ===
demo = gr.ChatInterface(
    fn=generate_response,
    title="Café Eleven RAG Assistant",
    description="LLM + FAISS powered café chatbot with real-time Word document lookup.",
    examples=[
        ["Do you have vegetarian options?", SYSTEM_PROMPT, 512, 0.7, 0.9],
        ["What's in the turkey sandwich?", SYSTEM_PROMPT, 512, 0.7, 0.9],
    ],
    additional_inputs=[
        gr.Textbox(value=SYSTEM_PROMPT, label="System Prompt"),
        gr.Slider(1, 1024, 512, label="Max Tokens"),
        gr.Slider(0.1, 2.0, 0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, 0.9, label="Top-p"),
    ],
)
if __name__ == "__main__":
    demo.launch(share=True)  # share=True also exposes a temporary public URL