import faiss
import torch
import threading
import gradio as gr

from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# === Configuration ===
MODEL_ID = "microsoft/phi-2"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SYSTEM_PROMPT = """You are a friendly café assistant. Help customers place orders, check ingredients, and provide warm service."""

# === Load LLM ===
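# Note: phi-2 loads in float32 by default; passing torch_dtype=torch.float16
# to from_pretrained roughly halves GPU memory if that matters on your hardware.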
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(DEVICE)

# === Load Embedder ===
embedder = SentenceTransformer(EMBED_MODEL)

# === Load Menu Text ===
def load_menu(docx_path):
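    """Return every non-empty paragraph of the .docx file as one retrieval chunk."""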
    doc = Document(docx_path)
    return [p.text.strip() for p in doc.paragraphs if p.text.strip()]

menu_chunks = load_menu("menu.docx")
chunk_embeddings = embedder.encode(menu_chunks, convert_to_numpy=True)  # float32 numpy, as FAISS expects

# === Build FAISS Index ===
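# IndexFlatL2 is an exact brute-force L2 index: it needs no training step
# before add(), and it is easily fast enough for a corpus the size of a menu.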
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

# === Retrieval ===
def retrieve_context_faiss(query, top_k=3):
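    """Embed the query and return the top_k nearest menu chunks, joined into one context block."""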
    query_vec = embedder.encode([query]).astype("float32")
    distances, indices = index.search(query_vec, top_k)
    return "\n".join([menu_chunks[i] for i in indices[0]])

# === Generate LLM Response ===
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
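    # Classic RAG flow: retrieve menu chunks for this message, then append
    # them to the user turn so the model answers from the document.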
    context = retrieve_context_faiss(message)
    messages = [{"role": "system", "content": system_message}]
    # ChatInterface passes history as (user, assistant) pairs in its default
    # tuple format; replay them so the model sees the full conversation.
    for user, bot in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": f"{message}\n\nRelevant info:\n{context}"})

    # phi-2's tokenizer ships without a chat template, and recent transformers
    # versions raise if apply_chat_template is called without one, so fall
    # back to a plain role-prefixed prompt in that case.
    if tokenizer.chat_template:
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    else:
        prompt = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in messages)
        prompt += "\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # phi-2 defines no pad token of its own
    )

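    # model.generate() blocks, so run it on a worker thread and yield partial
    # text from the streamer as tokens arrive (the usual transformers pattern).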
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    output = ""
    for token in streamer:
        output += token
        yield output

# === UI ===
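# additional_inputs are passed to generate_response, in order, after
# (message, history); each examples row supplies the message plus matching
# values for those same inputs.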
demo = gr.ChatInterface(
    fn=generate_response,
    title="Café Eleven RAG Assistant",
    description="LLM + FAISS powered café chatbot with real-time Word document lookup.",
    examples=[
        ["Do you have vegetarian options?", SYSTEM_PROMPT, 512, 0.7, 0.9],
        ["What's in the turkey sandwich?", SYSTEM_PROMPT, 512, 0.7, 0.9],
    ],
    additional_inputs=[
        gr.Textbox(value=SYSTEM_PROMPT, label="System Prompt"),
        gr.Slider(1, 1024, 512, label="Max Tokens"),
        gr.Slider(0.1, 2.0, 0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, 0.9, label="Top-p"),
    ]
)

if __name__ == "__main__":
    demo.launch(share=True)