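"""SmartManuals-AI: ask questions about local PDF/DOCX manuals via RAG.

Indexes every document in ./Manuals into a persistent ChromaDB collection
(sentence-chunked, embedded with all-MiniLM-L6-v2), retrieves the top
matching chunks for a question, and answers with a Hugging Face
instruction-tuned model behind a Gradio UI.
"""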
import os

import chromadb
import docx  # python-docx
import fitz  # PyMuPDF
import gradio as gr
import nltk
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# --- Ensure NLTK sentence tokenizer data is available ---
# Newer NLTK releases look for "punkt_tab" in addition to "punkt".
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

# --- Configuration ---
MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
MODEL_OPTIONS = {
    "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma 7B": "google/gemma-1.1-7b-it"
}
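# Gated checkpoints (e.g. the Llama 3.1 models) require a Hugging Face token
# from an account that has been granted access; set it via the HF_TOKEN env var.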
HF_TOKEN = os.environ.get("HF_TOKEN")
MAX_CONTEXT_CHUNKS = 3

# --- Utility Functions ---
def extract_text_from_pdf(path):
    """Return the concatenated text of all pages, or "" if the PDF can't be read."""
    try:
        doc = fitz.open(path)
        return "\n".join(page.get_text().strip() for page in doc)
    except Exception as e:
        print(f"[PDF Error] {path}: {e}")
        return ""

def extract_text_from_docx(path):
    """Return the concatenated paragraph text, or "" if the document can't be read."""
    try:
        doc = docx.Document(path)
        return "\n".join(para.text.strip() for para in doc.paragraphs)
    except Exception as e:
        print(f"[DOCX Error] {path}: {e}")
        return ""

def clean(text):
    """Drop blank lines and trim whitespace from each line."""
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())

def split_sentences(text):
    try:
        return sent_tokenize(text)
    except Exception as e:
        print(f"[Tokenizer Error] {e}. Falling back to simple split.")
        return text.split(". ")

def chunk_sentences(sentences, max_tokens=500, overlap=50):
    """Pack sentences into chunks of ~max_tokens words, carrying ~overlap words forward.

    Word counts are a rough proxy for model tokens.
    """
    chunks = []
    current = []
    total = 0
    for sentence in sentences:
        count = len(sentence.split())
        if current and total + count > max_tokens:
            chunks.append(" ".join(current))
            # Carry trailing sentences (up to ~overlap words) into the next
            # chunk so context isn't cut mid-thought.
            carried, carried_words = [], 0
            for s in reversed(current):
                carried_words += len(s.split())
                if carried_words > overlap:
                    break
                carried.insert(0, s)
            current = carried
            total = sum(len(s.split()) for s in current)
        current.append(sentence)
        total += count
    if current:
        chunks.append(" ".join(current))
    return chunks
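# Example: chunk_sentences(["a b c d e", "f g h", "i j"], max_tokens=8, overlap=3)
# -> ["a b c d e f g h", "f g h i j"]; the "f g h" tail carries into the next chunk.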

def embed_all():
    """Rebuild the Chroma collection from every manual in MANUALS_FOLDER."""
    db = chromadb.PersistentClient(path=CHROMA_PATH)
    # delete_collection raises if the collection doesn't exist yet,
    # so wrap it rather than relying on list_collections() internals.
    try:
        db.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = db.create_collection(COLLECTION_NAME)

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    all_chunks = []

    os.makedirs(MANUALS_FOLDER, exist_ok=True)  # tolerate a missing folder on first run
    for fname in sorted(os.listdir(MANUALS_FOLDER)):
        path = os.path.join(MANUALS_FOLDER, fname)
        text = ""
        if fname.lower().endswith(".pdf"):
            text = extract_text_from_pdf(path)
        elif fname.lower().endswith(".docx"):
            text = extract_text_from_docx(path)
        else:
            continue

        sents = split_sentences(clean(text))
        chunks = chunk_sentences(sents)
        for idx, chunk in enumerate(chunks):
            chunk_id = f"{fname}::chunk_{idx}"
            all_chunks.append({"id": chunk_id, "text": chunk, "metadata": {"source": fname}})

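    # Embed and insert in batches of 16 to keep memory use bounded.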
    for i in range(0, len(all_chunks), 16):
        batch = all_chunks[i:i+16]
        docs = [x["text"] for x in batch]
        ids = [x["id"] for x in batch]
        metas = [x["metadata"] for x in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)

    return collection, embedder

# Build the index once and reuse it for every query instead of
# re-embedding all manuals on each question.
_collection = None
_embedder = None

def get_index():
    global _collection, _embedder
    if _collection is None:
        _collection, _embedder = embed_all()
    return _collection, _embedder

def answer_query(query, model_choice):
    collection, embedder = get_index()
    # Embed the query with the same model used at indexing time.
    query_emb = embedder.encode([query]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=MAX_CONTEXT_CHUNKS)

    context = "\n\n".join(results["documents"][0])
    model_id = MODEL_OPTIONS.get(model_choice)

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=HF_TOKEN,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,
    )

    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    out = pipe(prompt, max_new_tokens=300, do_sample=False)
    # Return only the completion after the final "Answer:" marker.
    return out[0]["generated_text"].split("Answer:")[-1].strip()

# --- UI ---
with gr.Blocks() as demo:
    gr.Markdown("""# 📘 SmartManuals-AI
    Ask technical questions about your manuals (PDF & DOCX) with a local LLM + RAG.
    """)

    with gr.Row():
        question = gr.Textbox(label="Your Question", placeholder="e.g., How do I reset the console?")
        model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="LLaMA 3.1 8B", label="Model")
    answer = gr.Textbox(label="Answer")
    submit = gr.Button("Ask")
    submit.click(fn=answer_query, inputs=[question, model_choice], outputs=answer)

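# Serve the UI; Gradio binds to http://127.0.0.1:7860 by default.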
if __name__ == "__main__":
    demo.launch()