import os
import json
import fitz  # PyMuPDF
import docx
import chromadb
import torch
import nltk
import gradio as gr
from tqdm import tqdm
from typing import List
from PIL import Image
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
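# NOTE: json, tqdm, List, Image, and util are imported above but not used in this
# file; they appear to be reserved for later features (e.g. OCR of scanned pages).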
# --- Ensure punkt tokenizer is available ---
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
# --- Configuration ---
MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
MODEL_OPTIONS = {
"LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
"Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
"Gemma 7B": "google/gemma-1.1-7b-it"
}
HF_TOKEN = os.environ.get("HF_TOKEN")
MAX_CONTEXT_CHUNKS = 3
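# Pipeline overview: manuals in MANUALS_FOLDER are extracted to text, split into
# overlapping sentence chunks, embedded with a MiniLM sentence-transformer into a
# persistent Chroma collection, and the top MAX_CONTEXT_CHUNKS matches are placed
# in the prompt of the selected instruction-tuned model.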
# --- Utility Functions ---
def extract_text_from_pdf(path):
    """Extract plain text from a PDF with PyMuPDF; return "" if the file cannot be read."""
    try:
        doc = fitz.open(path)
        return "\n".join([page.get_text().strip() for page in doc])
    except Exception:
        return ""
def extract_text_from_docx(path):
    """Extract paragraph text from a .docx file; return "" if the file cannot be read."""
    try:
        doc = docx.Document(path)
        return "\n".join([para.text.strip() for para in doc.paragraphs])
    except Exception:
        return ""
def clean(text):
    """Strip each line and drop blank lines."""
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
def split_sentences(text):
    """Sentence-tokenize with NLTK, falling back to a naive split if punkt is unavailable."""
    try:
        return sent_tokenize(text)
    except Exception as e:
        print(f"[Tokenizer Error] {e}. Falling back to simple split.")
        return text.split(". ")
def chunk_sentences(sentences, max_tokens=500, overlap=50):
    """Group sentences into chunks of roughly `max_tokens` words, carrying the last
    `overlap` sentences into the next chunk so adjacent chunks share context."""
    chunks = []
    current = []
    total = 0
    for sentence in sentences:
        count = len(sentence.split())
        if total + count > max_tokens:
            chunks.append(" ".join(current))
            current = current[-overlap:]
            total = sum(len(s.split()) for s in current)
        current.append(sentence)
        total += count
    if current:
        chunks.append(" ".join(current))
    return chunks
def embed_all():
    """Rebuild the Chroma collection from every PDF/DOCX manual and return it together
    with the sentence-transformer used for embedding."""
    db = chromadb.PersistentClient(path=CHROMA_PATH)
    if COLLECTION_NAME in [c.name for c in db.list_collections()]:
        db.delete_collection(COLLECTION_NAME)
    collection = db.create_collection(COLLECTION_NAME)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    all_chunks = []
    os.makedirs(MANUALS_FOLDER, exist_ok=True)  # avoid a crash if the folder is missing
    for fname in os.listdir(MANUALS_FOLDER):
        path = os.path.join(MANUALS_FOLDER, fname)
        if fname.lower().endswith(".pdf"):
            text = extract_text_from_pdf(path)
        elif fname.lower().endswith(".docx"):
            text = extract_text_from_docx(path)
        else:
            continue
        sents = split_sentences(clean(text))
        chunks = chunk_sentences(sents)
        for idx, chunk in enumerate(chunks):
            chunk_id = f"{fname}::chunk_{idx}"
            all_chunks.append({"id": chunk_id, "text": chunk, "metadata": {"source": fname}})
    # Embed and add to Chroma in small batches to keep memory use bounded.
    for i in range(0, len(all_chunks), 16):
        batch = all_chunks[i:i + 16]
        docs = [x["text"] for x in batch]
        ids = [x["id"] for x in batch]
        metas = [x["metadata"] for x in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
    return collection, embedder
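# NOTE: embed_all() is invoked from answer_query() below, so the index is rebuilt for
# every question. A sketch of the obvious optimization (assuming the manuals do not
# change at runtime) is to build the collection once at startup, e.g.:
#
#     COLLECTION, EMBEDDER = embed_all()   # hypothetical module-level cache
#
# and have answer_query() reuse COLLECTION and EMBEDDER instead of calling embed_all().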
def answer_query(query, model_choice):
    """Retrieve the most relevant manual chunks for `query` and answer with the chosen model."""
    # NOTE: this rebuilds the whole index on every question (see the note below embed_all()).
    collection, embedder = embed_all()
    # Encode the query with the same embedder used at indexing time so the vector spaces match.
    query_emb = embedder.encode([query]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=MAX_CONTEXT_CHUNKS)
    context = "\n\n".join(results["documents"][0])
    model_id = MODEL_OPTIONS.get(model_choice)
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    # float16 plus device_map="auto" (requires `accelerate`) keeps 7B-8B models loadable.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=HF_TOKEN, torch_dtype=torch.float16, device_map="auto"
    )
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    prompt = f"""
Context:
{context}

Question: {query}
Answer:"""
    out = pipe(prompt, max_new_tokens=300, do_sample=False)
    return out[0]["generated_text"].split("Answer:")[-1].strip()
# --- UI ---
with gr.Blocks() as demo:
    gr.Markdown(
        """# 📘 SmartManuals-AI
Ask technical questions about your manuals (PDF & DOCX) using retrieval-augmented generation with an LLM."""
    )
    with gr.Row():
        question = gr.Textbox(label="Your Question", placeholder="e.g., How do I reset the console?")
        model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="LLaMA 3.1 8B", label="Model")
    answer = gr.Textbox(label="Answer")
    submit = gr.Button("Ask")
    submit.click(fn=answer_query, inputs=[question, model_choice], outputs=answer)

demo.launch()