import os
import re
import io

import fitz  # PyMuPDF
import torch
import chromadb
import pytesseract
import gradio as gr
from PIL import Image
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# ---------------------------
# 🛠 Configuration
# ---------------------------
MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750        # target chunk size, in words
CHUNK_OVERLAP = 100     # overlap between consecutive chunks, in words
MAX_CONTEXT_CHUNKS = 3
HF_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
HF_TOKEN = os.environ.get("HF_TOKEN")
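# HF_TOKEN is read from the environment (e.g. a Hugging Face Space secret). If it is
# missing, the app still indexes the manuals but skips loading the gated Llama model.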
# ---------------------------
# 🧹 Helpers
# ---------------------------
def clean(text):
    """Strip leading/trailing whitespace and drop empty lines."""
    lines = text.splitlines()
    return "\n".join(line.strip() for line in lines if line.strip())

def split_sentences(text):
    """Naive sentence splitter on terminal punctuation."""
    return re.split(r'(?<=[.!?])\s+', text.strip())
def chunk_sentences(sentences, max_len=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Group sentences into chunks of roughly `max_len` words, overlapping by ~`overlap` words."""
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if length + tokens > max_len and chunk:
            chunks.append(" ".join(chunk))
            # Carry over trailing sentences totalling at most `overlap` words so that
            # neighbouring chunks share some context.
            carried, carried_len = [], 0
            for prev in reversed(chunk):
                prev_len = len(prev.split())
                if carried_len + prev_len > overlap:
                    break
                carried.insert(0, prev)
                carried_len += prev_len
            chunk, length = carried, carried_len
        chunk.append(sent)
        length += tokens
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
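# Illustrative usage: chunk_sentences(split_sentences(clean(raw_text))) yields chunks of
# roughly CHUNK_SIZE words, each sharing about CHUNK_OVERLAP trailing words with the
# previous chunk.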
def extract_text_from_pdf(path):
    """Extract text per page; fall back to OCR for pages with no embedded text."""
    doc = fitz.open(path)
    full_text = []
    for page in doc:
        text = page.get_text().strip()
        if not text:
            # Likely a scanned page: render it at 300 dpi and run OCR.
            try:
                pix = page.get_pixmap(dpi=300)
                img_data = pix.tobytes("png")
                img = Image.open(io.BytesIO(img_data))
                text = pytesseract.image_to_string(img).strip()
            except Exception:
                text = ""
        full_text.append(text)
    return "\n".join(full_text)
def extract_text_from_docx(path):
    doc = Document(path)
    return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
def extract_metadata(filename):
    """Infer machine model and document type from the filename (heuristic substring match)."""
    name = filename.lower()
    model = next((m for m in ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"] if m in name), "unknown")
    if "om" in name or "owner" in name:
        doc_type = "owner manual"
    elif "sm" in name or "service" in name:
        doc_type = "service manual"
    elif "assembly" in name:
        doc_type = "assembly instructions"
    elif "alert" in name:
        doc_type = "installer alert"
    elif "parts" in name:
        doc_type = "parts manual"
    elif "bulletin" in name:
        doc_type = "service bulletin"
    else:
        doc_type = "unknown"
    return model, doc_type
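# Illustrative examples (hypothetical filenames):
#   "SE3HD_OM_en.pdf"         -> ("se3hd", "owner manual")
#   "95T_Service_Manual.docx" -> ("95t", "service manual")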
# ---------------------------
# 📚 Build ChromaDB at Startup
# ---------------------------
def embed_all():
    """Re-index every manual in MANUALS_FOLDER into a fresh Chroma collection."""
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    if COLLECTION_NAME in [c.name for c in client.list_collections()]:
        client.delete_collection(COLLECTION_NAME)
    collection = client.create_collection(COLLECTION_NAME)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    records = []
    for fname in os.listdir(MANUALS_FOLDER):
        path = os.path.join(MANUALS_FOLDER, fname)
        if not fname.lower().endswith((".pdf", ".docx")):
            continue
        text = extract_text_from_pdf(path) if fname.lower().endswith(".pdf") else extract_text_from_docx(path)
        sents = split_sentences(clean(text))
        chunks = chunk_sentences(sents)
        model, doc_type = extract_metadata(fname)
        for i, chunk in enumerate(chunks):
            records.append({
                "id": f"{fname}::chunk_{i+1}",
                "text": chunk,
                "metadata": {"source_file": fname, "model": model, "doc_type": doc_type}
            })
    # Embed and insert in small batches to keep memory usage bounded.
    for i in range(0, len(records), 16):
        batch = records[i:i+16]
        texts = [r["text"] for r in batch]
        ids = [r["id"] for r in batch]
        metas = [r["metadata"] for r in batch]
        embeddings = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embeddings)
    return collection, embedder
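# Note: the collection is deleted and rebuilt on every startup. For large manual sets it
# may be preferable to reuse the persisted store in CHROMA_PATH and only re-index new files.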
# ---------------------------
# 💬 Load HF Model
# ---------------------------
llm_pipe = None
if HF_TOKEN:
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(HF_MODEL, token=HF_TOKEN, torch_dtype=torch.float32)
    llm_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
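# Note: device=-1 runs generation on CPU, and float32 weights for an 8B model need roughly
# 32 GB of RAM. On GPU hardware, loading with torch_dtype=torch.bfloat16 and device_map="auto"
# would be considerably lighter and faster.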
# ---------------------------
# 🔍 RAG Function
# ---------------------------
def run_query(question):
    if not question.strip():
        return "Please enter a question."
    if db is None or embedder is None:
        return "Chroma or embedder not ready."
    # Retrieve the most similar chunks for the question.
    q_embed = embedder.encode(question).tolist()
    res = db.query(query_embeddings=[q_embed], n_results=MAX_CONTEXT_CHUNKS)
    contexts = res["documents"][0]
    prompt = (
        "You are a technical assistant.\n"
        "Answer only using the context below.\n"
        "Say 'I don't know' if not found.\n\n"
    )
    context_text = "\n\n".join(contexts)
    final_prompt = prompt + f"Context:\n{context_text}\n\nQuestion: {question}\nAnswer:"
    if llm_pipe:
        result = llm_pipe(final_prompt, max_new_tokens=300)[0]["generated_text"]
        # The pipeline returns the prompt plus the completion; keep only the answer part.
        return result.split("Answer:")[-1].strip()
    return "Model not loaded."
# ---------------------------
# 🧠 Init embeddings once
# ---------------------------
db, embedder = embed_all()
# ---------------------------
# 🎛️ Gradio Interface
# ---------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 SmartManuals-AI: Ask Technical Questions about Your Manuals")
    question = gr.Textbox(placeholder="e.g. How do I reset the treadmill console?", label="Enter Question")
    submit = gr.Button("Get Answer")
    output = gr.Textbox(label="Answer")
    submit.click(fn=run_query, inputs=question, outputs=output)
demo.launch()