Update app.py
app.py CHANGED

@@ -18,6 +18,7 @@ import gradio as gr
 import spaces
 
 from pathlib import Path
+from PyPDF2 import PdfReader
 
 # 1. System prompt
 SYSTEM_PROMPT = """
@@ -44,22 +45,40 @@ model = AutoModelForCausalLM.from_pretrained(
 
 print(f"Model loaded on device: {model.device}")
 
-# 3. Load PDF files
-from PyPDF2 import PdfReader
-
-# Read all PDFs into a list of small chunks
+# 3. Load PDF files
 def load_pdfs(folder_path="."):
     docs = []
+    current_section = None
     for pdf_file in Path(folder_path).glob("*.pdf"):
         reader = PdfReader(str(pdf_file))
         for page in reader.pages:
             text = page.extract_text()
             if text:
-
-
-
+                lines = text.split("\n")
+                for line in lines:
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    # New smarter heading detection:
+                    # If the line is mostly UPPERCASE and not too long
+                    if line.isupper() and len(line.split()) <= 6:
+                        if current_section:
+                            docs.append(current_section)
+                        current_section = line
+                    else:
+                        if current_section:
+                            current_section += f" | {line}"
+                        else:
+                            current_section = line
+
+    if current_section:
+        docs.append(current_section)
+        current_section = None
+
     return docs
 
+
 document_chunks = load_pdfs(".")
 print(f"Loaded {len(document_chunks)} text chunks from PDFs.")
 
@@ -67,10 +86,12 @@ print(f"Loaded {len(document_chunks)} text chunks from PDFs.")
 embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Fast small model
 doc_embeddings = embedder.encode(document_chunks, normalize_embeddings=True)
 
-# 5. Retrieval function
+# 5. Retrieval function with float32 fix
 def retrieve_context(question, top_k=3):
     question_embedding = embedder.encode(question, normalize_embeddings=True)
-
+    question_embedding = torch.tensor(question_embedding, dtype=torch.float32)
+    doc_embeds = torch.tensor(doc_embeddings, dtype=torch.float32)
+    scores = doc_embeds @ question_embedding
     top_indices = torch.topk(scores, k=min(top_k, len(scores))).indices.tolist()
     return "\n\n".join([document_chunks[idx] for idx in top_indices])
 
@@ -118,7 +139,7 @@ def respond(
         response += new_text
         yield response
 
-# 7. Gradio
+# 7. Gradio ChatInterface
 demo = gr.ChatInterface(
     fn=respond,
     title="Café Eleven Assistant",
@@ -170,4 +191,4 @@ demo = gr.ChatInterface(
 
 # 8. Launch
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
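A note on the new chunking, since this commit replaces the old "small chunks" approach wholesale: load_pdfs now groups page text by headings rather than fixed-size pieces. A line that is entirely uppercase and at most six words long starts a new section (line.isupper() is stricter than the code comment suggests: every cased character must be uppercase, not merely most of them), and every other non-empty line is appended to the current section with a " | " separator. A minimal sketch of that grouping logic on a hypothetical page string, with no PDFs involved:

# Hypothetical page text; in app.py this comes from page.extract_text().
sample_page = """MENU
Espresso 3.00
Latte 4.50
OPENING HOURS
Mon-Fri 8am-6pm"""

docs = []
current_section = None
for line in sample_page.split("\n"):
    line = line.strip()
    if not line:
        continue
    if line.isupper() and len(line.split()) <= 6:   # uppercase heading opens a section
        if current_section:
            docs.append(current_section)
        current_section = line
    else:                                           # body line joins the current section
        current_section = f"{current_section} | {line}" if current_section else line
if current_section:
    docs.append(current_section)

print(docs)
# ['MENU | Espresso 3.00 | Latte 4.50', 'OPENING HOURS | Mon-Fri 8am-6pm']

One caveat: since current_section persists across the file loop and is only flushed at the end, the last section of one PDF can merge with the first lines of the next unless each file starts with a heading.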
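The retrieval change casts both sides of the similarity matmul to float32 torch tensors before scoring. sentence-transformers' encode() returns float32 NumPy arrays by default, so the explicit casts most plausibly guard against a dtype mismatch (for example, float64 arrays meeting float32 tensors, which torch rejects with a "Float vs Double" error). A self-contained sketch of the fixed scoring path, using random stand-in embeddings; the 384 dimension matches all-MiniLM-L6-v2, and the chunk strings are made up:

import numpy as np
import torch

document_chunks = [
    "MENU | Espresso 3.00 | Latte 4.50",
    "OPENING HOURS | Mon-Fri 8am-6pm",
    "LOCATION | 123 Main St",
]
rng = np.random.default_rng(0)
doc_embeddings = rng.random((3, 384))   # float64 here, a worst case for torch interop
question_embedding = rng.random(384)

# The fix: cast both operands to float32 so the matmul dtypes agree.
q = torch.tensor(question_embedding, dtype=torch.float32)
d = torch.tensor(doc_embeddings, dtype=torch.float32)
scores = d @ q                           # one dot-product score per chunk
top_indices = torch.topk(scores, k=min(2, len(scores))).indices.tolist()
print("\n\n".join(document_chunks[i] for i in top_indices))

Because app.py encodes with normalize_embeddings=True, the dot product there is equivalent to cosine similarity.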
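Finally, the streaming pattern respond() uses with gr.ChatInterface: the function accumulates new_text into response and yields the running string, and Gradio re-renders the same chat bubble on every yield. A minimal sketch with a fake token list standing in for the model's streamer; the real respond() in this file takes more parameters than the (message, history) pair shown here:

import time
import gradio as gr

def respond(message, history):
    response = ""
    for new_text in ["Welcome ", "to ", "Café ", "Eleven!"]:  # stand-in for the model stream
        time.sleep(0.1)
        response += new_text
        yield response   # each yield updates the chat bubble in place

demo = gr.ChatInterface(fn=respond, title="Café Eleven Assistant")

if __name__ == "__main__":
    demo.launch()

On the launch change itself: share=True asks Gradio to open a temporary public *.gradio.live tunnel, which matters when running locally; on a hosted Space the app is already served publicly, so the flag is typically redundant there.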