tdecae committed on
Commit
53a7eb8
·
verified ·
1 Parent(s): c8ff351

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -0
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

import gradio as gr
import torch
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import Docx2txtLoader, PyPDFLoader, TextLoader
from langchain.llms.base import LLM
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
11
+
12
# sqlite workaround for HuggingFace Spaces: import the pysqlite3 package, then
# re-register that module object under the name "sqlite3" (removing the
# "pysqlite3" entry) so that any later `import sqlite3` resolves to it.
__import__("pysqlite3")
_replacement_sqlite = sys.modules.pop("pysqlite3")
sys.modules["sqlite3"] = _replacement_sqlite
15
+
16
# Load documents
# Folder scanned for source material. It also holds non-document files (the
# chatbot avatar images are read from it), which are skipped below.
DOCS_DIR = "multiple_docs"

# Lower-cased filename suffixes mapped to the loader class that reads them.
# NOTE(review): Docx2txtLoader is kept for ".doc" as in the original, but legacy
# binary Word files are probably not readable by it -- confirm or convert them.
LOADERS_BY_SUFFIX = (
    ((".pdf",), PyPDFLoader),
    ((".docx", ".doc"), Docx2txtLoader),
    ((".txt",), TextLoader),
)

docs = []
# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# order of loaded documents (and anything numbered from it) deterministic.
for f in sorted(os.listdir(DOCS_DIR)):
    lowered = f.lower()  # also accept ".PDF", ".Txt", ...
    loader_cls = next(
        (cls for suffixes, cls in LOADERS_BY_SUFFIX if lowered.endswith(suffixes)),
        None,
    )
    if loader_cls is None:
        continue  # unsupported file type, nothing to load
    docs.extend(loader_cls(os.path.join(DOCS_DIR, f)).load())
28
+
29
# Split docs
# Re-bind `docs` to the chunked version of the loaded documents; the splitter
# is configured with a chunk size of 1000 and an overlap of 10.
splitter = CharacterTextSplitter(
    chunk_overlap=10,
    chunk_size=1000,
)
docs = splitter.split_documents(docs)
32
+
33
# Embeddings
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


class _SentenceTransformerEmbeddings:
    """Adapter exposing a SentenceTransformer to a LangChain vector store.

    The store calls ``embed_documents`` when texts are added and
    ``embed_query`` when searching, so indexing and retrieval both go through
    the same model.
    """

    def __init__(self, model):
        self._model = model

    def embed_documents(self, texts):
        """Return one embedding (a list of floats) per input text."""
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text):
        """Return the embedding (a list of floats) for one query string."""
        return self._model.encode([text])[0].tolist()


texts = [doc.page_content for doc in docs]
metadatas = [{"id": i} for i, _ in enumerate(texts)]

# Vectorstore
# The embedding function has to be handed to the store itself: vectors passed
# to add_texts() through an `embeddings=` keyword are not used by it.
vectorstore = Chroma(
    persist_directory="./db",
    embedding_function=_SentenceTransformerEmbeddings(embedding_model),
)
# NOTE(review): texts are added on every start-up; if "./db" survives restarts
# the same chunks presumably accumulate as duplicates -- confirm and, if so,
# pass stable ids or clear the directory first.
if texts:  # nothing to index when no supported document was found
    vectorstore.add_texts(texts=texts, metadatas=metadatas)
vectorstore.persist()
43
+
44
+
45
# Language model and tokenizer used to generate the chatbot's answers.
# NOTE(review): verify that this repository id exists on the Hub.
model_name = "deepseek-ai/deepseek-llm-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
48
+
49
def generate(prompt, max_new_tokens=512):
    """Generate a completion for *prompt* with the module-level model.

    Args:
        prompt: Text to feed to the language model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only, with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # A causal LM returns the prompt tokens followed by the new tokens; drop
    # the prompt so callers do not get their own input echoed back.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
53
+
54
class HuggingFaceLLMWrapper(LLM):
    """LangChain LLM backed by the module-level ``generate`` function.

    Subclassing ``LLM`` (instead of being a bare callable) lets LangChain
    chains accept the object as a language model; calling the instance with a
    prompt string still returns the generated text.
    """

    @property
    def _llm_type(self) -> str:
        """Short identifier LangChain uses to label this LLM implementation."""
        return "huggingface_causal_lm_wrapper"

    def _call(self, prompt, stop=None, run_manager=None, **kwargs):
        """Return the model's text for *prompt*, cut at any *stop* sequence.

        Args:
            prompt: Fully formatted prompt string.
            stop: Optional sequences; output is truncated before the first
                occurrence of each one.
            run_manager: Callback manager supplied by LangChain (unused).
            **kwargs: Extra generation options (ignored).
        """
        text = generate(prompt)
        for sequence in stop or ():
            cut = text.find(sequence)
            if cut != -1:
                text = text[:cut]
        return text


llm = HuggingFaceLLMWrapper()
59
+
60
# QA chain
# Conversational retrieval chain over the vector store: the retriever is
# configured with k=6 and the chain is asked to return its source documents.
_retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
chain = ConversationalRetrievalChain.from_llm(
    llm,
    verbose=False,
    return_source_documents=True,
    retriever=_retriever,
)
67
+
68
# Kept as in the original; the handler below works on the history that Gradio
# passes in, not on this module-level list.
chat_history = []

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [("", "Hello, I'm Thierry Decae's chatbot. Ask me about my experience, skills, eligibility, etc.")],
        avatar_images=["./multiple_docs/Guest.jpg", "./multiple_docs/Thierry Picture.jpg"],
    )
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(query, chat_history):
        """Answer *query* and append the exchange to the chat history.

        Args:
            query: Text submitted through the textbox.
            chat_history: Current chatbot content as (user, bot) pairs; may be
                empty or None after the Clear button was used.

        Returns:
            A pair: an update that empties the textbox, and the new history.
        """
        chat_history = list(chat_history or [])
        if not query or not query.strip():
            # Nothing to ask: leave the conversation untouched.
            return gr.update(value=""), chat_history
        # Skip entries without a user message (the greeting shown at start) so
        # the chain only sees real question/answer turns as prior context.
        prior_turns = [(turn[0], turn[1]) for turn in chat_history if turn[0]]
        result = chain({"question": query, "chat_history": prior_turns})
        chat_history.append((query, result["answer"]))
        return gr.update(value=""), chat_history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True)