Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,137 +1,274 @@
|
|
1 |
-
import os
|
2 |
-
import gradio as gr
|
3 |
-
import bs4
|
4 |
-
|
5 |
-
from langchain_community.document_loaders import WebBaseLoader
|
6 |
-
from langchain.text_splitter import CharacterTextSplitter
|
7 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings
|
8 |
-
from langchain.vectorstores import FAISS
|
9 |
-
from langchain.chains import RetrievalQA
|
10 |
-
from langchain_groq import ChatGroq
|
11 |
-
from langchain_community.document_loaders import UnstructuredExcelLoader
|
12 |
-
|
13 |
-
# ํ๊ฒฝ ๋ณ์๋ก๋ถํฐ Groq API Key ๋ถ๋ฌ์ค๊ธฐ
|
14 |
-
groq_api_key = os.environ.get("GROQ_API_KEY", "")
|
15 |
-
|
16 |
-
# ๊ตญ๊ฐ๊ธฐ๋ก์ ์น ๋ฌธ์ ๋ชฉ๋ก
|
17 |
-
urls = [
|
18 |
-
"https://archives.go.kr/next/newsearch/listSubjectContent.do?subjectFieldId=000011",
|
19 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003140&pageFlag=A&sitePage=1-2-1",
|
20 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003288&pageFlag=A&sitePage=1-2-1",
|
21 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003290&pageFlag=A&sitePage=1-2-1",
|
22 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003292&pageFlag=A&sitePage=1-2-1",
|
23 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008757&pageFlag=A&sitePage=1-2-1",
|
24 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003293&pageFlag=A&sitePage=1-2-1",
|
25 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003294&pageFlag=A&sitePage=1-2-1",
|
26 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003295&pageFlag=A&sitePage=1-2-1",
|
27 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003289&pageFlag=A&sitePage=1-2-1",
|
28 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010816&pageFlag=A&sitePage=1-2-1",
|
29 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010817&pageFlag=A&sitePage=1-2-1",
|
30 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=009154&pageFlag=A&sitePage=1-2-1",
|
31 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003260&pageFlag=A&sitePage=1-2-1",
|
32 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003278&pageFlag=A&sitePage=1-2-1",
|
33 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003281&pageFlag=A&sitePage=1-2-1",
|
34 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003283&pageFlag=A&sitePage=1-2-1",
|
35 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003284&pageFlag=A&sitePage=1-2-1",
|
36 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003280&pageFlag=A&sitePage=1-2-1",
|
37 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003282&pageFlag=A&sitePage=1-2-1",
|
38 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003287&pageFlag=A&sitePage=1-2-1",
|
39 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003286&pageFlag=A&sitePage=1-2-1",
|
40 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003285&pageFlag=A&sitePage=1-2-1",
|
41 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003279&pageFlag=A&sitePage=1-2-1",
|
42 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003141&pageFlag=A&sitePage=1-2-1",
|
43 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003143&pageFlag=A&sitePage=1-2-1",
|
44 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003144&pageFlag=A&sitePage=1-2-1",
|
45 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003142&pageFlag=A&sitePage=1-2-1",
|
46 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008653&pageFlag=A&sitePage=1-2-1",
|
47 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010827&pageFlag=A&sitePage=1-2-1",
|
48 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008582&pageFlag=A&sitePage=1-2-1",
|
49 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008663&pageFlag=A&sitePage=1-2-1",
|
50 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008581&pageFlag=A&sitePage=1-2-1",
|
51 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010828&pageFlag=A&sitePage=1-2-1",
|
52 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010830&pageFlag=A&sitePage=1-2-1",
|
53 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010831&pageFlag=A&sitePage=1-2-1",
|
54 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003145&pageFlag=A&sitePage=1-2-1",
|
55 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=009425&pageFlag=A&sitePage=1-2-1",
|
56 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003146&pageFlag=A&sitePage=1-2-1",
|
57 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010821&pageFlag=A&sitePage=1-2-1",
|
58 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003151&pageFlag=A&sitePage=1-2-1",
|
59 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003149&pageFlag=A&sitePage=1-2-1",
|
60 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003148&pageFlag=A&sitePage=1-2-1",
|
61 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008655&pageFlag=A&sitePage=1-2-1",
|
62 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008654&pageFlag=A&sitePage=1-2-1",
|
63 |
-
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003150&pageFlag=A&sitePage=1-2-1",
|
64 |
-
"https://archives.go.kr/next/newmanager/recodeRegister.do",
|
65 |
-
"https://archives.go.kr/next/newtour/tourCourse.do",
|
66 |
-
"https://archives.go.kr/next/newrecordsMngPro/recordsDonateInfo.do",
|
67 |
-
"https://archives.go.kr/next/newdata/pepoleRecodPresentIntro.do",
|
68 |
-
"https://archives.go.kr/next/newsearch/searchGuideList.do",
|
69 |
-
"https://archives.go.kr/next/newsearch/searchGuideList.do?page=2",
|
70 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=441",
|
71 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=381",
|
72 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=341",
|
73 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=261",
|
74 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=227",
|
75 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=59",
|
76 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=30",
|
77 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=64",
|
78 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=321",
|
79 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=124",
|
80 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=267",
|
81 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=141",
|
82 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=149",
|
83 |
-
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=22"
|
84 |
-
]
|
85 |
-
|
86 |
-
# ์น๋ฌธ์ ๋ก๋ฉ
|
87 |
-
loader = WebBaseLoader(web_paths=urls, bs_kwargs=dict(parse_only=bs4.SoupStrainer()))
|
88 |
-
docs = loader.load()
|
89 |
-
|
90 |
-
# ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก ์์
ํ์ผ
|
91 |
-
excel_files = [
|
92 |
-
"๊ต์ก ์ ๋ฐ ๊ด๋ จ ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก1.xls",
|
93 |
-
"๊ต์ก ์ ๋ฐ ๊ด๋ จ ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก2.xls",
|
94 |
-
"๊ต์ก ์ ๋ฐ ๊ด๋ จ ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก3.xls"
|
95 |
-
]
|
96 |
-
|
97 |
-
# ์์
๋ฌธ์ ๋ก๋ฉ
|
98 |
-
excel_docs = []
|
99 |
-
for file in excel_files:
|
100 |
-
loader = UnstructuredExcelLoader(file)
|
101 |
-
excel_docs.extend(loader.load())
|
102 |
-
|
103 |
-
# ์น๋ฌธ์ + ์์
๋ฌธ์ ๊ฒฐํฉ
|
104 |
-
docs.extend(excel_docs)
|
105 |
-
|
106 |
-
# ๋ฌธ์ ๋ถํ
|
107 |
-
splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)
|
108 |
-
split_docs = splitter.split_documents(docs)
|
109 |
-
|
110 |
-
# ์๋ฒ ๋ฉ ๋ฐ ๋ฒกํฐ ์ ์ฅ ๋ฐ ๋ฆฌํธ๋ฆฌ๋ฒ ์ค์
|
111 |
-
embedding_model = HuggingFaceEmbeddings(model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS")
|
112 |
-
vectorstore = FAISS.from_documents(split_docs, embedding_model)
|
113 |
-
retriever = vectorstore.as_retriever()
|
114 |
-
|
115 |
-
# LLM + QA ์ฒด์ธ
|
116 |
-
llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192")
|
117 |
-
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")
|
118 |
-
|
119 |
-
#
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
demo.launch()
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import bs4
|
4 |
+
|
5 |
+
from langchain_community.document_loaders import WebBaseLoader
|
6 |
+
from langchain.text_splitter import CharacterTextSplitter
|
7 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
8 |
+
from langchain.vectorstores import FAISS
|
9 |
+
from langchain.chains import RetrievalQA
|
10 |
+
from langchain_groq import ChatGroq
|
11 |
+
from langchain_community.document_loaders import UnstructuredExcelLoader
|
12 |
+
|
13 |
+
# ํ๊ฒฝ ๋ณ์๋ก๋ถํฐ Groq API Key ๋ถ๋ฌ์ค๊ธฐ
|
14 |
+
groq_api_key = os.environ.get("GROQ_API_KEY", "")
|
15 |
+
|
16 |
+
# ๊ตญ๊ฐ๊ธฐ๋ก์ ์น ๋ฌธ์ ๋ชฉ๋ก
|
17 |
+
urls = [
|
18 |
+
"https://archives.go.kr/next/newsearch/listSubjectContent.do?subjectFieldId=000011",
|
19 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003140&pageFlag=A&sitePage=1-2-1",
|
20 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003288&pageFlag=A&sitePage=1-2-1",
|
21 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003290&pageFlag=A&sitePage=1-2-1",
|
22 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003292&pageFlag=A&sitePage=1-2-1",
|
23 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008757&pageFlag=A&sitePage=1-2-1",
|
24 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003293&pageFlag=A&sitePage=1-2-1",
|
25 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003294&pageFlag=A&sitePage=1-2-1",
|
26 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003295&pageFlag=A&sitePage=1-2-1",
|
27 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003289&pageFlag=A&sitePage=1-2-1",
|
28 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010816&pageFlag=A&sitePage=1-2-1",
|
29 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010817&pageFlag=A&sitePage=1-2-1",
|
30 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=009154&pageFlag=A&sitePage=1-2-1",
|
31 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003260&pageFlag=A&sitePage=1-2-1",
|
32 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003278&pageFlag=A&sitePage=1-2-1",
|
33 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003281&pageFlag=A&sitePage=1-2-1",
|
34 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003283&pageFlag=A&sitePage=1-2-1",
|
35 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003284&pageFlag=A&sitePage=1-2-1",
|
36 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003280&pageFlag=A&sitePage=1-2-1",
|
37 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003282&pageFlag=A&sitePage=1-2-1",
|
38 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003287&pageFlag=A&sitePage=1-2-1",
|
39 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003286&pageFlag=A&sitePage=1-2-1",
|
40 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003285&pageFlag=A&sitePage=1-2-1",
|
41 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003279&pageFlag=A&sitePage=1-2-1",
|
42 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003141&pageFlag=A&sitePage=1-2-1",
|
43 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003143&pageFlag=A&sitePage=1-2-1",
|
44 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003144&pageFlag=A&sitePage=1-2-1",
|
45 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003142&pageFlag=A&sitePage=1-2-1",
|
46 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008653&pageFlag=A&sitePage=1-2-1",
|
47 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010827&pageFlag=A&sitePage=1-2-1",
|
48 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008582&pageFlag=A&sitePage=1-2-1",
|
49 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008663&pageFlag=A&sitePage=1-2-1",
|
50 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008581&pageFlag=A&sitePage=1-2-1",
|
51 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010828&pageFlag=A&sitePage=1-2-1",
|
52 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010830&pageFlag=A&sitePage=1-2-1",
|
53 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010831&pageFlag=A&sitePage=1-2-1",
|
54 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003145&pageFlag=A&sitePage=1-2-1",
|
55 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=009425&pageFlag=A&sitePage=1-2-1",
|
56 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003146&pageFlag=A&sitePage=1-2-1",
|
57 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010821&pageFlag=A&sitePage=1-2-1",
|
58 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003151&pageFlag=A&sitePage=1-2-1",
|
59 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003149&pageFlag=A&sitePage=1-2-1",
|
60 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003148&pageFlag=A&sitePage=1-2-1",
|
61 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008655&pageFlag=A&sitePage=1-2-1",
|
62 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008654&pageFlag=A&sitePage=1-2-1",
|
63 |
+
"https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003150&pageFlag=A&sitePage=1-2-1",
|
64 |
+
"https://archives.go.kr/next/newmanager/recodeRegister.do",
|
65 |
+
"https://archives.go.kr/next/newtour/tourCourse.do",
|
66 |
+
"https://archives.go.kr/next/newrecordsMngPro/recordsDonateInfo.do",
|
67 |
+
"https://archives.go.kr/next/newdata/pepoleRecodPresentIntro.do",
|
68 |
+
"https://archives.go.kr/next/newsearch/searchGuideList.do",
|
69 |
+
"https://archives.go.kr/next/newsearch/searchGuideList.do?page=2",
|
70 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=441",
|
71 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=381",
|
72 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=341",
|
73 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=261",
|
74 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=227",
|
75 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=59",
|
76 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=30",
|
77 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=64",
|
78 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=321",
|
79 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=124",
|
80 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=267",
|
81 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=141",
|
82 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=149",
|
83 |
+
"https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=22"
|
84 |
+
]
|
85 |
+
|
86 |
+
# ์น๋ฌธ์ ๋ก๋ฉ
|
87 |
+
loader = WebBaseLoader(web_paths=urls, bs_kwargs=dict(parse_only=bs4.SoupStrainer()))
|
88 |
+
docs = loader.load()
|
89 |
+
|
90 |
+
# ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก ์์
ํ์ผ
|
91 |
+
excel_files = [
|
92 |
+
"๊ต์ก ์ ๋ฐ ๊ด๋ จ ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก1.xls",
|
93 |
+
"๊ต์ก ์ ๋ฐ ๊ด๋ จ ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก2.xls",
|
94 |
+
"๊ต์ก ์ ๋ฐ ๊ด๋ จ ๊ธฐ๋ก๋ฌผ ๋ชฉ๋ก3.xls"
|
95 |
+
]
|
96 |
+
|
97 |
+
# ์์
๋ฌธ์ ๋ก๋ฉ
|
98 |
+
excel_docs = []
|
99 |
+
for file in excel_files:
|
100 |
+
loader = UnstructuredExcelLoader(file)
|
101 |
+
excel_docs.extend(loader.load())
|
102 |
+
|
103 |
+
# ์น๋ฌธ์ + ์์
๋ฌธ์ ๊ฒฐํฉ
|
104 |
+
docs.extend(excel_docs)
|
105 |
+
|
106 |
+
# ๋ฌธ์ ๋ถํ
|
107 |
+
splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)
|
108 |
+
split_docs = splitter.split_documents(docs)
|
109 |
+
|
110 |
+
# ์๋ฒ ๋ฉ ๋ฐ ๋ฒกํฐ ์ ์ฅ ๋ฐ ๋ฆฌํธ๋ฆฌ๋ฒ ์ค์
|
111 |
+
embedding_model = HuggingFaceEmbeddings(model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS")
|
112 |
+
vectorstore = FAISS.from_documents(split_docs, embedding_model)
|
113 |
+
retriever = vectorstore.as_retriever()
|
114 |
+
|
115 |
+
# LLM + QA ์ฒด์ธ
|
116 |
+
llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192")
|
117 |
+
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")
|
118 |
+
|
119 |
+
# ์์ ์ง๋ฌธ
|
120 |
+
example_questions = [
|
121 |
+
"๊ธฐ๋ก๋ฌผ ์ด๋ ๋ฐฉ๋ฒ์ ์ด๋ป๊ฒ ๋๋์?",
|
122 |
+
"๊ฒฌํ์ ์ฒญ์ ๋ํด ์ ์ ์๋์?",
|
123 |
+
"๊ธฐ๋ก๋ฌผ ๊ธฐ์ฆ ๋ฐฉ๋ฒ์ ๋ฌด์์ธ๊ฐ์?",
|
124 |
+
"๊ธฐ๋ก๋ฌผ ๊ฒ์ ๊ธธ์ก์ด๊ฐ ๋ฌด์์ธ๊ฐ์?",
|
125 |
+
"ใ
ใ
ใ
์ ์ฃผ์ ์ ํ์ ๋ฌด์์ธ๊ฐ์?",
|
126 |
+
"ใ
ใ
ใ
์ ํ์์ฃผ์ ๋ ๋ฌด์์ธ๊ฐ์?",
|
127 |
+
"ใ
ใ
ใ
์ ๋ฐฐ๊ฒฝ์ ๋ฌด์์ธ๊ฐ์?",
|
128 |
+
"ใ
ใ
ใ
์ ์ญ์ฌ์ ์์๋ ๋ฌด์์ธ๊ฐ์?",
|
129 |
+
"ใ
ใ
ใ
์ ์งํ์๋ ๋๊ตฌ์ธ๊ฐ์?",
|
130 |
+
"ใ
ใ
ใ
์ ์ฌ์ง ๊ธฐ๋ก๋ฌผ์ด ์๋์?",
|
131 |
+
"ใ
ใ
ใ
์ ๊ธฐ๋ก๋ฌผ ์ค์ ์ ๋ถ๊ฐํ๋ฌผ์ ๋ช๊ฐ์ธ๊ฐ์?",
|
132 |
+
"ใ
ใ
ใ
์ ๊ธฐ๋ก๋ฌผ์ ๋ช๊ฑด์ธ๊ฐ์?",
|
133 |
+
"ใ
ใ
ใ
์ ๊ธฐ๋ก๋ฌผ๋ค์ ์ ๋ชฉ์ ์ ์ ์๋์?"
|
134 |
+
]
|
135 |
+
|
136 |
+
# ํค์๋ ๊ณ์ธต ๊ตฌ์กฐ
|
137 |
+
keyword_tree = {
|
138 |
+
"๊ต์ก ์ ๋ฐ": {
|
139 |
+
"๊ต์ก ๋ฏผ์ฃผํ์ด๋": {
|
140 |
+
"๊ต์์ด๋": {
|
141 |
+
"๊ต์ก ๋ฏผ์ฃผํ์ ์ธ": {},
|
142 |
+
"๋ฏผ์ค๊ต์ก์ง ์ฌ๊ฑด": {},
|
143 |
+
"์ฐธ๊ต์ก ์ด๋": {}
|
144 |
+
},
|
145 |
+
"ํ๋ถ๋ชจ ์ด๋": {
|
146 |
+
"๊ต์ก๊ฐํ์๋ฏผ์ด๋์ฐ๋": {}
|
147 |
+
},
|
148 |
+
"ํ์์ด๋": {}
|
149 |
+
},
|
150 |
+
"๊ต์ก ์ ๋ณดํ ์ ์ฑ
": {
|
151 |
+
"AI ๋์งํธ ๊ต๊ณผ์": {},
|
152 |
+
"e๋ฌ๋ํ์ฑํ": {},
|
153 |
+
"๊ต์กํ์ ์ ๋ณด์์คํ
(NEIS)": {}
|
154 |
+
},
|
155 |
+
"๊ต์ก๊ฐํ": {
|
156 |
+
"๊ต์ก๊ฐํ์ฌ์ํ": {},
|
157 |
+
"๊ต์ก๊ฐํ์์ํ": {
|
158 |
+
"5ยท31 ๊ต์ก๊ฐํ": {}
|
159 |
+
},
|
160 |
+
"๊ต์ก์ ์ฑ
์ฌ์ํ": {},
|
161 |
+
"๊ต์ก์ ์ฑ
์๋ฌธํ์": {},
|
162 |
+
"๊ต์กํ์ ์์ํ": {},
|
163 |
+
"์๊ต์ก๊ณต๋์ฒด์์ํ": {},
|
164 |
+
"์ธ๋ ฅ์์๊ฐ๋ฐํ์": {},
|
165 |
+
"์ฅ๊ธฐ์ข
ํฉ๊ณํ์ฌ์ํ": {}
|
166 |
+
},
|
167 |
+
"๊ต์ก์ด๋
": {
|
168 |
+
"๊ตญ๋ฏผ๊ต์กํ์ฅ": {
|
169 |
+
"ํ๋ํธ๊ตญ๋จ": {}
|
170 |
+
},
|
171 |
+
"ํ์ต์ธ๊ฐ ๊ต์ก์ด๋
": {
|
172 |
+
"์ผ๋ฏผ์ฃผ์": {}
|
173 |
+
}
|
174 |
+
},
|
175 |
+
"๊ต์ก์ ์ฑ
๊ด๋ จ ๊ธฐ๊ด": {
|
176 |
+
"ํ๊ตญ๊ต์ก๊ฐ๋ฐ์": {},
|
177 |
+
"ํ๊ตญ๊ต์ก๊ณผ์ ํ๊ฐ์": {},
|
178 |
+
"ํ๊ตญ๊ต์ก๋ฐฉ์ก๊ณต์ฌ": {}
|
179 |
+
},
|
180 |
+
"ํ์ ์งํฅ ์ ์ฑ
": {
|
181 |
+
"KERIS": {},
|
182 |
+
"๋ํ๋ฏผ๊ตญํ์ ์": {}
|
183 |
+
},
|
184 |
+
"ํ์ ": {
|
185 |
+
"ํ๋ น์ธ๊ตฌ ๊ฐ์": {},
|
186 |
+
"ํ์ ํ์ ": {}
|
187 |
+
},
|
188 |
+
"ํ๋ฒ์ ๊ต์ก์กฐํญ๊ณผ ๋ณ์ฒ": {
|
189 |
+
"๊ณ ๋ฑ๊ต์ก๋ฒ": {},
|
190 |
+
"๊ต์ก๊ธฐ๋ณธ๋ฒ": {},
|
191 |
+
"๊ต์ก๋ฒ ์ ์ ": {},
|
192 |
+
"๊ต์ก์ ๊ดํ ์์ํน๋ก๋ฒ": {},
|
193 |
+
"์ฌ๋ฆฝํ๊ต๋ฒ": {},
|
194 |
+
"์ดยท์ค๋ฑ๊ต์ก๋ฒ": {}
|
195 |
+
}
|
196 |
+
}
|
197 |
+
}
|
198 |
+
|
199 |
+
# ๊ฒฝ๋ก์์ ํ์ ํค์๋ ๋ฐํ
|
200 |
+
def get_keywords(path):
|
201 |
+
node = keyword_tree
|
202 |
+
for key in path:
|
203 |
+
node = node.get(key, {})
|
204 |
+
return list(node.keys())
|
205 |
+
|
206 |
+
def format_path(path):
|
207 |
+
return " > ".join(path) if path else "๊ต์ก ์ ๋ฐ"
|
208 |
+
|
209 |
+
def on_keyword_select(selected, path):
|
210 |
+
new_path = path + [selected]
|
211 |
+
next_keywords = get_keywords(new_path)
|
212 |
+
formatted = format_path(new_path)
|
213 |
+
return formatted, new_path, gr.update(choices=next_keywords)
|
214 |
+
|
215 |
+
# Gradio ์ฑํ
ํจ์
|
216 |
+
def chat_with_history(user_input, history):
|
217 |
+
if history is None:
|
218 |
+
history = []
|
219 |
+
query = user_input.strip() + " ํ๊ตญ์ด๋ก ๋ตํด์ฃผ์ธ์."
|
220 |
+
result = qa_chain({"query": query})
|
221 |
+
answer = result.get("result", "๋ต๋ณ์ ์ฐพ์ ์ ์์ต๋๋ค.")
|
222 |
+
history.append((user_input, answer))
|
223 |
+
return "", history, history
|
224 |
+
|
225 |
+
# Gradio ์ธํฐํ์ด์ค ๊ตฌ์ฑ
|
226 |
+
with gr.Blocks() as demo:
|
227 |
+
gr.Markdown("## ๐ ๊ตญ๊ฐ๊ธฐ๋ก์ ์ฑ๋ด")
|
228 |
+
gr.Markdown(
|
229 |
+
"""### **๊ตญ๊ฐ๊ธฐ๋ก์ ์ ๋ณด ์ฑ๋ด์ ์ค์ ๊ฒ์ ํ์ํฉ๋๋ค!**
|
230 |
+
|
231 |
+
์ด ์ฑ๋ด์ ๊ตญ๊ฐ๊ธฐ๋ก์์ ๋ณด๊ด๋ ๋ค์ํ ๊ธฐ๋ก๋ฌผ์ ๋ฐํ์ผ๋ก ์ฌ๋ฌ๋ถ์ ๊ถ๊ธ์ฆ์ ์ฝ๊ณ ๋น ๋ฅด๊ฒ ํด๊ฒฐํด ๋๋ฆฝ๋๋ค.
|
232 |
+
|
233 |
+
๊ตญ๊ฐ๊ธฐ๋ก์์ ์ญํ , ๊ธฐ๋ก๋ฌผ ์ด๋ ๋ฐฉ๋ฒ, ๊ฒฌํ ์ ์ฒญ, ๊ธฐ์ฆ ์ ์ฐจ ๋ฑ์ ๊ณต์ ์ ๋ณด๋ฅผ ํ์ธํ ์ ์์ผ๋ฉฐ, ๊ต์ก ์ ๋ฐ ๋ถ์ผ์ ๊ดํ ์ ๋ณด๋ฅผ ์๋ดํด ๋๋ฆฝ๋๋ค.
|
234 |
+
|
235 |
+
์๋ ์
๋ ฅ์ฐฝ์ ๊ถ๊ธํ ๋ด์ฉ์ ์์ ๋กญ๊ฒ ์
๋ ฅํด ๋ณด์ธ์.
|
236 |
+
|
237 |
+
๐ก ์ง๋ฌธ์ ์ด๋ป๊ฒ ์์ํ ์ง ๊ณ ๋ฏผ ์ค์ด์ ๊ฐ์?
|
238 |
+
- **์์ ์ง๋ฌธ ๋ณด๊ธฐ**์์ ์ง๋ฌธ์ ๊ณจ๋ผ๋ณด์ธ์. (์์ ์ง๋ฌธ์ ใ
ใ
ใ
์ ๊ด์ฌ ์๋ ์ฃผ์ ๋ฅผ ์
๋ ฅํด๋ณด์ธ์.)
|
239 |
+
- ๋๋ ์๋์ **๊ฒ์ ํค์๋ ํ์** ๊ธฐ๋ฅ์ ํ์ฉํด ์ฃผ์ ๋ณ ํค์๋๋ฅผ ์ฐพ์๋ณด์ธ์.
|
240 |
+
"""
|
241 |
+
)
|
242 |
+
chatbot = gr.Chatbot(label="๊ธฐ๋ก์ ์ฑ๋ด", type="messages")
|
243 |
+
with gr.Row():
|
244 |
+
dropdown = gr.Dropdown(choices=example_questions, label="๐ ์์ ์ง๋ฌธ ๋ณด๊ธฐ")
|
245 |
+
msg = gr.Textbox(placeholder="์ง๋ฌธ์ ์
๋ ฅํ์ธ์", label="๐ฌ ์ง๋ฌธ ์
๋ ฅ", lines=1)
|
246 |
+
|
247 |
+
state = gr.State([]) # ์ฑํ
๊ธฐ๋ก
|
248 |
+
path_state = gr.State([]) # ํค์๋ ๊ฒฝ๋ก ์ํ
|
249 |
+
|
250 |
+
dropdown.change(lambda q: q, inputs=dropdown, outputs=msg)
|
251 |
+
msg.submit(chat_with_history, inputs=[msg, state], outputs=[msg, chatbot, state])
|
252 |
+
|
253 |
+
with gr.Column():
|
254 |
+
gr.Markdown("### ๐ ๊ฒ์ ํค์๋ ํ์")
|
255 |
+
gr.Markdown(
|
256 |
+
""" ๊ต์ก ์ ๋ฐ์ ํค์๋๋ฅผ ๋ณด์ค ์ ์์ต๋๋ค.
|
257 |
+
|
258 |
+
์ฃผ์ ๋ณ๋ก ์ ๋ฆฌ๋ **๊ฒ์ ํค์๋ ๊ณ์ธต ๊ตฌ์กฐ**๋ฅผ ๋ฐ๋ผ๊ฐ๋ฉฐ ๊ด์ฌ ์๋ ๋ถ์ผ๋ฅผ ๋จ๊ณ์ ์ผ๋ก ํ์ํ ์ ์๋ ๊ธฐ๋ฅ์
๋๋ค.
|
259 |
+
|
260 |
+
- ๋จผ์ ์์ ์ฃผ์ ๋ถํฐ ์์ํด ๋ณด์ธ์.
|
261 |
+
- ์ํ๋ ์ฃผ์ ๋ฅผ ํด๋ฆญํ๋ฉด ๊ทธ ์๋์ ์ธ๋ถ ํญ๋ชฉ์ด ํผ์ณ์ง๋๋ค.
|
262 |
+
- ํด๋ฆญ์ ๊ณ์ํด ๋ค์ด๊ฐ๋ฉด, ์ ์ ๋ ๊ตฌ์ฒด์ ์ธ ํค์๋์ ๋๋ฌํ ์ ์์ด์.
|
263 |
+
"""
|
264 |
+
)
|
265 |
+
|
266 |
+
keyword_selector = gr.Radio(choices=get_keywords([]), label="ํค์๋ ์ ํ", value=None)
|
267 |
+
|
268 |
+
keyword_selector.change(
|
269 |
+
fn=on_keyword_select,
|
270 |
+
inputs=[keyword_selector, path_state],
|
271 |
+
outputs=[keyword_path_display, path_state, keyword_selector]
|
272 |
+
)
|
273 |
+
|
274 |
demo.launch()
|