Spaces:
Running
Running
import os | |
import gradio as gr | |
import bs4 | |
from langchain_community.document_loaders import WebBaseLoader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores import FAISS | |
from langchain.chains import RetrievalQA | |
from langchain_groq import ChatGroq | |
# Load the Groq API key from an environment variable
# Groq credential read from the process environment; an unset GROQ_API_KEY
# yields an empty string rather than None.
groq_api_key = os.getenv("GROQ_API_KEY", "")
# List of National Archives of Korea web pages to index
# Seed pages on archives.go.kr, assembled from one base URL plus the
# per-page identifiers so the repeated query-string boilerplate is written once.
_ARCHIVES_ROOT = "https://archives.go.kr/next"

# ``id`` query values of the listSubjectDescription.do pages, in list order.
_SUBJECT_DESCRIPTION_IDS = (
    "003140", "003288", "003290", "003292", "008757",
    "003293", "003294", "003295", "003289", "010816",
    "010817", "009154", "003260", "003278", "003281",
    "003283", "003284", "003280", "003282", "003287",
    "003286", "003285", "003279", "003141", "003143",
    "003144", "003142", "008653", "010827", "008582",
    "008663", "008581", "010828", "010830", "010831",
    "003145", "009425", "003146", "010821", "003151",
    "003149", "003148", "008655", "008654", "003150",
)

# ``guideSeq`` query values of the searchGuideDetail.do pages, in list order.
_SEARCH_GUIDE_SEQS = (
    441, 381, 341, 261, 227, 59, 30,
    64, 321, 124, 267, 141, 149, 22,
)

urls = [
    f"{_ARCHIVES_ROOT}/newsearch/listSubjectContent.do?subjectFieldId=000011",
    *(
        f"{_ARCHIVES_ROOT}/newsearch/listSubjectDescription.do"
        f"?id={subject_id}&pageFlag=A&sitePage=1-2-1"
        for subject_id in _SUBJECT_DESCRIPTION_IDS
    ),
    f"{_ARCHIVES_ROOT}/newmanager/recodeRegister.do",
    f"{_ARCHIVES_ROOT}/newtour/tourCourse.do",
    f"{_ARCHIVES_ROOT}/newrecordsMngPro/recordsDonateInfo.do",
    f"{_ARCHIVES_ROOT}/newdata/pepoleRecodPresentIntro.do",
    f"{_ARCHIVES_ROOT}/newsearch/searchGuideList.do",
    f"{_ARCHIVES_ROOT}/newsearch/searchGuideList.do?page=2",
    *(
        f"{_ARCHIVES_ROOT}/newsearch/searchGuideDetail.do?guideSeq={guide_seq}"
        for guide_seq in _SEARCH_GUIDE_SEQS
    ),
]
# Load the web documents
# Fetch every page in ``urls``; the SoupStrainer is created without any
# filter arguments.
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs={"parse_only": bs4.SoupStrainer()},
)
docs = loader.load()

# Split the documents on newlines (chunk_size=500, chunk_overlap=50).
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=50,
)
split_docs = splitter.split_documents(docs)

# Embed the chunks, index them in FAISS, and expose the index as a retriever.
embedding_model = HuggingFaceEmbeddings(
    model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS",
)
vectorstore = FAISS.from_documents(split_docs, embedding_model)
retriever = vectorstore.as_retriever()

# Groq-hosted chat model wired to the retriever through a "stuff" QA chain.
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama3-70b-8192",
)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)
# Gradio chat handler
def chat_with_history(user_input, history):
    """Answer one chat message with the retrieval-QA chain and record the turn.

    Args:
        user_input: Raw text submitted by the user (may be empty or ``None``).
        history: List of ``(question, answer)`` tuples recorded so far, or
            ``None`` when nothing has been recorded yet.

    Returns:
        A 3-tuple ``("", history, history)``: an empty string followed by the
        (possibly extended) history twice. The submit handler in this file
        maps these onto the textbox, the chatbot and the session state.
    """
    if history is None:
        history = []

    question = (user_input or "").strip()
    if not question:
        # Nothing to ask: skip the retrieval/LLM round trip instead of sending
        # a prompt that consists of the language instruction alone.
        return "", history, history

    # Suffix asking the model to answer in Korean. The literal was stored
    # mis-encoded (UTF-8 bytes read through a Thai codepage); restored here.
    query = question + " 한국어로 답해주세요."
    # Chain.__call__ is deprecated; invoke() is the supported entry point.
    result = qa_chain.invoke({"query": query})
    # Fall back to a fixed "no answer found" notice when the chain returns no
    # answer or an empty one.
    answer = result.get("result") or "답변을 찾을 수 없습니다."
    history.append((user_input, answer))
    return "", history, history
# Gradio interface layout
with gr.Blocks() as demo:
    # NOTE(review): the non-ASCII text in the literals below appears to be
    # Korean (plus emoji) that was mis-decoded at some point; it is kept
    # verbatim here - restore it from the original source before shipping.
    gr.Markdown(value="## ๐ ๊ตญ๊ฐ๊ธฐ๋ก์ ์ ๋ณด ์ฑ๋ด")
    chatbot = gr.Chatbot(label="๊ธฐ๋ก์ ์ฑ๋ด")
    msg = gr.Textbox(
        placeholder="์ง๋ฌธ์ ์ ๋ ฅํ์ธ์",
        label="๐ฌ ์ง๋ฌธ ์ ๋ ฅ",
    )
    # Conversation history, initialised to an empty list.
    state = gr.State(value=[])

    # Submitting the textbox runs the chat handler; its three return values
    # go to the textbox, the chatbot and the state, in that order.
    msg.submit(
        fn=chat_with_history,
        inputs=[msg, state],
        outputs=[msg, chatbot, state],
    )

# NOTE(review): source indentation was lost; launch is assumed to sit outside
# the Blocks context, as is conventional - confirm against the original.
demo.launch()