"""Gradio chatbot that answers questions about archives.go.kr pages.

At import time the script downloads the pages listed in ``urls``, splits
them into chunks, embeds the chunks into a FAISS index, and wires the index
to a Groq-hosted chat model through a ``RetrievalQA`` chain.  The Gradio UI
defined at the bottom forwards each submitted question to that chain.
"""

import os

import bs4
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

# Groq API key read from the environment; falls back to an empty string
# when GROQ_API_KEY is not set.
groq_api_key = os.environ.get("GROQ_API_KEY", "")

# National Archives (archives.go.kr) web pages used as the knowledge base.
urls = [
    "https://archives.go.kr/next/newsearch/listSubjectContent.do?subjectFieldId=000011",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003140&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003288&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003290&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003292&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008757&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003293&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003294&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003295&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003289&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010816&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010817&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=009154&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003260&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003278&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003281&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003283&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003284&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003280&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003282&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003287&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003286&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003285&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003279&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003141&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003143&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003144&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003142&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008653&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010827&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008582&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008663&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008581&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010828&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010830&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010831&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003145&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=009425&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003146&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=010821&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003151&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003149&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003148&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008655&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=008654&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newsearch/listSubjectDescription.do?id=003150&pageFlag=A&sitePage=1-2-1",
    "https://archives.go.kr/next/newmanager/recodeRegister.do",
    "https://archives.go.kr/next/newtour/tourCourse.do",
    "https://archives.go.kr/next/newrecordsMngPro/recordsDonateInfo.do",
    "https://archives.go.kr/next/newdata/pepoleRecodPresentIntro.do",
    "https://archives.go.kr/next/newsearch/searchGuideList.do",
    "https://archives.go.kr/next/newsearch/searchGuideList.do?page=2",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=441",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=381",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=341",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=261",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=227",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=59",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=30",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=64",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=321",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=124",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=267",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=141",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=149",
    "https://archives.go.kr/next/newsearch/searchGuideDetail.do?guideSeq=22"
]

# Load the web documents.
# NOTE(review): SoupStrainer() is built with no criteria, so presumably the
# whole page is parsed -- confirm whether a narrower filter was intended.
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs=dict(parse_only=bs4.SoupStrainer()),
)
docs = loader.load()

# Split the documents on newlines into chunks of size 500 with overlap 50.
splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)
split_docs = splitter.split_documents(docs)

# Embed the chunks, store them in FAISS, and expose the index as a retriever.
embedding_model = HuggingFaceEmbeddings(
    model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS"
)
vectorstore = FAISS.from_documents(split_docs, embedding_model)
retriever = vectorstore.as_retriever()

# LLM + retrieval QA chain.
# NOTE(review): verify that this model id is still served by Groq.
llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)


def chat_with_history(user_input, history):
    """Answer one chat message and append the exchange to the history.

    Args:
        user_input: Text submitted from the question textbox.
        history: List of ``(question, answer)`` tuples kept in ``gr.State``;
            ``None`` is treated as an empty history.

    Returns:
        A 3-tuple ``("", history, history)``: the empty string clears the
        textbox, and the updated history feeds both the chatbot component
        and the session state.
    """
    # Work on a copy so the list object handed in by Gradio is not mutated.
    updated_history = list(history) if history else []

    question = (user_input or "").strip()
    if not question:
        # Blank submissions would otherwise send only the language suffix
        # to the retriever and the LLM; skip the remote call entirely.
        return "", updated_history, updated_history

    # The suffix asks the model to answer in Korean.
    query = question + " 한국어로 답해주세요."
    # Chain.__call__ is deprecated in LangChain; invoke() is its replacement.
    result = qa_chain.invoke({"query": query})
    answer = result.get("result", "답변을 찾을 수 없습니다.")

    updated_history.append((user_input, answer))
    return "", updated_history, updated_history


# Gradio interface: a chatbot pane, a question textbox, and per-session state.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 국가기록원 정보 챗봇")
    chatbot = gr.Chatbot(label="기록원 챗봇")
    msg = gr.Textbox(placeholder="질문을 입력하세요", label="💬 질문 입력")
    state = gr.State([])
    msg.submit(
        chat_with_history,
        inputs=[msg, state],
        outputs=[msg, chatbot, state],
    )

demo.launch()