Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- .gitattributes +3 -0
- app.py +137 -0
- requirements.txt +11 -0
- 교육 전반 관련 기록물 목록1.xls +3 -0
- 교육 전반 관련 기록물 목록2.xls +3 -0
- 교육 전반 관련 기록물 목록3.xls +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
교육[[:space:]]전반[[:space:]]관련[[:space:]]기록물[[:space:]]목록1.xls filter=lfs diff=lfs merge=lfs -text
|
37 |
+
교육[[:space:]]전반[[:space:]]관련[[:space:]]기록물[[:space:]]목록2.xls filter=lfs diff=lfs merge=lfs -text
|
38 |
+
교육[[:space:]]전반[[:space:]]관련[[:space:]]기록물[[:space:]]목록3.xls filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import bs4
|
4 |
+
|
5 |
+
from langchain_community.document_loaders import WebBaseLoader
|
6 |
+
from langchain.text_splitter import CharacterTextSplitter
|
7 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
8 |
+
from langchain.vectorstores import FAISS
|
9 |
+
from langchain.chains import RetrievalQA
|
10 |
+
from langchain_groq import ChatGroq
|
11 |
+
from langchain_community.document_loaders import UnstructuredExcelLoader
|
12 |
+
|
13 |
+
# Read the Groq API key from the environment; an unset variable yields "".
groq_api_key = os.getenv("GROQ_API_KEY", "")

# National Archives of Korea (archives.go.kr) pages to index.
# The list is assembled from the shared path prefixes plus the per-page
# identifiers so that each identifier appears exactly once, in crawl order.
_ARCHIVES_ROOT = "https://archives.go.kr/next"
_SEARCH_ROOT = _ARCHIVES_ROOT + "/newsearch"

# `id` query values of the listSubjectDescription.do pages.
_SUBJECT_IDS = (
    "003140", "003288", "003290", "003292", "008757",
    "003293", "003294", "003295", "003289", "010816",
    "010817", "009154", "003260", "003278", "003281",
    "003283", "003284", "003280", "003282", "003287",
    "003286", "003285", "003279", "003141", "003143",
    "003144", "003142", "008653", "010827", "008582",
    "008663", "008581", "010828", "010830", "010831",
    "003145", "009425", "003146", "010821", "003151",
    "003149", "003148", "008655", "008654", "003150",
)

# `guideSeq` query values of the searchGuideDetail.do pages.
_GUIDE_SEQS = (441, 381, 341, 261, 227, 59, 30, 64, 321, 124, 267, 141, 149, 22)

urls = [
    f"{_SEARCH_ROOT}/listSubjectContent.do?subjectFieldId=000011",
    *(
        f"{_SEARCH_ROOT}/listSubjectDescription.do?id={subject_id}&pageFlag=A&sitePage=1-2-1"
        for subject_id in _SUBJECT_IDS
    ),
    f"{_ARCHIVES_ROOT}/newmanager/recodeRegister.do",
    f"{_ARCHIVES_ROOT}/newtour/tourCourse.do",
    f"{_ARCHIVES_ROOT}/newrecordsMngPro/recordsDonateInfo.do",
    f"{_ARCHIVES_ROOT}/newdata/pepoleRecodPresentIntro.do",
    f"{_SEARCH_ROOT}/searchGuideList.do",
    f"{_SEARCH_ROOT}/searchGuideList.do?page=2",
    *(
        f"{_SEARCH_ROOT}/searchGuideDetail.do?guideSeq={guide_seq}"
        for guide_seq in _GUIDE_SEQS
    ),
]
|
85 |
+
|
86 |
+
# Fetch every page listed in `urls`; the strainer is built with no filter
# arguments.
web_loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs={"parse_only": bs4.SoupStrainer()},
)
docs = web_loader.load()

# Excel record-list workbooks, referenced by relative path.
excel_files = [
    "교육 전반 관련 기록물 목록1.xls",
    "교육 전반 관련 기록물 목록2.xls",
    "교육 전반 관련 기록물 목록3.xls",
]

# Load each workbook in list order and collect the resulting documents.
excel_docs = []
for excel_path in excel_files:
    excel_docs += UnstructuredExcelLoader(excel_path).load()

# Corpus = web documents followed by Excel documents.
docs += excel_docs

# Split on newlines into chunks of up to 500 characters with 50 of overlap.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=50,
)
split_docs = splitter.split_documents(docs)

# Embed the chunks, index them in FAISS, and expose the index as a retriever.
embedding_model = HuggingFaceEmbeddings(
    model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS",
)
vectorstore = FAISS.from_documents(split_docs, embedding_model)
retriever = vectorstore.as_retriever()

# Groq-hosted chat model wired to the retriever through a "stuff" QA chain.
llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)
|
118 |
+
|
119 |
+
# Gradio 채팅 함수
|
120 |
+
def chat_with_history(user_input, history):
    """Answer one chat turn with the retrieval QA chain.

    Args:
        user_input: Text typed by the user. ``None`` is treated as empty.
        history: List of ``(question, answer)`` tuples from earlier turns,
            or ``None`` when there is no history yet.

    Returns:
        A 3-tuple ``("", history, history)``: an empty string (to clear the
        input textbox) and the updated history twice (once for the chatbot
        display, once for the session state).
    """
    # Work on a copy so the list object passed in by the caller is never
    # mutated in place.
    history = list(history) if history else []

    question = (user_input or "").strip()
    if not question:
        # Nothing was asked: skip the retrieval/LLM round trip instead of
        # sending only the language instruction to the model.
        return "", history, history

    # The appended sentence asks the model to answer in Korean.
    query = question + " 한국어로 답해주세요."
    # `invoke` is the supported entry point; calling the chain object
    # directly is deprecated in LangChain.
    result = qa_chain.invoke({"query": query})
    answer = result.get("result", "답변을 찾을 수 없습니다.")

    # Record the text exactly as the user typed it.
    history.append((user_input, answer))
    return "", history, history
|
128 |
+
|
129 |
+
# Gradio UI: a heading, the chat window, a question box, and per-session
# history state.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 국가기록원 정보 챗봇")
    chatbot = gr.Chatbot(label="기록원 챗봇")
    msg = gr.Textbox(
        placeholder="질문을 입력하세요",
        label="💬 질문 입력",
    )
    state = gr.State([])

    # Submitting the textbox runs one chat turn; the three return values
    # map to the textbox, the chat window, and the stored history.
    msg.submit(
        fn=chat_with_history,
        inputs=[msg, state],
        outputs=[msg, chatbot, state],
    )

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
langchain
|
3 |
+
langchain-community
|
4 |
+
langchain-groq
|
5 |
+
faiss-cpu
|
6 |
+
beautifulsoup4
|
7 |
+
transformers
|
8 |
+
sentence-transformers
|
9 |
+
pandas
|
10 |
+
openpyxl
|
11 |
+
unstructured
|
교육 전반 관련 기록물 목록1.xls
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1a55238219a4236b1b1a649ef024f7f71a324f4734eda8d392b92d8a3857721
|
3 |
+
size 192000
|
교육 전반 관련 기록물 목록2.xls
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b12fb2c236e03bc261db25633672198871d680f450fd1a4e1ba3979162a6780d
|
3 |
+
size 298496
|
교육 전반 관련 기록물 목록3.xls
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf6baa7cfc4d262f94a9ce6f10db7797020da8c4308a4a250f0f2a75b27f44d6
|
3 |
+
size 459264
|