suhyun1 committed on
Commit
07982cc
·
verified ·
1 Parent(s): 9b29d4b

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ 교육[[:space:]]전반[[:space:]]관련[[:space:]]기록물[[:space:]]목록1.xls filter=lfs diff=lfs merge=lfs -text
37
+ 교육[[:space:]]전반[[:space:]]관련[[:space:]]기록물[[:space:]]목록2.xls filter=lfs diff=lfs merge=lfs -text
38
+ 교육[[:space:]]전반[[:space:]]관련[[:space:]]기록물[[:space:]]목록3.xls filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import bs4
4
+
5
+ from langchain_community.document_loaders import WebBaseLoader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.chains import RetrievalQA
10
+ from langchain_groq import ChatGroq
11
+ from langchain_community.document_loaders import UnstructuredExcelLoader
12
+
13
# Read the Groq API key from the environment; an unset variable yields "".
groq_api_key = os.getenv("GROQ_API_KEY", "")
15
+
16
# National Archives of Korea web pages to index.
# The list is assembled from a shared site root plus the per-page query values,
# and expands to exactly the same URLs, in the same order, as a literal list.
_ARCHIVES_ROOT = "https://archives.go.kr/next"

# `id` query values for the listSubjectDescription.do pages, in load order.
_SUBJECT_DESCRIPTION_IDS = (
    "003140", "003288", "003290", "003292", "008757",
    "003293", "003294", "003295", "003289", "010816",
    "010817", "009154", "003260", "003278", "003281",
    "003283", "003284", "003280", "003282", "003287",
    "003286", "003285", "003279", "003141", "003143",
    "003144", "003142", "008653", "010827", "008582",
    "008663", "008581", "010828", "010830", "010831",
    "003145", "009425", "003146", "010821", "003151",
    "003149", "003148", "008655", "008654", "003150",
)

# `guideSeq` query values for the searchGuideDetail.do pages, in load order.
_SEARCH_GUIDE_SEQS = (
    "441", "381", "341", "261", "227", "59", "30",
    "64", "321", "124", "267", "141", "149", "22",
)

urls = [
    f"{_ARCHIVES_ROOT}/newsearch/listSubjectContent.do?subjectFieldId=000011",
    *(
        f"{_ARCHIVES_ROOT}/newsearch/listSubjectDescription.do"
        f"?id={subject_id}&pageFlag=A&sitePage=1-2-1"
        for subject_id in _SUBJECT_DESCRIPTION_IDS
    ),
    f"{_ARCHIVES_ROOT}/newmanager/recodeRegister.do",
    f"{_ARCHIVES_ROOT}/newtour/tourCourse.do",
    f"{_ARCHIVES_ROOT}/newrecordsMngPro/recordsDonateInfo.do",
    f"{_ARCHIVES_ROOT}/newdata/pepoleRecodPresentIntro.do",
    f"{_ARCHIVES_ROOT}/newsearch/searchGuideList.do",
    f"{_ARCHIVES_ROOT}/newsearch/searchGuideList.do?page=2",
    *(
        f"{_ARCHIVES_ROOT}/newsearch/searchGuideDetail.do?guideSeq={guide_seq}"
        for guide_seq in _SEARCH_GUIDE_SEQS
    ),
]
85
+
86
# Load the web pages listed in `urls`. SoupStrainer() is passed no criteria here.
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs={"parse_only": bs4.SoupStrainer()},
)
docs = loader.load()

# Excel workbooks holding the record listings
excel_files = [
    "교육 전반 관련 기록물 목록1.xls",
    "교육 전반 관련 기록물 목록2.xls",
    "교육 전반 관련 기록물 목록3.xls",
]

# Load each workbook in order and gather the documents it yields
excel_docs = [
    document
    for workbook in excel_files
    for document in UnstructuredExcelLoader(workbook).load()
]

# Combine: Excel documents are appended after the web documents
docs.extend(excel_docs)
105
+
106
# Split the combined documents into chunks on newlines
splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separator="\n",
)
split_docs = splitter.split_documents(docs)

# Embedding model, FAISS vector store built from the chunks, and its retriever
embedding_model = HuggingFaceEmbeddings(
    model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS",
)
vectorstore = FAISS.from_documents(documents=split_docs, embedding=embedding_model)
retriever = vectorstore.as_retriever()

# Groq-hosted chat model wired into a "stuff" retrieval QA chain
llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key)
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=llm,
)
118
+
119
# Gradio chat handler
def chat_with_history(user_input, history):
    """Answer one user question and append the exchange to the chat history.

    Args:
        user_input: Text submitted from the question box. ``None`` is treated
            like an empty string.
        history: List of ``(question, answer)`` tuples accumulated so far, or
            ``None`` to start a new history. The list is extended in place.

    Returns:
        A 3-tuple ``("", history, history)``: the empty string clears the
        input box, and the history is returned twice because the caller wires
        it to two outputs.
    """
    if history is None:
        history = []

    question = (user_input or "").strip()
    if not question:
        # Nothing was asked: do not send the bare language suffix to the LLM
        # and do not record an empty turn.
        return "", history, history

    # The suffix asks the model to answer in Korean.
    query = question + " 한국어로 답해주세요."
    # `invoke` replaces the deprecated direct call on the chain object.
    result = qa_chain.invoke({"query": query})
    answer = result.get("result", "답변을 찾을 수 없습니다.")

    # Keep the user's text exactly as typed in the visible history.
    history.append((user_input, answer))
    return "", history, history
128
+
129
# Build the Gradio interface: a title, the chat display, a question box,
# and a state value that carries the history between submissions.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 국가기록원 정보 챗봇")
    chatbot = gr.Chatbot(label="기록원 챗봇")
    msg = gr.Textbox(label="💬 질문 입력", placeholder="질문을 입력하세요")
    state = gr.State([])
    # Submitting the textbox runs the handler; its three return values go to
    # the textbox, the chat display, and the state, in that order.
    msg.submit(
        fn=chat_with_history,
        inputs=[msg, state],
        outputs=[msg, chatbot, state],
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-community
4
+ langchain-groq
5
+ faiss-cpu
6
+ beautifulsoup4
7
+ transformers
8
+ sentence-transformers
9
+ pandas
10
+ openpyxl
11
+ unstructured
교육 전반 관련 기록물 목록1.xls ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a55238219a4236b1b1a649ef024f7f71a324f4734eda8d392b92d8a3857721
3
+ size 192000
교육 전반 관련 기록물 목록2.xls ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b12fb2c236e03bc261db25633672198871d680f450fd1a4e1ba3979162a6780d
3
+ size 298496
교육 전반 관련 기록물 목록3.xls ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf6baa7cfc4d262f94a9ce6f10db7797020da8c4308a4a250f0f2a75b27f44d6
3
+ size 459264