suhyun1 committed on
Commit
6169786
·
verified ·
1 Parent(s): b81d0e5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -0
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import bs4
4
+
5
+ from langchain_community.document_loaders import WebBaseLoader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.chains import RetrievalQA
10
+ from langchain_groq import ChatGroq
11
+
12
# Read the Groq API key from the environment; falls back to an empty
# string when GROQ_API_KEY is not set.
groq_api_key = os.getenv("GROQ_API_KEY", "")
14
+
15
# Web pages of the National Archives of Korea (archives.go.kr) to index.
# The pages share a handful of URL shapes, so only the varying identifiers
# are listed; the resulting `urls` list keeps the original order.
_ARCHIVES_BASE = "https://archives.go.kr/next"

# `id` values for the listSubjectDescription.do pages.
_SUBJECT_IDS = (
    "003140", "003288", "003290", "003292", "008757",
    "003293", "003294", "003295", "003289", "010816",
    "010817", "009154", "003260", "003278", "003281",
    "003283", "003284", "003280", "003282", "003287",
    "003286", "003285", "003279", "003141", "003143",
    "003144", "003142", "008653", "010827", "008582",
    "008663", "008581", "010828", "010830", "010831",
    "003145", "009425", "003146", "010821", "003151",
    "003149", "003148", "008655", "008654", "003150",
)

# `guideSeq` values for the searchGuideDetail.do pages.
_GUIDE_SEQS = (
    441, 381, 341, 261, 227, 59, 30, 64, 321, 124, 267, 141, 149, 22,
)

urls = [
    f"{_ARCHIVES_BASE}/newsearch/listSubjectContent.do?subjectFieldId=000011",
    *(
        f"{_ARCHIVES_BASE}/newsearch/listSubjectDescription.do"
        f"?id={subject_id}&pageFlag=A&sitePage=1-2-1"
        for subject_id in _SUBJECT_IDS
    ),
    f"{_ARCHIVES_BASE}/newmanager/recodeRegister.do",
    f"{_ARCHIVES_BASE}/newtour/tourCourse.do",
    f"{_ARCHIVES_BASE}/newrecordsMngPro/recordsDonateInfo.do",
    f"{_ARCHIVES_BASE}/newdata/pepoleRecodPresentIntro.do",
    f"{_ARCHIVES_BASE}/newsearch/searchGuideList.do",
    f"{_ARCHIVES_BASE}/newsearch/searchGuideList.do?page=2",
    *(
        f"{_ARCHIVES_BASE}/newsearch/searchGuideDetail.do?guideSeq={seq}"
        for seq in _GUIDE_SEQS
    ),
]
84
+
85
# Load the web pages listed in `urls`.
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs={"parse_only": bs4.SoupStrainer()},
)
docs = loader.load()

# Split the loaded documents on newlines into chunks of up to 500
# characters with a 50-character overlap.
splitter = CharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
    separator="\n",
)
split_docs = splitter.split_documents(docs)

# Embed the chunks, store them in a FAISS index, and expose a retriever.
embedding_model = HuggingFaceEmbeddings(
    model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS",
)
vectorstore = FAISS.from_documents(split_docs, embedding_model)
retriever = vectorstore.as_retriever()

# LLM + retrieval QA chain.
# NOTE(review): the model id is hard-coded; confirm it is still served by Groq.
llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
101
+
102
# Gradio chat handler
def chat_with_history(user_input, history):
    """Answer one chat message through the retrieval QA chain.

    Args:
        user_input: Text submitted by the user.
        history: List of ``(question, answer)`` tuples accumulated so far,
            or ``None`` when no conversation exists yet.

    Returns:
        A 3-tuple ``("", history, history)``: an empty string (used to clear
        the input box) followed by the conversation history twice, once for
        the chatbot display and once for the session state. Blank input
        returns the history unchanged without querying the chain.
    """
    history = [] if history is None else history

    question = (user_input or "").strip()
    if not question:
        # Nothing to ask: avoid sending a query that contains only the
        # "answer in Korean" suffix to the LLM.
        return "", history, history

    # The suffix asks the model to answer in Korean.
    query = question + " 한국어로 답해주세요."
    # `invoke` replaces the deprecated direct call `qa_chain({...})`.
    result = qa_chain.invoke({"query": query})
    # Fallback text: "Could not find an answer."
    answer = result.get("result", "답변을 찾을 수 없습니다.")

    # Build a new list instead of mutating the caller's history in place.
    updated_history = [*history, (user_input, answer)]
    return "", updated_history, updated_history
111
+
112
# Build the Gradio interface. The user-facing Korean strings below were
# mis-encoded and are restored to readable text.
with gr.Blocks() as demo:
    # Heading: "National Archives information chatbot"
    gr.Markdown("## 📚 국가기록원 정보 챗봇")
    # Label: "Archives chatbot"
    chatbot = gr.Chatbot(label="기록원 챗봇")
    # Placeholder: "Enter your question"; label: "Question input"
    msg = gr.Textbox(placeholder="질문을 입력하세요", label="💬 질문 입력")
    # Conversation history kept in Gradio state, starting empty.
    state = gr.State([])
    # Submitting the textbox runs the handler, which clears the textbox and
    # updates both the chatbot display and the stored history.
    msg.submit(
        chat_with_history,
        inputs=[msg, state],
        outputs=[msg, chatbot, state],
    )

demo.launch()