Keisuke Yamanaka - CNC
committed on
Commit
·
92ea1dc
1
Parent(s):
4e09058
update app.py
Browse files
app.py
CHANGED
@@ -89,12 +89,15 @@ class CNC_QA:
|
|
89 |
fpath, fname = split(fullpath)
|
90 |
fpath += '/'
|
91 |
# Get elements
|
|
|
92 |
raw_pdf_elements = self.extract_pdf_elements(fpath, fname)
|
93 |
|
94 |
# Get text, tables
|
|
|
95 |
texts, tables = self.categorize_elements(raw_pdf_elements)
|
96 |
|
97 |
# Optional: Enforce a specific token size for texts
|
|
|
98 |
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
99 |
chunk_size=4000, chunk_overlap=0
|
100 |
)
|
@@ -102,10 +105,12 @@ class CNC_QA:
|
|
102 |
texts_4k_token = text_splitter.split_text(joined_texts)
|
103 |
|
104 |
# Get text, table summaries
|
|
|
105 |
text_summaries, table_summaries = self.generate_text_summaries(
|
106 |
texts_4k_token, tables, summarize_texts=True
|
107 |
)
|
108 |
|
|
|
109 |
img_base64_list, image_summaries = self.generate_img_summaries(fpath)
|
110 |
return text_summaries,texts,table_summaries,tables,image_summaries,img_base64_list
|
111 |
|
|
|
89 |
fpath, fname = split(fullpath)
|
90 |
fpath += '/'
|
91 |
# Get elements
|
92 |
+
print('Get elements')
|
93 |
raw_pdf_elements = self.extract_pdf_elements(fpath, fname)
|
94 |
|
95 |
# Get text, tables
|
96 |
+
print('Get text, tables')
|
97 |
texts, tables = self.categorize_elements(raw_pdf_elements)
|
98 |
|
99 |
# Optional: Enforce a specific token size for texts
|
100 |
+
print('Optional: Enforce a specific token size for texts')
|
101 |
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
102 |
chunk_size=4000, chunk_overlap=0
|
103 |
)
|
|
|
105 |
texts_4k_token = text_splitter.split_text(joined_texts)
|
106 |
|
107 |
# Get text, table summaries
|
108 |
+
print('Get text, table summaries')
|
109 |
text_summaries, table_summaries = self.generate_text_summaries(
|
110 |
texts_4k_token, tables, summarize_texts=True
|
111 |
)
|
112 |
|
113 |
+
print('Image summaries')
|
114 |
img_base64_list, image_summaries = self.generate_img_summaries(fpath)
|
115 |
return text_summaries,texts,table_summaries,tables,image_summaries,img_base64_list
|
116 |
|