Spaces:
Running
Running
adibak
committed on
Commit
·
65c99df
1
Parent(s):
d1e63a2
Refactor PDF extraction logic for single page, improve comments for clarity, and revert placeholder text
Browse files
- app.py +10 -4
- helpers/file_manager.py +17 -8
app.py
CHANGED
|
@@ -299,15 +299,22 @@ def set_up_chat_ui():
|
|
| 299 |
# Check if pdf file is uploaded
|
| 300 |
# (we can use the same file if the user doesn't upload a new one)
|
| 301 |
if 'pdf_file' in st.session_state:
|
| 302 |
-
#
|
| 303 |
st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
|
| 304 |
st.session_state['pdf_file'],
|
| 305 |
st.session_state['start_page'],
|
| 306 |
st.session_state['end_page']
|
| 307 |
)
|
| 308 |
-
#Show sidebar text for page selection and file name
|
| 309 |
with st.sidebar:
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
# Get pdf contents
|
| 313 |
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
|
|
@@ -315,7 +322,6 @@ def set_up_chat_ui():
|
|
| 315 |
(st.session_state['start_page'],
|
| 316 |
st.session_state['end_page'])
|
| 317 |
)
|
| 318 |
-
|
| 319 |
provider, llm_name = llm_helper.get_provider_model(
|
| 320 |
llm_provider_to_use,
|
| 321 |
use_ollama=RUN_IN_OFFLINE_MODE
|
|
|
|
| 299 |
# Check if pdf file is uploaded
|
| 300 |
# (we can use the same file if the user doesn't upload a new one)
|
| 301 |
if 'pdf_file' in st.session_state:
|
| 302 |
+
# Get validated page range
|
| 303 |
st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
|
| 304 |
st.session_state['pdf_file'],
|
| 305 |
st.session_state['start_page'],
|
| 306 |
st.session_state['end_page']
|
| 307 |
)
|
| 308 |
+
# Show sidebar text for page selection and file name
|
| 309 |
with st.sidebar:
|
| 310 |
+
if st.session_state['end_page'] is None: # If the PDF has only one page
|
| 311 |
+
st.text('Extracting page %d in %s' % (
|
| 312 |
+
st.session_state['start_page'], st.session_state['pdf_file'].name
|
| 313 |
+
))
|
| 314 |
+
else:
|
| 315 |
+
st.text('Extracting pages %d to %d in %s' % (
|
| 316 |
+
st.session_state['start_page'], st.session_state['end_page'], st.session_state['pdf_file'].name
|
| 317 |
+
))
|
| 318 |
|
| 319 |
# Get pdf contents
|
| 320 |
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
|
|
|
|
| 322 |
(st.session_state['start_page'],
|
| 323 |
st.session_state['end_page'])
|
| 324 |
)
|
|
|
|
| 325 |
provider, llm_name = llm_helper.get_provider_model(
|
| 326 |
llm_provider_to_use,
|
| 327 |
use_ollama=RUN_IN_OFFLINE_MODE
|
helpers/file_manager.py
CHANGED
|
@@ -30,14 +30,19 @@ def get_pdf_contents(
|
|
| 30 |
|
| 31 |
reader = PdfReader(pdf_file)
|
| 32 |
|
| 33 |
-
start, end = page_range #
|
| 34 |
-
|
| 35 |
-
print(f"Name: {pdf_file.name} Page range: {start} to {end}")
|
| 36 |
text = ''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
for page_num in range(start - 1, end):
|
| 38 |
-
|
| 39 |
-
text += page.extract_text()
|
| 40 |
|
|
|
|
| 41 |
return text
|
| 42 |
|
| 43 |
def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
|
@@ -52,13 +57,17 @@ def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
|
| 52 |
"""
|
| 53 |
n_pages = len(PdfReader(pdf_file).pages)
|
| 54 |
|
| 55 |
-
#
|
| 56 |
start = max(1, start)
|
| 57 |
|
| 58 |
-
#
|
| 59 |
end = min(n_pages, end)
|
| 60 |
|
| 61 |
-
if start > end: #
|
| 62 |
start = 1
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
return start, end
|
|
|
|
| 30 |
|
| 31 |
reader = PdfReader(pdf_file)
|
| 32 |
|
| 33 |
+
start, end = page_range # Set start and end per the range (user-specified values)
|
| 34 |
+
|
|
|
|
| 35 |
text = ''
|
| 36 |
+
|
| 37 |
+
if end is None:
|
| 38 |
+
# If end is None (where PDF has only 1 page or start = end), extract start
|
| 39 |
+
end = start
|
| 40 |
+
|
| 41 |
+
# Get the text from the specified page range
|
| 42 |
for page_num in range(start - 1, end):
|
| 43 |
+
text += reader.pages[page_num].extract_text()
|
|
|
|
| 44 |
|
| 45 |
+
|
| 46 |
return text
|
| 47 |
|
| 48 |
def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
|
|
|
| 57 |
"""
|
| 58 |
n_pages = len(PdfReader(pdf_file).pages)
|
| 59 |
|
| 60 |
+
# Set start to max of 1 or specified start (whichever's higher)
|
| 61 |
start = max(1, start)
|
| 62 |
|
| 63 |
+
# Set end to min of pdf length or specified end (whichever's lower)
|
| 64 |
end = min(n_pages, end)
|
| 65 |
|
| 66 |
+
if start > end: # If the start is higher than the end, make it 1
|
| 67 |
start = 1
|
| 68 |
|
| 69 |
+
if start == end:
|
| 70 |
+
# If start = end (including when PDF is 1 page long), set end to None
|
| 71 |
+
return start, None
|
| 72 |
+
|
| 73 |
return start, end
|