Spaces:

barunsaha
/

slide-deck-ai

Running

App Files Community

adibak committed on Jun 24

Commit

65c99df

1 Parent(s): d1e63a2

Refactor PDF extraction logic for single page, improve comments for clarity, and revert placeholder text

Browse files

Files changed (2) hide show

app.py +10 -4
helpers/file_manager.py +17 -8

app.py CHANGED Viewed

@@ -299,15 +299,22 @@ def set_up_chat_ui():
         # Check if pdf file is uploaded
         # (we can use the same file if the user doesn't upload a new one)
         if 'pdf_file' in st.session_state:
-            # get validated page range
             st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
                                                                                     st.session_state['pdf_file'],
                                                                                     st.session_state['start_page'],
                                                                                     st.session_state['end_page']
                                                                                 )
-            #Show sidebar text for page selection and file name
             with st.sidebar:
-                st.text(f'Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {st.session_state["pdf_file"].name}')
             # Get pdf contents
             st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
@@ -315,7 +322,6 @@ def set_up_chat_ui():
                                                         (st.session_state['start_page'],
                                                         st.session_state['end_page'])
                                                     )
         provider, llm_name = llm_helper.get_provider_model(
             llm_provider_to_use,
             use_ollama=RUN_IN_OFFLINE_MODE

         # Check if pdf file is uploaded
         # (we can use the same file if the user doesn't upload a new one)
         if 'pdf_file' in st.session_state:
+            # Get validated page range
             st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
                                                                                     st.session_state['pdf_file'],
                                                                                     st.session_state['start_page'],
                                                                                     st.session_state['end_page']
                                                                                 )
+            # Show sidebar text for page selection and file name
             with st.sidebar:
+                if st.session_state['end_page'] is None:  # If the PDF has only one page
+                    st.text('Extracting page %d in %s' % (
+                        st.session_state['start_page'], st.session_state['pdf_file'].name
+                    ))
+                else:
+                    st.text('Extracting pages %d to %d in %s' % (
+                        st.session_state['start_page'], st.session_state['end_page'], st.session_state['pdf_file'].name
+                    ))
             # Get pdf contents
             st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
                                                         (st.session_state['start_page'],
                                                         st.session_state['end_page'])
                                                     )
         provider, llm_name = llm_helper.get_provider_model(
             llm_provider_to_use,
             use_ollama=RUN_IN_OFFLINE_MODE

helpers/file_manager.py CHANGED Viewed

@@ -30,14 +30,19 @@ def get_pdf_contents(
     reader = PdfReader(pdf_file)
-    start, end = page_range  # set start and end per the range (user-specified values)
-    print(f"Name: {pdf_file.name} Page range: {start} to {end}")
     text = ''
     for page_num in range(start - 1, end):
-        page = reader.pages[page_num]
-        text += page.extract_text()
     return text
 def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
@@ -52,13 +57,17 @@ def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
     """
     n_pages = len(PdfReader(pdf_file).pages)
-    # set start to max of 1 or specified start (whichever's higher)
     start = max(1, start)
-    # set end to min of pdf length or specified end (whichever's lower)
     end = min(n_pages, end)
-    if start > end:  # if the start is higher than the end, make it 1
         start = 1
     return start, end

     reader = PdfReader(pdf_file)
+    start, end = page_range  # Set start and end per the range (user-specified values)
     text = ''
+    if end is None:
+        # If end is None (the PDF has only 1 page, or start == end), extract only the start page
+        end = start
+    # Get the text from the specified page range
     for page_num in range(start - 1, end):
+        text += reader.pages[page_num].extract_text()
     return text
 def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
     """
     n_pages = len(PdfReader(pdf_file).pages)
+    # Set start to max of 1 or specified start (whichever's higher)
     start = max(1, start)
+    # Set end to min of pdf length or specified end (whichever's lower)
     end = min(n_pages, end)
+    if start > end:  # If the start is higher than the end, make it 1
         start = 1
+    if start == end:
+        # If start = end (including when PDF is 1 page long), set end to None
+        return start, None
     return start, end