Spaces: Running on Zero
Daryl Lim committed · commit 80a5f54 · Parent(s): f096bc8
Update app.py
app.py CHANGED
@@ -123,6 +123,57 @@ def convert_document_to_markdown(doc_path) -> str:
     except Exception as e:
         return f"Error converting document: {str(e)}"

+# Improved text processing function
+def clean_and_prepare_text(markdown_path):
+    """Load, clean and prepare document text for better processing"""
+    try:
+        # Load the document
+        loader = UnstructuredMarkdownLoader(str(markdown_path))
+        documents = loader.load()
+
+        if not documents:
+            return None, "No content could be extracted from the document."
+
+        # Combine all document content for pre-processing
+        raw_text = " ".join([doc.page_content for doc in documents])
+
+        # Clean up the text
+        # 1. Normalize whitespace
+        text = " ".join(raw_text.split())
+        # 2. Fix common OCR and conversion artifacts
+        text = text.replace(" .", ".").replace(" ,", ",")
+        # 3. Ensure proper spacing after punctuation
+        for punct in ['.', '!', '?']:
+            text = text.replace(f"{punct}", f"{punct} ")
+
+        # Split into improved documents
+        # Use a sensible paragraph size
+        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
+
+        # Create structured documents for better processing
+        processed_docs = []
+        for i, para in enumerate(paragraphs):
+            if len(para) > 10:  # Skip very short paragraphs
+                processed_docs.append(Document(
+                    page_content=para,
+                    metadata={"source": markdown_path, "paragraph": i}
+                ))
+
+        return processed_docs, None
+
+    except Exception as e:
+        return None, f"Error processing document text: {str(e)}"
+
+# Improved text splitting configuration
+def create_optimized_text_splitter():
+    """Create an optimized text splitter for document processing"""
+    return RecursiveCharacterTextSplitter(
+        chunk_size=800,  # Slightly smaller for more focused chunks
+        chunk_overlap=150,  # Increased overlap to maintain context
+        length_function=len,
+        separators=["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""]  # More comprehensive separators
+    )
+
 # Function to generate a summary using the IBM Granite model
 def generate_summary(chunks: List[Document], length_type="sentences", length_count=3):
     """Generate a summary from document chunks using the IBM Granite model
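For context, a minimal sketch of how the two new helpers compose, assuming the imports app.py already uses (UnstructuredMarkdownLoader, RecursiveCharacterTextSplitter, Document); the file path here is hypothetical:

# Hypothetical usage; "converted/report.md" is an illustrative path.
docs, err = clean_and_prepare_text("converted/report.md")
if err:
    raise RuntimeError(err)

splitter = create_optimized_text_splitter()
chunks = splitter.split_documents(docs)
print(f"{len(docs)} paragraphs -> {len(chunks)} chunks")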
@@ -132,14 +183,13 @@ def generate_summary(chunks: List[Document], length_type="sentences", length_cou
         length_type: Either "sentences" or "paragraphs"
         length_count: Number of sentences (1-10) or paragraphs (1-3)
     """
-    # Print debug information
+    # Print debug information
     print(f"Generating summary with length_type={length_type}, length_count={length_count}")

     # Ensure length_count is an integer
     try:
         length_count = int(length_count)
     except (ValueError, TypeError):
-        # Default to 3 if conversion fails
         print(f"Failed to convert length_count to int: {length_count}, using default 3")
         length_count = 3

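The coercion above is a small total operation; restated in isolation as a sketch (coerce_count is an illustrative name, not in app.py):

def coerce_count(value, default=3):
    # Mirrors generate_summary's int() coercion and fallback.
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

assert coerce_count("5") == 5
assert coerce_count(None) == 3    # TypeError -> default
assert coerce_count("many") == 3  # ValueError -> default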
@@ -149,18 +199,36 @@ def generate_summary(chunks: List[Document], length_type="sentences", length_cou
     else:  # paragraphs
         length_count = max(1, min(3, length_count))  # Limit to 1-3 paragraphs

-    # …
-    …
+    # Clean and concatenate the text from chunks
+    # Remove any excessive whitespace and normalize
+    cleaned_chunks = []
+    for chunk in chunks:
+        text = chunk.page_content
+        # Remove excessive newlines and whitespace
+        text = ' '.join(text.split())
+        cleaned_chunks.append(text)
+
+    combined_text = " ".join(cleaned_chunks)

-    # …
+    # More explicit and forceful prompt structure
     if length_type == "sentences":
-        length_instruction = f"…
+        length_instruction = f"Create a concise summary that is EXACTLY {length_count} complete sentences. Not {length_count-1} sentences. Not {length_count+1} sentences. EXACTLY {length_count} sentences."
     else:  # paragraphs
-        length_instruction = f"…
+        length_instruction = f"Create a concise summary that is EXACTLY {length_count} paragraphs. Each paragraph should be 2-4 sentences long. Not {length_count-1} paragraphs. Not {length_count+1} paragraphs. EXACTLY {length_count} paragraphs."

-    # …
+    # More detailed prompt with examples of what constitutes a sentence
     prompt = f"""<instruction>
-…
+You are an expert document summarizer. Your task is to create a high-quality summary of the following text.
+
+{length_instruction}
+
+Remember:
+- Your summary must capture the main points of the document
+- Your summary must be in your own words (not copied text)
+- Your summary must be clearly written and well-structured
+- Do not include any explanations, headings, bullet points, or additional formatting
+- Respond ONLY with the summary text itself
+
 </instruction>

 <text>
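For illustration, with length_type="sentences" and length_count=3 the added f-string renders as:

# length_instruction, rendered for length_count=3:
# "Create a concise summary that is EXACTLY 3 complete sentences.
#  Not 2 sentences. Not 4 sentences. EXACTLY 3 sentences."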
@@ -168,28 +236,30 @@ Knowledge Cutoff Date: April 2024. You are Granite, developed by IBM. You are a
 </text>
 """

-    # Calculate appropriate max_new_tokens
-    # Approximate tokens: ~15 tokens per sentence, ~75 tokens per paragraph
+    # Calculate appropriate max_new_tokens but with stricter limits
     if length_type == "sentences":
-        …
+        # Approximately 20 tokens per sentence
+        max_tokens = length_count * 40
     else:  # paragraphs
-        …
+        # Approximately 100 tokens per paragraph
+        max_tokens = length_count * 150

     # Ensure minimum tokens and add buffer
-    max_tokens = max(100, min(1500, max_tokens…
-    …
-    # Generate the summary using the IBM Granite model
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    max_tokens = max(100, min(1500, max_tokens))

     print(f"Using max_new_tokens={max_tokens}")

+    # Generate with lower temperature for more consistent results
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
     with torch.no_grad():
         output = model.generate(
             **inputs,
             max_new_tokens=max_tokens,
-            temperature=0.…
+            temperature=0.3,  # Lower temperature for more deterministic output
             top_p=0.9,
-            do_sample=True
+            do_sample=True,
+            repetition_penalty=1.2  # Discourage repetition
         )

     # Decode and return the generated summary
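The token budget above reduces to a pure function; a standalone sketch (budget is an illustrative name, not in app.py):

def budget(length_type, length_count):
    # Mirrors the max_new_tokens calculation: 40 tokens per sentence,
    # 150 per paragraph, clamped to the 100-1500 range.
    per_unit = 40 if length_type == "sentences" else 150
    return max(100, min(1500, length_count * per_unit))

assert budget("sentences", 3) == 120
assert budget("sentences", 1) == 100   # 40 raised to the 100-token floor
assert budget("paragraphs", 3) == 450  # after the earlier clamps, the 1500 ceiling is never hit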
@@ -197,6 +267,18 @@ Knowledge Cutoff Date: April 2024. You are Granite, developed by IBM. You are a

     # Extract just the generated response (after the prompt)
     summary = summary[len(tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)):]
+    summary = summary.strip()
+
+    # Post-process the summary to ensure it meets the length constraints
+    if length_type == "sentences":
+        # Simple sentence counting based on periods
+        sentences = [s.strip() for s in summary.split('.') if s.strip()]
+        if len(sentences) > length_count:
+            # Take only the requested number of sentences
+            summary = '. '.join(sentences[:length_count]) + '.'
+        elif len(sentences) < length_count:
+            # If we have too few sentences, log this issue
+            print(f"Warning: Generated only {len(sentences)} sentences instead of {length_count}")

     return summary.strip()

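The period-based trim above can be exercised on its own; a sketch with an illustrative over-long model output:

summary = "Granite is an IBM model family. It handles long inputs. It is open source. It also does more."
length_count = 3
sentences = [s.strip() for s in summary.split('.') if s.strip()]
if len(sentences) > length_count:
    summary = '. '.join(sentences[:length_count]) + '.'
# summary == "Granite is an IBM model family. It handles long inputs. It is open source."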
@@ -255,19 +337,15 @@ def process_document(
     if markdown_path.startswith("Error"):
         return markdown_path

-    # …
-    progress(0.4, "…
-    …
-    …
+    # Clean and prepare the text
+    progress(0.4, "Processing document text...")
+    processed_docs, error = clean_and_prepare_text(markdown_path)
+    if error:
+        return error

-    # …
-    text_splitter = …
-    …
-        chunk_overlap=100,
-        length_function=len,
-        separators=["\n\n", "\n", ".", " ", ""]  # Prioritize splitting at paragraph/sentence boundaries
-    )
-    texts = text_splitter.split_documents(documents)
+    # Split the documents with optimized splitter
+    text_splitter = create_optimized_text_splitter()
+    texts = text_splitter.split_documents(processed_docs)

     if not texts:
         return "No text could be extracted from the document."
@@ -302,36 +380,45 @@ def process_document(
         # Sleep briefly to allow memory cleanup
         time.sleep(0.1)

-        # …
-        if len(all_chunks) …
-        …
+        # Case 1: Very small documents - use all chunks directly
+        if len(all_chunks) <= 8:
+            return generate_summary(
+                all_chunks,
+                length_type=length_type.lower(),
+                length_count=length_count
+            )
+
+        # Case 2: Medium-sized documents - process in one batch
+        elif len(all_chunks) <= 16:
+            return generate_summary(
+                all_chunks[:8],  # Use first 8 chunks (usually contains most important info)
+                length_type=length_type.lower(),
+                length_count=length_count
+            )
+
+        # Case 3: Large documents - process in multiple batches
+        else:
+            # First pass: Generate summaries for each batch
             summaries = []
             for i in range(0, len(all_chunks), batch_size):
                 batch = all_chunks[i:i+batch_size]
                 summary = generate_summary(
                     batch,
-                    length_type=…
-                    length_count=…
+                    length_type="paragraphs",  # Use paragraphs for intermediate summaries
+                    length_count=1  # One paragraph per batch
                 )
                 summaries.append(summary)

                 # Force garbage collection
                 gc.collect()

-            # …
+            # Second pass: Generate final summary from batch summaries
             final_summary = generate_summary(
                 [Document(page_content=s) for s in summaries],
-                length_type=length_type,
+                length_type=length_type.lower(),
                 length_count=length_count
             )
             return final_summary
-        else:
-            # If we have few chunks, generate summary directly
-            return generate_summary(
-                all_chunks,
-                length_type=length_type,
-                length_count=length_count
-            )

     except Exception as e:
         return f"Error processing document: {str(e)}"
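The three-way routing above, restated as a standalone sketch (route is an illustrative name; batch_size comes from process_document's surrounding scope):

def route(n_chunks):
    # Mirrors the chunk-count thresholds in process_document.
    if n_chunks <= 8:
        return "summarize all chunks directly"
    elif n_chunks <= 16:
        return "summarize the first 8 chunks"
    return "one paragraph per batch, then a final summary pass"

assert route(5) == "summarize all chunks directly"
assert route(12) == "summarize the first 8 chunks"
assert route(40) == "one paragraph per batch, then a final summary pass"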
@@ -458,4 +545,4 @@ def create_gradio_interface():
 # Launch the application
 if __name__ == "__main__":
     app = create_gradio_interface()
-    app.launch()
+    app.launch()