import os
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr
# Groq API key, read from the "LitReview" environment variable (Space secret)
GROQ_API_KEY = os.getenv("LitReview")

# Initialize the Groq client
client = Groq(api_key=GROQ_API_KEY)
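# Fail fast if the secret is unset (this guard assumes the key should always
# be available at startup); otherwise a missing key only surfaces later as a
# less obvious authentication error on the first request.
if GROQ_API_KEY is None:
    raise RuntimeError("Groq API key not found. Set the 'LitReview' environment variable.")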
def extract_text_from_file(file):
    """Extracts text from an uploaded PDF or DOCX file."""
    if file.name.endswith(".pdf"):
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages with no text layer
            text += page.extract_text() or ""
        return text
    elif file.name.endswith(".docx"):
        doc = Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."
def chunk_text(text, max_tokens=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    The limit is enforced on character count, used here as a rough proxy
    for tokens.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for word in words:
        # Count the word plus the space that joins it to the current chunk
        current_len += len(word) + (1 if current_chunk else 0)
        current_chunk.append(word)
        if current_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
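# For example, chunk_text("alpha beta gamma delta", max_tokens=10) returns
# ["alpha beta gamma", "delta"]: a chunk may slightly overshoot the limit,
# because the word that crosses it stays in that chunk.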
def analyze_chunk(chunk):
    """
    Analyzes a single chunk of text using the Llama model.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format, with et al. if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"
def analyze_document(file):
    """Processes and analyzes the uploaded document."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return

    chunks = chunk_text(text)
    all_insights = []

    yield "**Processing the document. Please wait...**\n"
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only append non-empty results
            all_insights.append(result)

    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return

    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        final_summary = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False
        )
        yield f"**Final Summary:**\n\n{final_summary.choices[0].message.content}"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"
# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Literature Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its key insights. "
        "Processing can take a while, especially for longer documents, so smaller files are recommended."
    ),
)
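# Because analyze_document is a generator, Gradio streams each yielded status
# message to the Markdown output while the document is being processed.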
# Launch the interface
if __name__ == "__main__":
    interface.launch()