# Automated Literature Review — Gradio app (hosted as a Hugging Face Space).
import os

import gradio as gr
from docx import Document
from groq import Groq
from PyPDF2 import PdfReader

# The Groq API key is supplied via the "LitReview" environment variable
# (configured as a secret on the hosting platform).
Groqkey = os.getenv("LitReview")
GROQ_API_KEY = Groqkey

# Single shared Groq client used by every completion call below.
client = Groq(api_key=GROQ_API_KEY)
def extract_text_from_file(file):
    """Extract the full text of an uploaded PDF or DOCX file.

    Args:
        file: A file-like object with a ``name`` attribute (as supplied by
            Gradio's file-upload component).

    Returns:
        The extracted text, or an error-message string when the file
        extension is neither ``.pdf`` nor ``.docx``.
    """
    # Lower-case the name so .PDF / .DOCX uploads are accepted too.
    name = file.name.lower()
    if name.endswith(".pdf"):
        reader = PdfReader(file)
        # extract_text() may return None for pages without a text layer;
        # coalesce to "" so the join never raises TypeError.
        return "".join(page.extract_text() or "" for page in reader.pages)
    elif name.endswith(".docx"):
        doc = Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."
def chunk_text(text, max_tokens=4000):
    """Split *text* into whitespace-delimited chunks for the LLM.

    NOTE: despite its name, ``max_tokens`` is measured in *characters* of
    the joined chunk, not model tokens. A chunk is closed as soon as its
    joined length exceeds the limit, so each chunk may run slightly past
    ``max_tokens`` (matching the original behavior).

    Args:
        text: The document text to split.
        max_tokens: Character budget per chunk.

    Returns:
        A list of chunk strings; empty list for empty/whitespace-only text.
    """
    chunks = []
    current_chunk = []
    # Track len(" ".join(current_chunk)) incrementally instead of
    # re-joining on every word, which was O(n^2) in the text length.
    current_len = 0
    for word in text.split():
        # A separator space is added only when the chunk is non-empty,
        # mirroring how " ".join counts characters.
        current_len += len(word) + (1 if current_chunk else 0)
        current_chunk.append(word)
        if current_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def analyze_chunk(chunk):
    """Run a single text chunk through the Llama model.

    Args:
        chunk: A piece of document text produced by ``chunk_text``.

    Returns:
        The model's analysis as a string, or an error-description string
        when the API call raises.
    """
    system_prompt = (
        "You are an experienced scholar tasked with analyzing research articles. "
        "Focus on extracting insights based on: Author (APA format with et al if applicable), Year of publication, Title of the article; "
        "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
        "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
        "Summarize only insights related to these fields and disregard irrelevant content."
    )
    try:
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chunk},
            ],
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"
def analyze_document(file):
    """Analyze an uploaded document, yielding progress updates and a summary.

    Generator consumed by the Gradio interface: each yielded Markdown
    string replaces the previous status message in the output pane.
    """
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return

    chunks = chunk_text(text)
    total = len(chunks)
    yield "**Processing the document. Please wait...**\n"

    all_insights = []
    for index, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {index} of {total}...**"
        insight = analyze_chunk(chunk)
        # Drop chunks for which the model produced no usable text.
        if insight.strip():
            all_insights.append(insight)

    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return

    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        final_summary = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)},
            ],
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False,
        )
        yield f"**Final Summary:**\n\n{final_summary.choices[0].message.content}"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"
# Define the Gradio interface: one file input, streaming Markdown output
# (analyze_document is a generator, so each yield updates the pane).
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Literature Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "It might take a while, be patient. You are advised to upload smaller documents with shorter text as it may take a while to process longer files."
    ),
)
# Launch the interface only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()