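"""Automated Literature Review.

A Gradio app that takes an uploaded PDF or DOCX research article, extracts its
text, and uses Groq's Llama 3.3 70B model to pull out and consolidate insights
(author, year, title, problem addressed, methodology, results, remarks).
"""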
import os
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr
# Groq API key, read from the "LitReview" environment variable (Space secret)
GROQ_API_KEY = os.getenv("LitReview")
# Initialize Groq client
client = Groq(api_key=GROQ_API_KEY)

def extract_text_from_file(file):
    """Extracts text from an uploaded PDF or DOCX file."""
    if file.name.endswith(".pdf"):
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for pages with no extractable text
            text += page.extract_text() or ""
        return text
    elif file.name.endswith(".docx"):
        doc = Document(file)
        text = "\n".join([p.text for p in doc.paragraphs])
        return text
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."

def chunk_text(text, max_tokens=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    Chunk size is approximated by character count: a chunk is closed as soon
    as it grows past `max_tokens` characters.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        current_chunk.append(word)
        if len(" ".join(current_chunk)) > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
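
# Rough illustration of the chunking behaviour (hypothetical numbers): a
# 10,000-character article split with the default limit of 4000 yields three
# chunks, each of which is later analyzed independently, e.g.
#   chunks = chunk_text(extract_text_from_file(uploaded_file))  # uploaded_file: hypothetical
#   print(len(chunks), [len(c) for c in chunks])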

def analyze_chunk(chunk):
    """
    Analyzes a single chunk of text using the Llama model.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format with et al. if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"

def analyze_document(file):
    """Processes and analyzes the uploaded document."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return

    chunks = chunk_text(text)
    all_insights = []

    yield "**Processing the document. Please wait...**\n"
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only append non-empty results
            all_insights.append(result)

    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return

    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        final_summary = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False
        )
        yield f"**Final Summary:**\n\n{final_summary.choices[0].message.content}"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"

# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Literature Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "Processing can take a while, especially for longer documents, so shorter files are recommended."
    ),
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
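
# To run outside the Space (a sketch, assuming the conventional app.py file
# name): install groq, PyPDF2, python-docx, and gradio, set the "LitReview"
# environment variable to a valid Groq API key, then run `python app.py` and
# open the local URL that Gradio prints.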