Afeezee's picture
Update app.py
4740109 verified
raw
history blame
5.14 kB
import os
from cerebras.cloud.sdk import Cerebras
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr
# The Cerebras API key is read from the "LitReview" environment variable;
# the value is None when that variable is not set.
Cerekey = os.environ.get("LitReview")
# Module-wide Cerebras client shared by the model calls below.
client = Cerebras(api_key=Cerekey)
def extract_text_from_file(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: The upload to read. Either an object exposing its path through
            a ``name`` attribute or a plain path string. ``None`` (nothing
            uploaded) is accepted and treated as an unsupported upload.

    Returns:
        The extracted text. PDF pages are concatenated directly; DOCX
        paragraphs are joined with newlines. For anything that is not a
        ``.pdf`` or ``.docx`` file, the message
        ``"Unsupported file format. Please upload a PDF or DOCX file."``
        is returned instead of text.
    """
    unsupported = "Unsupported file format. Please upload a PDF or DOCX file."
    if file is None:
        return unsupported

    # Fall back to the object itself when it has no ``name`` so plain path
    # strings work; compare case-insensitively so "PAPER.PDF" is accepted.
    filename = str(getattr(file, "name", file)).lower()

    if filename.endswith(".pdf"):
        reader = PdfReader(file)
        # ``or ""`` guards against a page that yields no text, so the join
        # never fails on a non-string value.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if filename.endswith(".docx"):
        doc = Document(file)
        return "\n".join(p.text for p in doc.paragraphs)

    return unsupported
def chunk_text(text, max_tokens=4000):
    """Split text into whitespace-normalised chunks of bounded size.

    Words are taken from ``text.split()`` and re-joined with single spaces.
    Despite its name, ``max_tokens`` is compared against the character
    length of the joined chunk, not a model token count. A chunk is closed
    as soon as its length exceeds the limit, so a chunk may overshoot
    ``max_tokens`` by up to one word.

    Args:
        text: The text to split.
        max_tokens: Character-length threshold after which the current
            chunk is closed.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.
    """
    chunks = []
    current_chunk = []
    # Length of " ".join(current_chunk), maintained incrementally so the
    # chunk is not re-joined for every word.
    current_length = 0
    for word in text.split():
        # A separating space is only counted after the first word.
        current_length += len(word) + (1 if current_chunk else 0)
        current_chunk.append(word)
        if current_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def analyze_chunk(chunk):
    """Ask the Cerebras chat model to analyze one chunk of article text.

    Args:
        chunk: The text to send as the user message.

    Returns:
        The content of the first choice in the model's reply. If anything
        raises while calling the model or reading the reply, a string of
        the form ``"Error while processing chunk: <exception>"`` is
        returned instead of propagating the exception.
    """
    system_prompt = (
        "You are an experienced scholar tasked with analyzing research articles. "
        "Focus on extracting insights such as Author, Year, Title; Problem addressed; "
        "Methodology; Results; and Remarks. Only include relevant content."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": chunk},
    ]
    request_options = {
        "model": "llama-3.3-70b",
        "stream": False,
        "max_completion_tokens": 1024,
        "temperature": 0.2,
        "top_p": 1,
    }
    try:
        # Non-streaming call: the complete reply arrives in one response.
        response = client.chat.completions.create(
            messages=conversation, **request_options
        )
        return response.choices[0].message.content
    except Exception as err:
        return f"Error while processing chunk: {err}"
def generate_docx(content):
    """Write ``content`` into a Word document and return the file's path.

    The document consists of a level-1 "Literature Analysis" heading
    followed by ``content`` as a single paragraph. It is saved under the
    fixed relative path "Literature_Analysis.docx", so every call writes to
    the same location.

    Args:
        content: Text placed in the document body.

    Returns:
        The path the document was saved to.
    """
    output_path = "Literature_Analysis.docx"
    report = Document()
    report.add_heading("Literature Analysis", level=1)
    report.add_paragraph(content)
    report.save(output_path)
    return output_path
def analyze_document(file):
    """Analyze an uploaded document chunk by chunk, then consolidate.

    The document text is extracted, split with ``chunk_text``, and each
    chunk is analyzed with ``analyze_chunk``. The per-chunk results are then
    sent to the model once more to produce a single consolidated summary.

    Args:
        file: The uploaded file, passed to ``extract_text_from_file``.

    Returns:
        A 2-tuple.

        * On success: ``(markdown_report, final_summary)`` where
          ``markdown_report`` holds every chunk analysis followed by a
          "Final Summary" section, and ``final_summary`` is the
          consolidated text alone.
        * On failure: ``(None, error_message)`` where ``error_message``
          starts with ``"**Error:**"``.
    """
    try:
        text = extract_text_from_file(file)
    except Exception as e:
        # Unreadable uploads are reported like this function's other
        # failures instead of escaping as an exception.
        return None, f"**Error:** Could not read the document: {e}"
    if text.startswith("Unsupported file format"):
        return None, f"**Error:** {text}"

    all_insights = []
    sections = []
    for i, chunk in enumerate(chunk_text(text), 1):
        result = analyze_chunk(chunk)
        # NOTE: analyze_chunk reports its own failures as text, so such
        # messages are kept here like any other non-empty result.
        # The ``result and`` guard skips a missing (None) reply safely.
        if result and result.strip():
            all_insights.append(result)
            sections.append(f"### Chunk {i} Analysis\n{result}\n\n")

    if not all_insights:
        return None, "**Error:** No valid insights were extracted from the document."

    markdown_output = "".join(sections)
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the output concise and coherent."
    )
    try:
        stream = client.chat.completions.create(
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)},
            ],
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1,
        )
        # Streaming reply: concatenate the text deltas; a delta without
        # content contributes an empty string.
        final_summary = "".join(
            part.choices[0].delta.content or "" for part in stream
        )
    except Exception as e:
        return None, f"**Error:** An error occurred during consolidation: {e}"

    # First element is for display, second is the content for DOCX generation.
    return markdown_output + f"\n\n### Final Summary\n\n{final_summary}", final_summary
def interface_logic(file):
    """Run the analysis for the Gradio UI and package its two outputs.

    Args:
        file: The uploaded file handed over by the Gradio file input.

    Returns:
        A 2-tuple ``(markdown_text, docx_path)`` matching the interface's
        Markdown and File outputs. ``docx_path`` is ``None`` when no report
        was generated, in which case ``markdown_text`` carries the error
        message (or the analysis without a downloadable file).
    """
    markdown_output, docx_content = analyze_document(file)

    if markdown_output is None:
        # analyze_document reports a failure as (None, message). Show that
        # message in the Markdown panel rather than treating it as report
        # content and writing it into a DOCX file.
        return docx_content, None

    if docx_content:
        # Generate the DOCX file only after the analysis has completed.
        return markdown_output, generate_docx(docx_content)

    return markdown_output, None
# Gradio UI: a single file upload goes in; rendered Markdown and a
# downloadable DOCX file come out.
# NOTE(review): the description promises progress updates, but
# interface_logic returns its outputs once at the end - confirm the wording.
interface = gr.Interface(
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "Progress updates will be shown during processing. After analysis, you can download the report as a DOCX file."
    ),
    fn=interface_logic,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=[
        gr.Markdown(label="Literature Analysis"),
        gr.File(label="Download Analysis as DOCX"),
    ],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()