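"""Automated Literature Review -- a Gradio app that extracts text from an
uploaded PDF or DOCX file, analyzes it in chunks with a Cerebras-hosted
Llama model, consolidates the per-chunk insights into a single summary,
and saves the result as a downloadable DOCX file."""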
import os
from cerebras.cloud.sdk import Cerebras
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr
# Read the Cerebras API key from the environment (the variable is named
# "LitReview", typically configured as a Hugging Face Spaces secret).
cerebras_api_key = os.getenv("LitReview")

# Initialize the Cerebras AI client with the API key
client = Cerebras(api_key=cerebras_api_key)
def extract_text_from_file(file):
    """Extracts text from an uploaded PDF or DOCX file."""
    # Depending on the Gradio version, `file` may be a file-like object
    # (with a .name path) or a plain path string; support both.
    path = file.name if hasattr(file, "name") else file
    if path.endswith(".pdf"):
        reader = PdfReader(path)
        # extract_text() can return None for pages without a text layer
        return "".join(page.extract_text() or "" for page in reader.pages)
    elif path.endswith(".docx"):
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."
def chunk_text(text, max_chars=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    Chunk size is measured in characters (a rough proxy for tokens),
    capped at `max_chars` per chunk.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for word in words:
        current_chunk.append(word)
        current_len += len(word) + 1  # +1 for the joining space
        if current_len > max_chars:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
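# Example (a sketch, assuming the default max_chars=4000): a document of
# roughly 10,000 characters would be split into three chunks -- two of
# about 4,000 characters each and one holding the remainder.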
def analyze_chunk(chunk):
    """
    Analyzes a single chunk of text using the Cerebras Llama model.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format with et al. if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]
    try:
        # Use Cerebras AI for processing; the low temperature keeps the
        # extraction focused and mostly deterministic.
        stream = client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        # Accumulate the streamed response (named `event` so it does not
        # shadow the `chunk` parameter).
        result = ""
        for event in stream:
            result += event.choices[0].delta.content or ""
        return result
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"
def save_as_docx(content):
    """Generates and saves a DOCX file containing the analysis."""
    document = Document()
    document.add_heading("Literature Analysis", level=1)
    document.add_paragraph(content)
    file_path = "Literature_Analysis.docx"
    document.save(file_path)
    return file_path
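# Note: the DOCX is written to the app's working directory; on hosted
# environments such as Hugging Face Spaces this storage is ephemeral, so
# the file only persists for the lifetime of the running container.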
def analyze_document(file):
    """Processes and analyzes the uploaded document, yielding progress
    updates followed by the final summary and a downloadable DOCX file."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}", None
        return
    chunks = chunk_text(text)
    all_insights = []
    yield "**Processing the document. Please wait...**\n", None
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**", None
        result = analyze_chunk(chunk)
        if result.strip():  # Only keep non-empty results
            all_insights.append(result)
    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document.", None
        return
    yield "**Consolidating all insights into a final summary...**", None
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        stream = client.chat.completions.create(
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        final_summary = ""
        for event in stream:
            final_summary += event.choices[0].delta.content or ""
        # Generate the DOCX file after processing and yield it alongside
        # the final summary so it appears as a download in the UI.
        docx_file = save_as_docx(final_summary)
        yield f"**Final Summary:**\n\n{final_summary}", docx_file
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}", None
# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=[
        gr.Markdown(label="Progress and Analysis"),
        gr.File(label="Download the analysis as DOCX"),
    ],
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "Processing takes longer for larger documents, so smaller files with less text are recommended."
    ),
)
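# Note: analyze_document is a generator, so progress messages stream to the
# UI as they are yielded. On older Gradio releases this required enabling
# the queue explicitly (interface.queue()); recent versions enable it by default.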
# Launch the interface
if __name__ == "__main__":
    interface.launch()