File size: 5,565 Bytes
8e2b59e
4905955
8e2b59e
 
 
 
7b9e03b
f5e1acf
4905955
 
7b9e03b
 
8e2b59e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16137d7
 
 
 
8e2b59e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16137d7
 
 
8e2b59e
 
 
 
 
16137d7
 
 
 
8e2b59e
 
16137d7
 
 
 
8e2b59e
 
 
16137d7
4905955
8e2b59e
e0485bc
16137d7
4905955
 
 
8e2b59e
16137d7
 
 
 
8e2b59e
7d64aa2
 
 
d7cfcaf
 
 
4740109
 
 
8e2b59e
7d64aa2
8e2b59e
 
7b9e03b
 
8e2b59e
 
 
 
0e2f323
37dff90
0e2f323
8e2b59e
 
 
 
a9cbd1f
0e2f323
 
a9cbd1f
7b9e03b
8e2b59e
16137d7
8e2b59e
 
16137d7
8e2b59e
 
a9cbd1f
4905955
a9cbd1f
 
 
 
e0485bc
4905955
 
 
 
a9cbd1f
4905955
 
16137d7
0e2f323
7b9e03b
 
7d64aa2
 
 
7b9e03b
a9cbd1f
7b9e03b
d1bba55
7b9e03b
185b795
0e2f323
7b9e03b
0e2f323
7b9e03b
0e2f323
 
 
 
 
 
 
 
8e2b59e
0e2f323
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os
from cerebras.cloud.sdk import Cerebras
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr


# Read the Cerebras API key from the "LitReview" environment variable.
# NOTE(review): os.getenv returns None when the variable is unset — the
# client below would then fail at request time; confirm it is configured.
Cerekey = os.getenv("LitReview")

# Initialize Cerebras AI client with the API key
client = Cerebras(api_key = Cerekey)


def extract_text_from_file(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: File-like object with a ``name`` attribute (as supplied by
            Gradio's File input).

    Returns:
        str: The extracted text, or a message starting with
        "Unsupported file format" when the extension is not handled.
    """
    if file.name.endswith(".pdf"):
        reader = PdfReader(file)
        # extract_text() returns None for pages without a text layer;
        # substitute "" so the join never raises TypeError (the original
        # crashed with `text += None` on such pages).
        return "".join(page.extract_text() or "" for page in reader.pages)
    elif file.name.endswith(".docx"):
        doc = Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."

def chunk_text(text, max_tokens=4000):
    """Split *text* into chunks of at most ``max_tokens`` characters.

    Note: despite the parameter name, the limit is measured in characters
    of the space-joined chunk, not true model tokens — it is a
    conservative proxy for the model's context budget.

    Args:
        text (str): Text to split on whitespace.
        max_tokens (int): Maximum length (in characters) of each chunk.

    Returns:
        list[str]: Chunks whose words, concatenated in order, reproduce
        every word of the input. A single word longer than ``max_tokens``
        becomes its own (oversized) chunk, since words are never split.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0  # length of " ".join(current_chunk)

    for word in words:
        # +1 accounts for the separating space when the chunk is non-empty.
        added = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_len + added > max_tokens:
            # Flush BEFORE adding the word that would overflow, so finished
            # chunks respect the limit (the original flushed after the
            # overflow, so every chunk exceeded max_tokens).
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
            added = len(word)
        current_chunk.append(word)
        current_len += added
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def analyze_chunk(chunk):
    """Analyze a single text chunk with the Cerebras Llama model.

    Args:
        chunk (str): One chunk of document text.

    Returns:
        str: The model's streamed analysis of the chunk, or an error
        message when the API call fails.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format with et al if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]

    try:
        # Use Cerebras AI for processing (streamed completion).
        stream = client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        # Accumulate streamed deltas with join; the original reused the
        # name `chunk` for stream events, shadowing the parameter.
        pieces = []
        for event in stream:
            pieces.append(event.choices[0].delta.content or "")
        return "".join(pieces)
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"
def save_as_docx(content):
    """Write *content* into a new DOCX report and return its file path.

    Args:
        content (str): Text to place in the document body.

    Returns:
        str: Path of the saved file ("Literature_Analysis.docx").
    """
    output_path = "Literature_Analysis.docx"
    report = Document()
    report.add_heading("Literature Analysis", level=1)
    report.add_paragraph(content)
    report.save(output_path)
    return output_path
def analyze_document(file):
    """Process an uploaded document and stream analysis progress.

    Generator consumed by Gradio: yields Markdown status strings while
    chunks are analyzed, then the consolidated summary. On success the
    summary is also written to a DOCX file via save_as_docx().

    Args:
        file: Uploaded file object from Gradio's File input.

    Yields:
        str: Markdown-formatted progress messages, errors, and the final
        summary.
    """
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return

    chunks = chunk_text(text)
    all_insights = []

    yield "**Processing the document. Please wait...**\n"
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only append non-empty results
            all_insights.append(result)

    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return

    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )

    try:
        stream = client.chat.completions.create(
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        # Accumulate streamed deltas into the final summary text.
        final_summary = "".join(
            event.choices[0].delta.content or "" for event in stream
        )
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"
        return

    yield f"**Final Summary:**\n\n{final_summary}"
    # The original placed the DOCX generation in unreachable code after a
    # duplicated except clause (and referenced an undefined
    # `progress_output`); save the report on the success path instead.
    docx_path = save_as_docx(final_summary)
    yield f"**Saved DOCX report to:** {docx_path}"

    
# Define the Gradio interface: a single file upload in, Markdown out.
# analyze_document is a generator, so Gradio streams each yielded status
# message into the Markdown output as processing progresses.
interface = gr.Interface(
    fn= analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Progress and Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "It might take a while, be patient. You are advised to upload smaller documents with shorter text as it may take a while to process longer files."
    ),
)

# Launch the interface only when run as a script (not when imported).
if __name__ == "__main__":
    interface.launch()