import os
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr

# Groq API key, read from the "LitReview" environment variable
GROQ_API_KEY = os.getenv("LitReview")

# Initialize Groq client
client = Groq(api_key=GROQ_API_KEY)

def extract_text_from_file(file):
    """Extracts text from an uploaded PDF or DOCX file."""
    if file.name.endswith(".pdf"):
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
        return text
    elif file.name.endswith(".docx"):
        doc = Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."

def chunk_text(text, max_tokens=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    Note: the limit is applied to the chunk's character count, so `max_tokens`
    is only a rough proxy for the actual token count.
    """
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(" ".join(current_chunk)) > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
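
# Note (illustrative): with the default 4,000-character limit, a ~10,000-character
# article is split into roughly three chunks, broken on word boundaries.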

def analyze_chunk(chunk):
    """
    Analyzes a single chunk of text using the Llama model.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format with et al if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]

    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"

def analyze_document(file):
    """Processes and analyzes the uploaded document."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return

    chunks = chunk_text(text)
    all_insights = []

    yield "**Processing the document. Please wait...**\n"
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only append non-empty results
            all_insights.append(result)

    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return

    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )

    try:
        final_summary = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False
        )
        yield f"**Final Summary:**\n\n{final_summary.choices[0].message.content}"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"

# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Literature Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will extract and consolidate its key insights. "
        "Processing can take a while; shorter documents are recommended, as longer files take noticeably longer."
    ),
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
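
# Optional quick check (a minimal sketch, not part of the app): the pipeline can be
# exercised from a Python shell without launching Gradio. The file "sample.pdf" and a
# valid Groq key in the "LitReview" environment variable are assumptions here.
#
#     text = extract_text_from_file(open("sample.pdf", "rb"))
#     chunks = chunk_text(text)
#     print(len(chunks), "chunks;", analyze_chunk(chunks[0])[:200])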