Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -26,11 +26,9 @@ def extract_text_from_file(file):
|
|
26 |
else:
|
27 |
return "Unsupported file format. Please upload a PDF or DOCX file."
|
28 |
|
|
|
29 |
def chunk_text(text, max_tokens=4000):
|
30 |
-
"""
|
31 |
-
Splits text into chunks small enough for the Llama model to process.
|
32 |
-
Each chunk is limited to `max_tokens` for safe processing.
|
33 |
-
"""
|
34 |
words = text.split()
|
35 |
chunks = []
|
36 |
current_chunk = []
|
@@ -45,25 +43,19 @@ def chunk_text(text, max_tokens=4000):
|
|
45 |
|
46 |
return chunks
|
47 |
|
|
|
48 |
def analyze_chunk(chunk):
|
49 |
-
"""
|
50 |
-
Analyzes a single chunk of text using the Cerebras model.
|
51 |
-
"""
|
52 |
messages = [
|
53 |
{
|
54 |
"role": "system",
|
55 |
"content": (
|
56 |
"You are an experienced scholar tasked with analyzing research articles. "
|
57 |
-
"Focus on extracting insights
|
58 |
-
"
|
59 |
-
"Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
|
60 |
-
"Summarize only insights related to these fields and disregard irrelevant content."
|
61 |
)
|
62 |
},
|
63 |
-
{
|
64 |
-
"role": "user",
|
65 |
-
"content": chunk
|
66 |
-
}
|
67 |
]
|
68 |
|
69 |
try:
|
@@ -77,49 +69,43 @@ def analyze_chunk(chunk):
|
|
77 |
)
|
78 |
return stream.choices[0].message.content
|
79 |
except Exception as e:
|
80 |
-
return f"
|
81 |
|
82 |
-
def save_as_docx(content, file_name="Literature_Analysis.docx"):
|
83 |
-
"""
|
84 |
-
Saves the given content to a DOCX file.
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
file_name (str): The name of the DOCX file (default: 'Literature_Analysis.docx').
|
89 |
-
"""
|
90 |
document = Document()
|
91 |
document.add_heading("Literature Analysis", level=1)
|
92 |
document.add_paragraph(content)
|
93 |
-
|
94 |
-
|
|
|
|
|
95 |
|
96 |
def analyze_document(file):
|
97 |
-
"""Processes
|
98 |
text = extract_text_from_file(file)
|
99 |
if text.startswith("Unsupported file format"):
|
100 |
-
|
101 |
-
return
|
102 |
|
103 |
chunks = chunk_text(text)
|
104 |
all_insights = []
|
|
|
105 |
|
106 |
-
yield "**Processing the document. Please wait...**\n"
|
107 |
for i, chunk in enumerate(chunks, 1):
|
108 |
-
yield f"**Processing chunk {i} of {len(chunks)}...**"
|
109 |
result = analyze_chunk(chunk)
|
110 |
if result.strip(): # Only append non-empty results
|
111 |
all_insights.append(result)
|
|
|
112 |
|
113 |
if not all_insights:
|
114 |
-
|
115 |
-
return
|
116 |
|
117 |
-
yield "**Consolidating all insights into a final summary...**"
|
118 |
consolidated_summary_prompt = (
|
119 |
-
"Below are insights extracted from multiple chunks
|
120 |
"Consolidate these insights into a single output organized as follows: "
|
121 |
"Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
|
122 |
-
"Make the
|
123 |
)
|
124 |
|
125 |
try:
|
@@ -136,27 +122,37 @@ def analyze_document(file):
|
|
136 |
)
|
137 |
final_summary = ""
|
138 |
for chunk in stream:
|
139 |
-
|
140 |
-
final_summary += content
|
141 |
-
|
142 |
-
# Show the final summary first
|
143 |
-
yield f"**Final Summary:**\n\n{final_summary}"
|
144 |
|
145 |
-
#
|
146 |
-
|
147 |
-
yield f"**Download the DOCX file here:** [Download {docx_file}](file:///{docx_file})"
|
148 |
except Exception as e:
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
-
# Define
|
152 |
interface = gr.Interface(
|
153 |
-
fn=
|
154 |
inputs=gr.File(label="Upload a PDF or DOCX file"),
|
155 |
-
outputs=
|
|
|
|
|
|
|
156 |
title="Automated Literature Review",
|
157 |
description=(
|
158 |
"Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
|
159 |
-
"
|
160 |
),
|
161 |
)
|
162 |
|
|
|
26 |
else:
|
27 |
return "Unsupported file format. Please upload a PDF or DOCX file."
|
28 |
|
29 |
+
|
30 |
def chunk_text(text, max_tokens=4000):
|
31 |
+
"""Splits text into manageable chunks."""
|
|
|
|
|
|
|
32 |
words = text.split()
|
33 |
chunks = []
|
34 |
current_chunk = []
|
|
|
43 |
|
44 |
return chunks
|
45 |
|
46 |
+
|
47 |
def analyze_chunk(chunk):
|
48 |
+
"""Analyzes a single chunk using the Cerebras model."""
|
|
|
|
|
49 |
messages = [
|
50 |
{
|
51 |
"role": "system",
|
52 |
"content": (
|
53 |
"You are an experienced scholar tasked with analyzing research articles. "
|
54 |
+
"Focus on extracting insights such as Author, Year, Title; Problem addressed; "
|
55 |
+
"Methodology; Results; and Remarks. Only include relevant content."
|
|
|
|
|
56 |
)
|
57 |
},
|
58 |
+
{"role": "user", "content": chunk}
|
|
|
|
|
|
|
59 |
]
|
60 |
|
61 |
try:
|
|
|
69 |
)
|
70 |
return stream.choices[0].message.content
|
71 |
except Exception as e:
|
72 |
+
return f"Error while processing chunk: {e}"
|
73 |
|
|
|
|
|
|
|
74 |
|
75 |
+
def generate_docx(content, file_name="Literature_Analysis.docx"):
    """Write ``content`` to a DOCX report and return the saved file's path.

    The report consists of a level-1 "Literature Analysis" heading followed
    by ``content`` added through a single ``add_paragraph`` call.

    Args:
        content: Text to place in the body of the report.
        file_name: Name of the generated file (a bare file name, not a path).

    Returns:
        Path of the saved DOCX file, located in a newly created temporary
        directory.
    """
    # Function-scope imports keep this block self-contained regardless of
    # what the top of the file already imports.
    import os
    import tempfile

    document = Document()
    document.add_heading("Literature Analysis", level=1)
    document.add_paragraph(content)

    # A fixed relative path would be shared by every request and depend on
    # the process working directory; a fresh directory per report avoids
    # one request overwriting another's file while keeping a readable
    # download name.
    output_dir = tempfile.mkdtemp(prefix="literature_analysis_")
    file_path = os.path.join(output_dir, file_name)
    document.save(file_path)
    return file_path
|
83 |
+
|
84 |
|
85 |
def analyze_document(file):
|
86 |
+
"""Processes the document and generates insights."""
|
87 |
text = extract_text_from_file(file)
|
88 |
if text.startswith("Unsupported file format"):
|
89 |
+
return None, f"**Error:** {text}"
|
|
|
90 |
|
91 |
chunks = chunk_text(text)
|
92 |
all_insights = []
|
93 |
+
markdown_output = ""
|
94 |
|
|
|
95 |
for i, chunk in enumerate(chunks, 1):
|
|
|
96 |
result = analyze_chunk(chunk)
|
97 |
if result.strip(): # Only append non-empty results
|
98 |
all_insights.append(result)
|
99 |
+
markdown_output += f"### Chunk {i} Analysis\n{result}\n\n"
|
100 |
|
101 |
if not all_insights:
|
102 |
+
return None, "**Error:** No valid insights were extracted from the document."
|
|
|
103 |
|
|
|
104 |
consolidated_summary_prompt = (
|
105 |
+
"Below are insights extracted from multiple chunks. "
|
106 |
"Consolidate these insights into a single output organized as follows: "
|
107 |
"Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
|
108 |
+
"Make the output concise and coherent."
|
109 |
)
|
110 |
|
111 |
try:
|
|
|
122 |
)
|
123 |
final_summary = ""
|
124 |
for chunk in stream:
|
125 |
+
final_summary += chunk.choices[0].delta.content or ""
|
|
|
|
|
|
|
|
|
126 |
|
127 |
+
# Return the final summary for display, and the content for DOCX generation
|
128 |
+
return markdown_output + f"\n\n### Final Summary\n\n{final_summary}", final_summary
|
|
|
129 |
except Exception as e:
|
130 |
+
return None, f"**Error:** An error occurred during consolidation: {e}"
|
131 |
+
|
132 |
+
|
133 |
+
def interface_logic(file):
    """Run the document analysis and shape its result for the two UI outputs.

    Args:
        file: The value delivered by the file-upload input. ``None`` is
            treated as "nothing uploaded".

    Returns:
        A ``(markdown, docx_path)`` tuple. ``markdown`` is the text for the
        Markdown output (the analysis, or an error message). ``docx_path`` is
        the path of the generated DOCX report, or ``None`` when no report
        was produced.
    """
    if file is None:
        return "**Error:** Please upload a PDF or DOCX file.", None

    markdown_output, docx_content = analyze_document(file)

    # analyze_document reports failures as (None, "**Error:** ..."), i.e. the
    # message arrives in the second slot. Without this check the error text
    # would be written into a DOCX and the Markdown pane would stay empty.
    if markdown_output is None:
        return docx_content, None

    # No summary text means there is nothing to put in a report.
    if not docx_content:
        return markdown_output, None

    # Generate the DOCX file only after the analysis has completed.
    return markdown_output, generate_docx(docx_content)
|
142 |
+
|
143 |
|
144 |
+
# Define Gradio interface
|
145 |
interface = gr.Interface(
    fn=interface_logic,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    # Order matters: interface_logic returns (markdown, docx_path).
    outputs=[
        gr.Markdown(label="Literature Analysis"),
        gr.File(label="Download Analysis as DOCX"),
    ],
    title="Automated Literature Review",
    # The handler returns a single final result rather than streaming
    # intermediate messages, so the text must not promise progress updates.
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "The per-chunk analysis and final summary are shown once processing finishes. "
        "After analysis, you can download the report as a DOCX file."
    ),
)
|
158 |
|