Afeezee committed on
Commit 4740109 · verified · 1 Parent(s): 9fc8a1b

Update app.py

Files changed (1):
  1. app.py +44 -48
app.py CHANGED
@@ -26,11 +26,9 @@ def extract_text_from_file(file):
     else:
         return "Unsupported file format. Please upload a PDF or DOCX file."
 
+
 def chunk_text(text, max_tokens=4000):
-    """
-    Splits text into chunks small enough for the Llama model to process.
-    Each chunk is limited to `max_tokens` for safe processing.
-    """
+    """Splits text into manageable chunks."""
     words = text.split()
     chunks = []
     current_chunk = []
@@ -45,25 +43,19 @@ def chunk_text(text, max_tokens=4000):
 
     return chunks
 
+
 def analyze_chunk(chunk):
-    """
-    Analyzes a single chunk of text using the Cerebras model.
-    """
+    """Analyzes a single chunk using the Cerebras model."""
     messages = [
         {
             "role": "system",
             "content": (
                 "You are an experienced scholar tasked with analyzing research articles. "
-                "Focus on extracting insights based on: Author (APA format with et al if applicable), Year of publication, Title of the article; "
-                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
-                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
-                "Summarize only insights related to these fields and disregard irrelevant content."
+                "Focus on extracting insights such as Author, Year, Title; Problem addressed; "
+                "Methodology; Results; and Remarks. Only include relevant content."
             )
         },
-        {
-            "role": "user",
-            "content": chunk
-        }
+        {"role": "user", "content": chunk}
     ]
 
     try:
@@ -77,49 +69,43 @@ def analyze_chunk(chunk):
         )
         return stream.choices[0].message.content
     except Exception as e:
-        return f"An error occurred while processing a chunk: {e}"
+        return f"Error while processing chunk: {e}"
 
-def save_as_docx(content, file_name="Literature_Analysis.docx"):
-    """
-    Saves the given content to a DOCX file.
-
-    Parameters:
-    content (str): The text content to save.
-    file_name (str): The name of the DOCX file (default: 'Literature_Analysis.docx').
-    """
+
+def generate_docx(content):
+    """Generates a DOCX file from content."""
     document = Document()
     document.add_heading("Literature Analysis", level=1)
     document.add_paragraph(content)
-    document.save(file_name)
-    return file_name
+    file_path = "Literature_Analysis.docx"
+    document.save(file_path)
+    return file_path
+
 
 def analyze_document(file):
-    """Processes and analyzes the uploaded document."""
+    """Processes the document and generates insights."""
     text = extract_text_from_file(file)
     if text.startswith("Unsupported file format"):
-        yield f"**Error:** {text}"
-        return
+        return None, f"**Error:** {text}"
 
     chunks = chunk_text(text)
     all_insights = []
+    markdown_output = ""
 
-    yield "**Processing the document. Please wait...**\n"
     for i, chunk in enumerate(chunks, 1):
-        yield f"**Processing chunk {i} of {len(chunks)}...**"
         result = analyze_chunk(chunk)
         if result.strip():  # Only append non-empty results
             all_insights.append(result)
+            markdown_output += f"### Chunk {i} Analysis\n{result}\n\n"
 
     if not all_insights:
-        yield "**Error:** No valid insights were extracted from the document."
-        return
+        return None, "**Error:** No valid insights were extracted from the document."
 
-    yield "**Consolidating all insights into a final summary...**"
     consolidated_summary_prompt = (
-        "Below are insights extracted from multiple chunks of a document. "
+        "Below are insights extracted from multiple chunks. "
         "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
-        "Make the final output concise and coherent."
+        "Make the output concise and coherent."
    )
 
    try:
@@ -136,27 +122,37 @@ def analyze_document(file):
        )
        final_summary = ""
        for chunk in stream:
-            content = chunk.choices[0].delta.content or ""
-            final_summary += content
-
-        # Show the final summary first
-        yield f"**Final Summary:**\n\n{final_summary}"
+            final_summary += chunk.choices[0].delta.content or ""
 
-        # Save the final summary as a .docx file
-        docx_file = save_as_docx(final_summary)
-        yield f"**Download the DOCX file here:** [Download {docx_file}](file:///{docx_file})"
+        # Return the final summary for display, and the content for DOCX generation
+        return markdown_output + f"\n\n### Final Summary\n\n{final_summary}", final_summary
    except Exception as e:
-        yield f"**Error:** An error occurred during consolidation: {e}"
+        return None, f"**Error:** An error occurred during consolidation: {e}"
+
+
+def interface_logic(file):
+    """Handles the Gradio interface logic."""
+    markdown_output, docx_content = analyze_document(file)
+    if docx_content:
+        # Generate the DOCX file after analysis is complete
+        docx_file = generate_docx(docx_content)
+        return markdown_output, docx_file
+    else:
+        return markdown_output, None
+
 
-# Define the Gradio interface
+# Define Gradio interface
 interface = gr.Interface(
-    fn=analyze_document,
+    fn=interface_logic,
     inputs=gr.File(label="Upload a PDF or DOCX file"),
-    outputs=gr.Markdown(label="Literature Analysis"),
+    outputs=[
+        gr.Markdown(label="Literature Analysis"),
+        gr.File(label="Download Analysis as DOCX")
+    ],
     title="Automated Literature Review",
     description=(
         "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
-        "It might take a while, be patient. You are advised to upload smaller documents with shorter text as it may take a while to process longer files."
+        "Progress updates will be shown during processing. After analysis, you can download the report as a DOCX file."
    ),
 )
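
A quick way to exercise the new contract outside the UI. This is a minimal sketch, not part of the commit: it assumes app.py is importable as a module, that extract_text_from_file accepts a plain path, and that "sample.pdf" is a readable local file; the Cerebras client setup lives in the unchanged top of app.py.

    # interface_logic(file) now returns a pair that gr.Interface maps
    # positionally onto the two outputs: a markdown string for the
    # gr.Markdown component and a file path for the gr.File component.
    from app import interface_logic

    markdown, docx_path = interface_logic("sample.pdf")  # happy path assumed
    print(markdown[:300])  # per-chunk analyses, then "### Final Summary"
    print(docx_path)       # "Literature_Analysis.docx", written by generate_docx

Note that the refactor trades the old yield-based progress messages for a single return, so live progress now depends on Gradio's own pending indicator. If streaming updates were wanted back, one option (again hypothetical, not in this commit) is a generator handler, which Gradio streams when the queue is enabled:

    # Hypothetical streaming variant: yield (markdown, file) tuples as work
    # progresses instead of returning once; mirrors interface_logic otherwise.
    def interface_logic_streaming(file):
        yield "**Processing the document. Please wait...**", None
        markdown_output, docx_content = analyze_document(file)
        if docx_content:
            yield markdown_output, generate_docx(docx_content)
        else:
            yield markdown_output, None

    # Pass fn=interface_logic_streaming to gr.Interface and call
    # interface.queue() before launching; Gradio streams generator
    # output only when the queue is enabled.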