import ast
import os
import re

import gradio as gr
import nbformat
import pdfplumber
import google.generativeai as genai
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell


def classify_page(statement):
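    """Ask Gemini to split one page of extracted PDF text into ("Code", block) and
    ("Text", block) tuples, e.g. [("Text", "Summary ..."), ("Code", "import pandas as pd")].
    """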
    # Read the API key from the environment instead of hard-coding it in the source.
    # NOTE: GEMINI_API_KEY is an assumed variable name; adjust it to your deployment.
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])

    generation_config = {
        "temperature": 0,  # deterministic classification output
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )

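    # A fresh chat session per call keeps the few-shot prompt below as the only context.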
    chat_session = model.start_chat(history=[])

    prompt = f"""
Group the following "Input" strings into substring blocks of "Code" or "Text".
The response content shall be strictly just a sequence of Python tuples where the first element of each tuple is either "Code" or "Text" and the second element is the corresponding grouped substring block.

Input:
# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.

The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.

# First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests

Then we access the website link, read the web page content and do some pre-processing.

fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()

# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")

# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")

# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")

response_content:
[("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.

The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
("Code", # First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests),
("Text", Then we access the website link, read the web page content and do some pre-processing.),
("Code", fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()

# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")

# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")

# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]

Now, classify this string:
Input: {statement}
"""
    response = chat_session.send_message(prompt)
    print(response.text)

    # Strip any ```python fences from the reply and drop non-printable characters,
    # then parse it into a list of ("Code"/"Text", block) tuples.
    response = response.text.replace("```python\n", "").replace("```", "").strip()
    response = re.sub(r"[^\x20-\x7E]", "", response)
    print(response)
    return ast.literal_eval(response)


def create_notebook(file, tc, bc):
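    """Crop each PDF page top and bottom by tc/bc points, classify the remaining text
    with Gemini, and write the resulting Code/Text blocks into a Jupyter notebook."""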
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            width, height = page.width, page.height
            top_crop = tc
            bottom_crop = bc

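            # pdfplumber bounding boxes are (x0, top, x1, bottom) in PDF points,
            # measured from the top-left corner of the page.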
            crop_box = (0, top_crop, width, height - bottom_crop)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()
            if not text:
                continue

            blocks = classify_page(text)

            for c, value in blocks:
                if c == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif c == "Text":
                    # Double the newlines so each line becomes its own Markdown paragraph,
                    # and strip stray brackets left over from the model's tuple formatting.
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(new_markdown_cell(value.replace('[[', '').replace(']', '')))

            print(f"Page No.{p+1} completed")

    # Write the notebook next to the source PDF, swapping the extension for .ipynb.
    file_path = file.split('.pdf')[0] + '.ipynb'
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)

    print(f'{file_path} notebook created successfully.')
    return file_path


with gr.Blocks() as app:
    gr.Markdown("""# PDF to IPython Notebook Converter App
## Upload a PDF document containing Python code and text, then press the 'Process File' button to download the IPython notebook.
### Adjust the Top Crop and Bottom Crop values based on how much of the top and bottom design content of your PDF document you want to eliminate.""")

    file_input = gr.File(label="Upload a PDF file")
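    # Crop sizes are in PDF points; they trim page headers/footers before text extraction.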
    tc = gr.Slider(label='Top Crop (points)', value=25)
    bc = gr.Slider(label='Bottom Crop (points)', value=25)

    download_button = gr.File(label="Download processed file")
    process_button = gr.Button("Process File")

    # Run the conversion on click; the returned .ipynb path feeds the download component.
    process_button.click(
        fn=create_notebook,
        inputs=[file_input, tc, bc],
        outputs=download_button,
    )

app.launch(debug=True)