import ast
import os
import re

import gradio as gr
import nbformat
import pdfplumber
import google.generativeai as genai
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell


def classify_page(statement):
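    """Ask Gemini to split one page of extracted PDF text into ("Code", block) and
    ("Text", block) tuples, e.g. [("Text", "Summary ..."), ("Code", "import pandas as pd")].
    """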
    # Read the API key from the environment instead of hard-coding it in the source.
    # NOTE: GEMINI_API_KEY is an assumed variable name; adjust it to your deployment.
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])

    generation_config = {
        "temperature": 0,  # deterministic classification output
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )

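    # A fresh chat session per call keeps the few-shot prompt below as the only context.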
    chat_session = model.start_chat(history=[])

    prompt = f"""
Group the following "Input" strings into substring blocks of "Code" or "Text".
The response content shall be strictly just a sequence of Python tuples where the first element of each tuple is either "Code" or "Text" and the second element is the corresponding grouped substring block.

Input:
# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.

The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.

# First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests

Then we access the website link, read the web page content and do some pre-processing.

fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()

# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")

# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")

# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")

response_content:
[("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.

The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
("Code", # First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests),
("Text", Then we access the website link, read the web page content and do some pre-processing.),
("Code", fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()

# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")

# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")

# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]

Now, classify this string:
Input: {statement}
"""
    response = chat_session.send_message(prompt)
    print(response.text)

    # Strip any ```python fences from the reply and drop non-printable characters,
    # then parse it into a list of ("Code"/"Text", block) tuples.
    response = response.text.replace("```python\n", "").replace("```", "").strip()
    response = re.sub(r"[^\x20-\x7E]", "", response)
    print(response)
    return ast.literal_eval(response)


def create_notebook(file, tc, bc):
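    """Crop each PDF page top and bottom by tc/bc points, classify the remaining text
    with Gemini, and write the resulting Code/Text blocks into a Jupyter notebook."""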
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            width, height = page.width, page.height
            top_crop = tc
            bottom_crop = bc

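            # pdfplumber bounding boxes are (x0, top, x1, bottom) in PDF points,
            # measured from the top-left corner of the page.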
            crop_box = (0, top_crop, width, height - bottom_crop)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()
            if not text:
                continue

            blocks = classify_page(text)

            for c, value in blocks:
                if c == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif c == "Text":
                    # Double the newlines so each line becomes its own Markdown paragraph,
                    # and strip stray brackets left over from the model's tuple formatting.
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(new_markdown_cell(value.replace('[[', '').replace(']', '')))

            print(f"Page No.{p+1} completed")

    # Write the notebook next to the source PDF, swapping the extension for .ipynb.
    file_path = file.split('.pdf')[0] + '.ipynb'
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)

    print(f'{file_path} notebook created successfully.')
    return file_path


with gr.Blocks() as app:
    gr.Markdown("""# PDF to IPython Notebook Converter App
## Upload a PDF document containing Python code and text, then press the 'Process File' button to download the IPython notebook.
### Adjust the Top Crop and Bottom Crop values based on how much of the top and bottom design content of your PDF document you want to eliminate.""")

    file_input = gr.File(label="Upload a PDF file")
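    # Crop sizes are in PDF points; they trim page headers/footers before text extraction.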
    tc = gr.Slider(label='Top Crop (points)', value=25)
    bc = gr.Slider(label='Bottom Crop (points)', value=25)

    download_button = gr.File(label="Download processed file")
    process_button = gr.Button("Process File")

    # Run the conversion on click; the returned .ipynb path feeds the download component.
    process_button.click(
        fn=create_notebook,
        inputs=[file_input, tc, bc],
        outputs=download_button,
    )

app.launch(debug=True)