Spaces:
Paused
Paused
Update chemietoolkit/chemrxnextractor.py
Browse files
chemietoolkit/chemrxnextractor.py
CHANGED
|
@@ -1,107 +0,0 @@
|
|
| 1 |
-
from PyPDF2 import PdfReader, PdfWriter
|
| 2 |
-
import pdfminer.high_level
|
| 3 |
-
import pdfminer.layout
|
| 4 |
-
from operator import itemgetter
|
| 5 |
-
import os
|
| 6 |
-
import pdftotext
|
| 7 |
-
from chemrxnextractor import RxnExtractor
|
| 8 |
-
|
| 9 |
-
class ChemRxnExtractor(object):
    """Extract reactions from the text of a PDF using ``RxnExtractor``.

    The PDF is read with ``pdftotext``; each page is split into paragraphs
    (blank-line separated chunks) and each paragraph into sentences, which
    are then passed to the reaction extraction model page by page.
    """

    def __init__(self, pdf, pn, model_dir, device):
        """Load the models and, when a PDF path is given, the PDF text.

        Args:
            pdf: Path to the PDF file. An empty value (``""`` or ``None``)
                defers loading until ``set_pdf_file`` is called.
            pn: Number of leading pages to process, or ``None`` for all.
            model_dir: Directory that contains the ``cre_models_v0.1`` folder.
            device: ``'cuda'`` to run the models on GPU; any other value
                disables CUDA.
        """
        self.pdf_file = pdf
        self.pages = pn
        self.model_dir = os.path.join(model_dir, "cre_models_v0.1")  # directory saving both prod and role models
        # Remembered so that set_model_dir() can rebuild the extractor on the
        # same device instead of silently using the library default.
        self.use_cuda = (device == 'cuda')
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)
        self.text_file = "info.txt"
        self.pdf_text = ""
        # Truthiness check: also tolerates pdf=None, which len() would not.
        if self.pdf_file:
            self._load_pdf_text()

    def _load_pdf_text(self):
        """Read ``self.pdf_file`` with pdftotext and store it in ``self.pdf_text``."""
        with open(self.pdf_file, "rb") as f:
            self.pdf_text = pdftotext.PDF(f)

    def set_pdf_file(self, pdf):
        """Switch to another PDF file and load its text."""
        self.pdf_file = pdf
        self._load_pdf_text()

    def set_pages(self, pn):
        """Set how many leading pages to process (``None`` means all)."""
        self.pages = pn

    def set_model_dir(self, md):
        """Point at another model directory and rebuild the extractor.

        ``md`` is used as given; unlike ``__init__`` no ``cre_models_v0.1``
        suffix is appended.
        """
        self.model_dir = md
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)

    def set_text_file(self, tf):
        """Set the name stored in ``self.text_file``."""
        self.text_file = tf

    def extract_reactions_from_text(self):
        """Extract reactions from the configured pages (all pages if unset).

        Returns:
            The list produced by ``extract_all``.
        """
        if self.pages is None:
            return self.extract_all(len(self.pdf_text))
        return self.extract_all(self.pages)

    def extract_all(self, pages):
        """Run reaction extraction on the first ``pages`` pages.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.

        Returns:
            One ``{'page': ..., 'reactions': ...}`` dict per processed page.
        """
        ans = []
        for data in self.get_paragraphs_from_pdf(pages):
            # Flatten the page's paragraphs into a single list of sentences.
            sents = [sent for paragraph in data['paragraphs'] for sent in paragraph]
            ans.append(self.get_reactions(sents, page_number=data['page']))
        return ans

    def get_reactions(self, sents, page_number=None):
        """Run the model on ``sents`` and keep results that contain reactions.

        Args:
            sents: List of sentence strings.
            page_number: Page label copied into the result.

        Returns:
            ``{'page': page_number, 'reactions': [...]}`` where the list holds
            only the model outputs whose ``'reactions'`` entry is non-empty.
        """
        rxns = self.rxn_extractor.get_reactions(sents)
        ret = [r for r in rxns if len(r['reactions']) != 0]
        return {'page': page_number, 'reactions': ret}

    @staticmethod
    def _split_sentences(text):
        """Split one paragraph's text into newline-terminated sentences.

        A period ends a sentence when the preceding character is not a digit
        or when the following character is whitespace, so that decimal
        numbers such as ``3.5`` are not split. After a split, characters up
        to the next space are skipped. A trailing remainder without a final
        period gets one appended.

        Returns:
            The list of sentences, or ``None`` when ``text`` is a single
            space and the chunk should be skipped.
        """
        sentences = []
        n = len(text)
        start = 0
        i = 0
        while i < n:
            if text[i] == '.':
                after_non_digit = i != 0 and not text[i - 1].isdigit()
                before_space = i != n - 1 and text[i + 1] in (" ", "\n")
                if after_non_digit or before_space:
                    sentences.append(text[start:i + 1] + "\n")
                    # Skip anything glued to the period up to the next space.
                    while i < n and text[i] != " ":
                        i += 1
                    start = i + 1
            i += 1
        if start != i:
            # There is unterminated text after the last sentence boundary.
            if text[i - 1] == " ":
                if i == 1:
                    return None
                i -= 1  # drop one trailing space
            if text[i - 1] != '.':
                sentences.append(text[start:i] + ".\n")
            else:
                sentences.append(text[start:i] + "\n")
        return sentences

    def get_paragraphs_from_pdf(self, pages):
        """Split the first ``pages`` pages of the PDF text into sentences.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.
                Values larger than the document are clamped to its length.

        Returns:
            A list with one dict per page: ``'paragraphs'`` is a list of
            paragraphs (each a list of sentence strings) and ``'page'`` is
            the 1-based page number.
        """
        available = len(self.pdf_text)
        # Clamp so an over-large page count (or no loaded PDF) cannot raise
        # IndexError when indexing self.pdf_text.
        pages = available if pages is None else min(pages, available)
        result = []
        for page in range(pages):
            content = self.pdf_text[page]
            paragraphs = []
            for chunk in content.split("\n\n"):
                if '\x0c' in chunk:
                    # Chunks containing a form feed are ignored.
                    continue
                text = chunk.replace("\n", " ").replace("- ", "-")
                sentences = self._split_sentences(text)
                if sentences is None:
                    # A lone-space chunk is skipped on its own; previously a
                    # `break` here discarded the rest of the page as well.
                    continue
                paragraphs.append(sentences)
            result.append({
                'paragraphs': paragraphs,
                'page': page + 1
            })
        return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|