Spaces:

CYF200127
/

ChemEagle_API

Paused

App Files Files Community

CYF200127 commited on May 17

Commit

5d415f0

verified ·

1 Parent(s): 9b764c8

Update chemietoolkit/chemrxnextractor.py

Browse files

Files changed (1) hide show

chemietoolkit/chemrxnextractor.py +0 -107

chemietoolkit/chemrxnextractor.py CHANGED Viewed

@@ -1,107 +0,0 @@
-from PyPDF2 import PdfReader, PdfWriter
-import pdfminer.high_level
-import pdfminer.layout
-from operator import itemgetter
-import os
-import pdftotext
-from chemrxnextractor import RxnExtractor
-class ChemRxnExtractor(object):
-    def __init__(self, pdf, pn, model_dir, device):
-        self.pdf_file = pdf
-        self.pages = pn
-        self.model_dir = os.path.join(model_dir, "cre_models_v0.1") # directory saving both prod and role models
-        use_cuda = (device == 'cuda')
-        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=use_cuda)
-        self.text_file = "info.txt"
-        self.pdf_text = ""
-        if len(self.pdf_file) > 0:
-            with open(self.pdf_file, "rb") as f:
-                self.pdf_text = pdftotext.PDF(f)
-    def set_pdf_file(self, pdf):
-        self.pdf_file = pdf
-        with open(self.pdf_file, "rb") as f:
-            self.pdf_text = pdftotext.PDF(f)
-    def set_pages(self, pn):
-        self.pages = pn
-    def set_model_dir(self, md):
-        self.model_dir = md
-        self.rxn_extractor = RxnExtractor(self.model_dir)
-    def set_text_file(self, tf):
-        self.text_file = tf
-    def extract_reactions_from_text(self):
-        if self.pages is None:
-            return self.extract_all(len(self.pdf_text))
-        else:
-            return self.extract_all(self.pages)
-    def extract_all(self, pages):
-        ans = []
-        text = self.get_paragraphs_from_pdf(pages)
-        for data in text:
-            L = [sent for paragraph in data['paragraphs'] for sent in paragraph]
-            reactions = self.get_reactions(L, page_number=data['page'])
-            ans.append(reactions)
-        return ans
-    def get_reactions(self, sents, page_number=None):
-        rxns = self.rxn_extractor.get_reactions(sents)
-        ret = []
-        for r in rxns:
-            if len(r['reactions']) != 0: ret.append(r)
-        ans = {}
-        ans.update({'page' : page_number})
-        ans.update({'reactions' : ret})
-        return ans
-    def get_paragraphs_from_pdf(self, pages):
-        current_page_num = 1
-        if pages is None:
-            pages = len(self.pdf_text)
-        result = []
-        for page in range(pages):
-            content = self.pdf_text[page]
-            pg = content.split("\n\n")
-            L = []
-            for line in pg:
-                paragraph = []
-                if '\x0c' in line:
-                    continue
-                text = line
-                text = text.replace("\n", " ")
-                text = text.replace("- ", "-")
-                curind = 0
-                i = 0
-                while i < len(text):
-                    if text[i] == '.':
-                        if i != 0 and not text[i-1].isdigit() or i != len(text) - 1 and (text[i+1] == " " or text[i+1] == "\n"):
-                            paragraph.append(text[curind:i+1] + "\n")
-                            while(i < len(text) and text[i] != " "):
-                                i += 1
-                            curind = i + 1
-                    i += 1
-                if curind != i:
-                    if text[i - 1] == " ":
-                        if i != 1:
-                            i -= 1
-                        else:
-                            break
-                    if text[i - 1] != '.':
-                        paragraph.append(text[curind:i] + ".\n")
-                    else:
-                        paragraph.append(text[curind:i] + "\n")
-                L.append(paragraph)
-            result.append({
-                'paragraphs': L,
-                'page': current_page_num
-            })
-            current_page_num += 1
-        return result