"""Step that turns raw user input into a starting URL for the step chain."""
import re

from flask import abort
import requests
import validators

from steps.arxiv import ArxivResponseStep
from steps.bitbucket import BitbucketRepoStep
from steps.cran import CranLibraryStep
from steps.core import Step
from steps.crossref import CrossrefResponseStep
from steps.github import GithubRepoStep
from steps.google import GoogleStep
from steps.pmid import PMIDStep
from steps.pypi import PypiLibraryStep
from steps.webpage import WebpageStep

# First http(s) URL embedded anywhere in a piece of text.
_EMBEDDED_URL_RE = re.compile(r"(?P<url>https?://\S+)")

# Bare arXiv identifier such as 1812.02329. Only the start of the string is
# anchored, so a trailing version suffix (e.g. "v2") is still accepted.
_ARXIV_ID_RE = re.compile(r"\d{4}\.\d{5}")


class UserInputStep(Step):
    """Interpret free-form user input (DOI, URL, arXiv id, keywords)."""

    @property
    def starting_children(self):
        """Return the step classes listed as children of this step."""
        return [
            GoogleStep,
            CrossrefResponseStep,
            PMIDStep,
            ArxivResponseStep,
            GithubRepoStep,
            BitbucketRepoStep,
            CranLibraryStep,
            PypiLibraryStep,
            WebpageStep,
        ]

    def set_content_url(self, input):
        """Resolve *input* to a URL and store it on ``self.content_url``.

        Aborts the request with a 404 when the resolved URL uses ``ftp://``.
        URLs containing "readthedocs" are swapped for the project's citation
        page when one can be found.
        """
        url = self.build_starting_url(input)
        if url.startswith("ftp://"):
            abort(404)
        if "readthedocs" in url:
            url = self.get_citation_html_file(url)
        self.content_url = url

    def set_content(self, input):
        """Derive ``self.content`` from the already-set ``self.content_url``.

        arXiv URLs are collapsed to the short ``arxiv:<id>`` form; every other
        URL is stored unchanged. *input* is not used here.
        """
        if self.content_url.startswith("http://arxiv"):
            # "http://arxiv.org/abs/1812.02329" -> "arxiv:1812.02329"
            without_scheme = self.content_url.replace("http://", "")
            self.content = without_scheme.replace(".org/abs/", ":")
        else:
            self.content = self.content_url

    def build_starting_url(self, input):
        """Map raw user input to the URL the step chain should start from.

        Checks are tried in order: DOI, plain web URL, URL embedded in text,
        ``arxiv:<id>``, bare arXiv id, host name that answers over HTTP.
        Anything else is returned unchanged as a search phrase and is also
        recorded on ``self.key_word``.
        """
        # doi
        if input.startswith("10."):
            return "http://doi.org/{}".format(input)

        # web page
        if input.startswith(("http://", "https://")):
            return input

        # url in string
        embedded = _EMBEDDED_URL_RE.search(input)
        if embedded:
            return embedded.group("url")

        # arxiv, written as "arxiv:<id>"
        if input.lower().startswith("arxiv"):
            _, separator, arxiv_id = input.partition(":")
            # Without a colon there is no id to extract; fall through to the
            # remaining checks instead of failing on a missing list element.
            if separator:
                return "http://arxiv.org/abs/{}".format(arxiv_id.strip())

        # arXiv ID only, like 1812.02329
        if self.is_arxiv_id(input):
            return "http://arxiv.org/abs/{}".format(input)

        # add http to see if it is a valid URL
        if self.is_valid_url(input):
            return "http://{}".format(input)

        # google search
        self.key_word = input
        return input

    @staticmethod
    def is_arxiv_id(input):
        """Return True when *input* starts with an id like ``1812.02329``."""
        return bool(_ARXIV_ID_RE.match(input))

    @staticmethod
    def is_valid_url(input):
        """Return True when ``http://<input>`` is well-formed and answers OK.

        Performs a live GET with a one-second timeout. Any request failure
        counts as "not a valid URL".
        """
        url = "http://{}".format(input)
        if not validators.url(url):
            return False
        try:
            response = requests.get(url, timeout=1)
        except (requests.exceptions.RequestException, ValueError):
            # Best-effort probe: an unreachable or malformed host simply
            # means the input is not treated as a URL.
            return False
        return response.status_code == requests.codes.ok

    @staticmethod
    def get_citation_html_file(url):
        """Return the citation page for a Read the Docs *url* if one exists.

        Two candidate pages are probed under the docs root (``en/stable/`` is
        assumed when *url* does not already end in ``en/stable`` or
        ``en/latest``). The first candidate answering HTTP 200 is returned;
        otherwise, or when a request fails, *url* is returned unchanged.
        """
        # citation paths
        citation_paths = ("citation.html", "reference/citing.html")

        # format url: make sure the base ends in exactly one docs root
        base = url if url.endswith("/") else url + "/"
        if not base.endswith(("en/stable/", "en/latest/")):
            base += "en/stable/"
        citation_urls = [base + path for path in citation_paths]

        # check if citation exists
        try:
            for citation_url in citation_urls:
                response = requests.get(citation_url, timeout=2)
                if response.status_code == 200:
                    return citation_url
        except requests.exceptions.RequestException:
            return url
        return url