cite-as / steps /user_input.py
raannakasturi's picture
Upload 124 files
dc5b905 verified
import re
from flask import abort
import requests
import validators
from steps.arxiv import ArxivResponseStep
from steps.bitbucket import BitbucketRepoStep
from steps.cran import CranLibraryStep
from steps.core import Step
from steps.crossref import CrossrefResponseStep
from steps.github import GithubRepoStep
from steps.google import GoogleStep
from steps.pmid import PMIDStep
from steps.pypi import PypiLibraryStep
from steps.webpage import WebpageStep
class UserInputStep(Step):
    """First step of the pipeline: turn raw user input into a starting URL.

    The input may be a DOI, a URL, free text containing a URL, an arXiv
    identifier (with or without an ``arxiv:`` prefix), a bare host name,
    or a free-text query that is passed through unchanged.
    """

    # First http(s) URL embedded anywhere in a piece of text.
    _URL_IN_TEXT = re.compile(r"(?P<url>https?://[^\s]+)")
    # New-style arXiv identifier: YYMM.NNNN or YYMM.NNNNN, optional "vN".
    _ARXIV_ID = re.compile(r"\d{4}\.\d{4,5}(v\d+)?")
    # Citation pages probed on readthedocs sites, relative to the docs root.
    _CITATION_PATHS = ("citation.html", "reference/citing.html")
    # Documentation roots that already name a language/version.
    _DOCS_ROOT_SUFFIXES = ("en/stable", "en/latest")

    @property
    def starting_children(self):
        """Return the step classes that may follow this step.

        NOTE(review): presumably the base ``Step`` tries these in the
        listed order -- confirm against ``steps.core``.
        """
        return [
            GoogleStep,
            CrossrefResponseStep,
            PMIDStep,
            ArxivResponseStep,
            GithubRepoStep,
            BitbucketRepoStep,
            CranLibraryStep,
            PypiLibraryStep,
            WebpageStep,
        ]

    def set_content_url(self, input):
        """Resolve *input* to a URL and store it in ``self.content_url``.

        Aborts the current request with HTTP 404 when the resolved URL uses
        the ``ftp://`` scheme. URLs containing ``readthedocs`` are replaced
        by the project's citation page when one can be found.
        """
        url = self.build_starting_url(input)
        if url.startswith("ftp://"):
            abort(404)
        if "readthedocs" in url:
            url = self.get_citation_html_file(url)
        self.content_url = url

    def set_content(self, input):
        """Derive ``self.content`` from the already-set ``self.content_url``.

        ``http://arxiv.org/abs/<id>`` becomes ``arxiv:<id>``; every other
        URL is stored unchanged. *input* is not used; it is kept so the
        signature stays the same for existing callers.
        """
        if self.content_url.startswith("http://arxiv"):
            self.content = self.content_url.replace("http://", "").replace(
                ".org/abs/", ":"
            )
        else:
            self.content = self.content_url

    def build_starting_url(self, input):
        """Return the URL (or search text) that *input* refers to.

        Checks are applied in order: DOI, full URL, URL embedded in text,
        ``arxiv:<id>``, bare arXiv id, bare host name. When none applies,
        *input* is returned unchanged and also stored in ``self.key_word``
        so it can be used as a search query.
        """
        # doi
        if input.startswith("10."):
            return "http://doi.org/{}".format(input)

        # web page
        if input.startswith(("http://", "https://")):
            return input

        # url in string
        embedded = self._URL_IN_TEXT.search(input)
        if embedded:
            return embedded.group("url")

        # arxiv, written as "arxiv:<id>"
        if input.lower().startswith("arxiv"):
            _, _, arxiv_id = input.partition(":")
            arxiv_id = arxiv_id.strip()
            # Without an id after a colon (e.g. "arxiv.org/abs/...") this is
            # not the prefixed form; fall through instead of raising.
            if arxiv_id:
                return "http://arxiv.org/abs/{}".format(arxiv_id)

        # arxiv ID only, like 1812.02329
        if self.is_arxiv_id(input):
            return "http://arxiv.org/abs/{}".format(input.strip())

        # add http to see if it is a valid URL
        if self.is_valid_url(input):
            return "http://{}".format(input)

        # google search
        self.key_word = input
        return input

    @staticmethod
    def is_arxiv_id(input):
        """Return True when *input* is exactly a new-style arXiv identifier.

        Accepts ``YYMM.NNNN`` and ``YYMM.NNNNN`` with an optional version
        suffix such as ``v2``; surrounding whitespace and letter case are
        ignored.
        """
        candidate = input.strip().lower()
        return UserInputStep._ARXIV_ID.fullmatch(candidate) is not None

    @staticmethod
    def is_valid_url(input):
        """Return True when ``http://<input>`` is well formed and answers 200.

        Performs a real GET request with a one-second timeout; any request
        failure counts as "not a valid URL".
        """
        url = "http://{}".format(input)
        if not validators.url(url):
            return False
        try:
            response = requests.get(url, timeout=1)
        except (requests.exceptions.RequestException, ValueError):
            # ValueError is caught defensively for malformed host names;
            # unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            return False
        return response.status_code == requests.codes.ok

    @staticmethod
    def get_citation_html_file(url):
        """Return the citation page of a readthedocs site, or *url* itself.

        Probes ``citation.html`` and ``reference/citing.html`` under the
        documentation root and returns the first one answering HTTP 200.
        When *url* does not already end in ``en/stable`` or ``en/latest``
        the ``en/stable`` root is assumed. *url* is returned unchanged if
        no candidate exists or a request fails.
        """
        # Normalise the trailing slash first so every candidate is joined
        # with exactly one "/" (a URL without one used to be glued directly
        # onto "en/stable/...").
        docs_root = url.rstrip("/")
        if not docs_root.endswith(UserInputStep._DOCS_ROOT_SUFFIXES):
            docs_root = "{}/en/stable".format(docs_root)

        citation_urls = [
            "{}/{}".format(docs_root, path)
            for path in UserInputStep._CITATION_PATHS
        ]

        # check if citation exists
        try:
            for citation_url in citation_urls:
                response = requests.get(citation_url, timeout=2)
                if response.status_code == 200:
                    return citation_url
        except requests.exceptions.RequestException:
            return url
        return url