cite-as / steps /crossref.py
raannakasturi's picture
Upload 124 files
dc5b905 verified
import re
import requests
import requests_cache
from steps.core import MetadataStep, Step
from steps.utils import clean_doi, find_or_empty_string, get_webpage_text
class CrossrefResponseStep(Step):
    """Resolve the user's input to a DOI and fetch its CSL-JSON metadata.

    The step accepts a bare DOI, a doi.org URL, arbitrary text/HTML that
    contains a DOI, a Zenodo record URL, or a GitHub URL whose page carries a
    Zenodo badge.  Once a DOI is found, ``content_url`` is set to the
    corresponding ``https://doi.org/...`` URL and ``content`` to the JSON
    document returned for it.
    """

    step_links = [
        ("What is a DOI?", "https://project-thor.readme.io/docs/what-is-a-doi"),
        ("DOI metadata", "https://project-thor.readme.io/docs/accessing-doi-metadata"),
    ]
    step_intro = "A Digital Object Identifier (DOI) is a persistent identifier commonly used to uniquely identify scholarly papers, and increasingly used to identify datasets, software, and other research outputs."
    step_more = "A DOI is associated with all information needed to properly attribute it, including authors, title, and date of publication."

    # Characters that commonly cling to a DOI scraped from prose, BibTeX or
    # HTML attributes.  Stripped as one set so the result does not depend on
    # the order in which they appear (e.g. both ".}" and "}." are removed).
    _DOI_EDGE_JUNK = ",.'\"}"

    # Generic DOI matcher: literal "10.", a 4-9 digit registrant code, "/",
    # then the suffix.  The dot is escaped so that e.g. "1012345/x" is not
    # mistaken for a DOI.
    _DOI_PATTERN = r"10\.\d{4,9}\/[-._;()/:A-Za-z0-9+]+"

    # Upper bound (seconds) on the doi.org metadata request, so an
    # unresponsive resolver cannot hang the step indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 30

    @property
    def starting_children(self):
        """Return the step classes that may follow this one."""
        return [CrossrefResponseMetadataStep]

    def strip_junk_from_end_of_doi(self, doi):
        """Return ``doi`` with whitespace, HTML remnants and punctuation removed.

        The value is cut at the first ``">`` or ``</a>`` (left over when the
        DOI was lifted out of an anchor tag), stripped of surrounding
        punctuation, passed through the project's ``clean_doi`` helper and
        lower-cased.
        """
        doi = re.sub(r"\s+", "", doi)
        # str.split returns the whole string when the separator is absent, so
        # no membership test is needed before cutting.
        doi = doi.split('">')[0].split("</a>")[0]
        doi = doi.strip(self._DOI_EDGE_JUNK)
        return clean_doi(doi).lower()

    def extract_doi(self, text):
        """Find the first usable DOI in ``text``.

        ``text`` may be a Zenodo record URL (its page is fetched first), or
        any text/HTML.  Zenodo badges are tried first, then an explicit Zenodo
        DOI, then any DOI-shaped string.

        Returns the cleaned, lower-cased DOI, or ``None`` when nothing
        suitable is found.
        """
        if text.startswith("https://zenodo.org/record/"):
            text = get_webpage_text(text)

        # Badge image URLs of the form .../badge/doi/<doi>.svg embed the DOI.
        badge_doi = find_or_empty_string("://zenodo.org/badge/doi/(.+?).svg", text)
        if badge_doi:
            return self.strip_junk_from_end_of_doi(badge_doi)

        # "latestdoi" badges only carry a numeric id; follow the link and look
        # for the DOI on the page it leads to.
        latest_badge = find_or_empty_string(r"zenodo.org/badge/latestdoi/\d+", text)
        if latest_badge:
            text = get_webpage_text("https://" + latest_badge)

        zenodo_doi = find_or_empty_string(r"10\.5281\/zenodo\.\d+", text)
        if zenodo_doi:
            return self.strip_junk_from_end_of_doi(zenodo_doi)

        if "<html>" in text:
            # strip html tags before searching for dois
            text = re.sub("<[^<]+?>", "", text)

        candidates = re.findall(
            self._DOI_PATTERN, text, re.IGNORECASE | re.MULTILINE
        )
        for candidate in candidates:
            # The codemeta schema identifier looks like a DOI but is not a
            # citation target, so it is skipped.
            if "10.5063/schema/codemeta-2.0" not in candidate.lower():
                return self.strip_junk_from_end_of_doi(candidate)
        return None

    def set_content(self, input):
        """Resolve ``input`` to a DOI URL and load its CSL-JSON metadata.

        On success ``self.content`` holds the parsed JSON with its ``"URL"``
        key set to the doi.org URL.  If no DOI URL can be determined the
        method returns without touching ``self.content``; if the request or
        JSON decoding fails, a message is printed and the error is swallowed.
        """
        self.set_content_url(input)
        doi_url = self.content_url
        if not doi_url:
            return

        headers = {"Accept": "application/vnd.citationstyles.csl+json"}
        try:
            # Bypass the HTTP cache for the metadata lookup.
            with requests_cache.disabled():
                r = requests.get(
                    doi_url,
                    headers=headers,
                    timeout=self._REQUEST_TIMEOUT_SECONDS,
                )
                self.content = r.json()
                self.content["URL"] = doi_url
        except Exception:
            print("no doi metadata found for {}".format(doi_url))

    def set_content_url(self, input):
        """Set ``self.content_url`` to the doi.org URL derived from ``input``.

        ``input`` may be a bare DOI (``10.…``), a doi.org URL, free text/HTML
        containing a DOI, or a GitHub URL whose page shows a Zenodo badge.
        Nothing is changed when ``content_url`` is already set (and ``input``
        is not a bare DOI) or when no DOI can be found.
        """
        candidate = input
        if input.startswith("10."):
            pass  # already a bare DOI
        elif self.content_url:
            # A URL has already been set for this step; leave it untouched.
            return
        elif input.startswith("http") and "doi.org/10." in input:
            pass  # a doi.org URL; clean_doi below receives it as-is
        else:
            # needs to be refactored at some point
            candidate = self.extract_doi(input)
            if not candidate and input.startswith("http") and "github.com" in input:
                # find zenodo badges in github repositories
                candidate = self.extract_doi(get_webpage_text(input))
            if not candidate:
                return

        try:
            doi = clean_doi(candidate)
        except Exception:
            print("no doi found for {}".format(candidate))
            return

        self.content_url = "https://doi.org/{}".format(doi)
class CrossrefResponseMetadataStep(MetadataStep):
    """Metadata step that takes the Crossref response unchanged."""

    def set_content(self, input):
        """Store ``input`` as this step's content without transformation."""
        self.content = input