Spaces:
Runtime error
Runtime error
| import re | |
| import requests | |
| import requests_cache | |
| from steps.core import MetadataStep, Step | |
| from steps.utils import clean_doi, find_or_empty_string, get_webpage_text | |
class CrossrefResponseStep(Step):
    """Resolve an input string to a DOI URL and fetch CSL-JSON metadata for it.

    ``set_content`` is the entry point: it first derives ``self.content_url``
    (a ``https://doi.org/...`` URL) from the input via ``set_content_url`` and
    then requests that URL asking for CSL-JSON.
    """

    step_links = [
        ("What is a DOI?", "https://project-thor.readme.io/docs/what-is-a-doi"),
        ("DOI metadata", "https://project-thor.readme.io/docs/accessing-doi-metadata"),
    ]
    step_intro = "A Digital Object Identifier (DOI) is a persistent identifier commonly used to uniquely identify scholarly papers, and increasingly used to identify datasets, software, and other research outputs."
    step_more = "A DOI is associated with all information needed to properly attribute it, including authors, title, and date of publication."

    # Seconds to wait for the DOI resolver; without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    _request_timeout_seconds = 30

    def starting_children(self):
        """Return the step classes that follow this one."""
        return [CrossrefResponseMetadataStep]

    def strip_junk_from_end_of_doi(self, doi):
        """Return ``doi`` cleaned of whitespace, HTML leftovers and punctuation.

        The result is passed through the project's ``clean_doi`` helper and
        lower-cased. Any exception raised by ``clean_doi`` propagates.
        """
        doi = re.sub(r"\s+", "", doi)

        # Cut the candidate at leftover anchor markup. ``split`` returns the
        # whole string when the marker is absent, so no membership test is
        # needed.
        for marker in ('">', "</a>"):
            doi = doi.split(marker)[0]

        # Order matters: comma and period are typically the outermost
        # characters (end of a sentence or line), so they are removed before
        # the quotes and the closing brace.
        for junk in (",", ".", "'", '"', "}"):
            doi = doi.strip(junk)

        return clean_doi(doi).lower()

    def extract_doi(self, text):
        """Return the first DOI found in ``text``, or ``None`` if there is none.

        ``text`` may be a Zenodo record URL (its page is fetched first), or
        any text/HTML. Zenodo badge links and Zenodo DOIs are tried before the
        generic DOI pattern.
        """
        if text.startswith("https://zenodo.org/record/"):
            text = get_webpage_text(text)

        # Badge image URL that embeds the DOI itself.
        badge_doi = find_or_empty_string(
            r"://zenodo\.org/badge/doi/(.+?)\.svg", text
        )
        if badge_doi:
            return self.strip_junk_from_end_of_doi(badge_doi)

        # "latestdoi" badge link: it carries no DOI, so fetch the page it
        # points to and keep searching in that page instead.
        latest_badge = find_or_empty_string(
            r"zenodo\.org/badge/latestdoi/\d+", text
        )
        if latest_badge:
            text = get_webpage_text("https://" + latest_badge)

        zenodo_doi = find_or_empty_string(r"10\.5281/zenodo\.\d+", text)
        if zenodo_doi:
            return self.strip_junk_from_end_of_doi(zenodo_doi)

        if "<html>" in text:
            # strip html tags before searching for dois
            text = re.sub(r"<[^<]+?>", "", text)

        candidates = re.findall(
            r"10\.\d{4,9}/[-._;()/:A-Za-z0-9+]+", text, re.IGNORECASE | re.MULTILINE
        )
        for candidate in candidates:
            # The codemeta schema identifier looks like a DOI but is not a
            # citable object, so it is skipped.
            if "10.5063/schema/codemeta-2.0" not in candidate.lower():
                return self.strip_junk_from_end_of_doi(candidate)
        return None

    def set_content(self, input):
        """Fetch CSL-JSON metadata for the DOI derived from ``input``.

        On success ``self.content`` holds the decoded JSON with its ``"URL"``
        key set to the DOI URL. Any failure (network error, timeout,
        non-JSON response) is reported on stdout and leaves ``self.content``
        as it was.
        """
        # NOTE: ``input`` shadows the builtin; the name is kept so keyword
        # callers keep working.
        self.set_content_url(input)
        doi_url = self.content_url
        if not doi_url:
            return

        headers = {"Accept": "application/vnd.citationstyles.csl+json"}
        try:
            with requests_cache.disabled():
                r = requests.get(
                    doi_url,
                    headers=headers,
                    timeout=self._request_timeout_seconds,
                )
                self.content = r.json()
                self.content["URL"] = doi_url
        except Exception:
            # Best effort by design: a missing record is not fatal.
            print("no doi metadata found for {}".format(doi_url))

    def set_content_url(self, input):
        """Set ``self.content_url`` to a ``https://doi.org/...`` URL for ``input``.

        Accepted inputs, in order of precedence: a bare DOI (starts with
        ``10.``), a doi.org URL, text or a URL from which ``extract_doi`` finds
        a DOI, and finally a GitHub URL whose page contains a DOI. If
        ``self.content_url`` is already set and ``input`` is not a bare DOI,
        nothing is changed. When no DOI can be determined the method returns
        without modifying ``self.content_url``.
        """
        if not input.startswith("10."):
            if self.content_url:
                # An existing URL is never overwritten by a non-DOI input.
                return

            is_doi_url = input.startswith("http") and "doi.org/10." in input
            if not is_doi_url:
                # needs to be refactored at some point
                doi = self.extract_doi(input)
                if not doi and input.startswith("http") and "github.com" in input:
                    # find zenodo badges in github repositories
                    doi = self.extract_doi(get_webpage_text(input))
                if not doi:
                    return
                input = doi

        try:
            doi = clean_doi(input)
        except Exception:
            print("no doi found for {}".format(input))
            return

        self.content_url = "https://doi.org/{}".format(doi)
class CrossrefResponseMetadataStep(MetadataStep):
    """Metadata step whose content is exactly the value handed to it."""

    def set_content(self, input):
        """Store ``input`` unchanged as this step's ``content``."""
        # NOTE: ``input`` shadows the builtin; the parameter name is part of
        # the interface and is kept as-is.
        self.content = input