# Source code for scibib.data_query

"""
-------------------------------
The data_query module
-------------------------------
This module defines two classes, AuthorData and OrcidWork, that parse
author data from Orcid and arXiv.
"""

from __future__ import annotations
from urllib import request, parse
import logging
import json
import os

from .abstract_collector import main_paragraph
import feedparser
from rapidfuzz.distance import DamerauLevenshtein


from .user_config import orcid_token


class AuthorData:
    """Parse an author's Orcid works and the matching arXiv feed.

    Nothing is downloaded when the object is created. The Orcid record,
    the arXiv feed and the title -> abstract mapping are each fetched the
    first time the corresponding property is read, then cached on the
    instance.
    """

    def __init__(self, orcid_id: str):
        """Store the orcid id and initialise the empty caches.

        Args:
            orcid_id (str): The author's orcid id.
        """
        self.orcid_id = orcid_id
        self._orcid_record = None
        self._arxiv_record = None
        self._orcid_id_is_on_arxiv = None
        self._arxiv_summaries_dic = None

    def _set_orcid_record(self):
        """Download the author's works from the public Orcid API and cache them."""
        headers = {
            "Authorization": "Bearer %s" % orcid_token,
            "Accept": "application/json",
        }
        url = "https://pub.orcid.org/v2.0/%s/works" % self.orcid_id
        my_request = request.Request(url, headers=headers)
        # The context manager closes the HTTP response once it is parsed.
        with request.urlopen(my_request) as response:
            self._orcid_record = json.load(response)

    def _set_arxiv_record(self):
        """Open the author's arXiv atom feed and record whether it exists.

        On success the open HTTP response is cached (it is consumed later by
        ``feedparser.parse``). On failure the record is set to an empty
        string and ``_orcid_id_is_on_arxiv`` to False.
        """
        url = "https://arxiv.org/a/%s.atom" % self.orcid_id
        my_request = request.Request(url)
        try:
            response = request.urlopen(my_request)
        except Exception:
            # Best effort: any failure to fetch the feed is read as "this
            # orcid id is not linked on arXiv". Unlike a bare ``except``,
            # this no longer traps KeyboardInterrupt / SystemExit.
            self._orcid_id_is_on_arxiv = False
            self._arxiv_record = ""
        else:
            self._arxiv_record = response
            self._orcid_id_is_on_arxiv = True

    @property
    def orcid_record(self) -> dict:
        """The raw orcid record as parsed json.

        Returns:
            dict: The raw orcid record, as returned by json.load. It is
                indexed by the "group" key in this class.
        """
        if self._orcid_record is None:
            self._set_orcid_record()
        return self._orcid_record

    @property
    def arxiv_record(self):
        """The raw arxiv record as an atom feed.

        Returns:
            The open HTTP response of the author's arXiv atom feed, or an
            empty string when the feed could not be fetched.
        """
        if self._arxiv_record is None:
            self._set_arxiv_record()
        return self._arxiv_record

    @property
    def articles(self) -> list[OrcidWork]:
        """List of article entries in the author's Orcid entry.

        Only groups whose first work summary has type "JOURNAL_ARTICLE"
        are kept.

        Returns:
            list[OrcidWork]: list of article entries, formatted as
                OrcidWork instances.
        """
        return [
            OrcidWork(item)
            for item in self.orcid_record["group"]
            if item["work-summary"][0]["type"] == "JOURNAL_ARTICLE"
        ]

    @property
    def orcid_id_is_on_arxiv(self) -> bool:
        """Check if the author associated his/her Arxiv with Orcid.

        Returns:
            bool: True if the arXiv feed for this orcid id could be
                fetched, False otherwise.
        """
        if self._orcid_id_is_on_arxiv is None:
            self._set_arxiv_record()
        return self._orcid_id_is_on_arxiv

    def _set_arxiv_summaries_dic(self):
        """Parse the arXiv feed into a title -> summary dictionary."""
        feed = feedparser.parse(self.arxiv_record)
        # below, we reset _arxiv_record to None because it was destroyed by
        # feedparser.parse
        self._arxiv_record = None
        self._arxiv_summaries_dic = {
            entry.title: entry.summary for entry in feed.entries
        }

    @property
    def arxiv_summaries_dic(self) -> dict:
        """Return dict that maps arxiv_entries -> abstracts for the author."""
        if self._arxiv_summaries_dic is None:
            self._set_arxiv_summaries_dic()
        return self._arxiv_summaries_dic

    def work_summary_from_arxiv(self, orcid_work: OrcidWork) -> str:
        """Match work with an arxiv entry to provide a summary.

        An exact title match is tried first. Otherwise the arXiv entry
        whose lower-cased title is most similar to the work's title
        (normalized Damerau-Levenshtein similarity) is used, and a warning
        is logged so the choice can be checked.

        Args:
            orcid_work (OrcidWork): the work that needs summary.

        Returns:
            str: The guessed summary.

        Raises:
            IndexError: If there is no exact match and the author has no
                arXiv entries to compare against.
        """
        dic_arxiv = self.arxiv_summaries_dic
        title = orcid_work.title

        if title in dic_arxiv:
            logging.info(
                "Abstract for %s found through a perfect title match on the arxiv.",
                title,
            )
            return dic_arxiv[title].replace("\n", "")

        if not dic_arxiv:
            # Previously this surfaced as "list index out of range"; the
            # exception type is kept so existing handlers still work.
            raise IndexError(
                "No arxiv entry is available for orcid id %s, so no summary "
                "can be matched to %s." % (self.orcid_id, title)
            )

        wanted = title.lower()
        # Iterating in reverse makes max() return the last of several
        # equally similar titles, as the former stable sort + [-1] did.
        best_key = max(
            reversed(list(dic_arxiv)),
            key=lambda s: DamerauLevenshtein.normalized_similarity(
                s.lower(), wanted
            ),
        )
        logging.warning(
            "We used string similarity to find the summary of %s. "
            "We used the summary of the arxiv entry %s. "
            "Please check this is a correct choice.",
            title,
            best_key,
        )
        # NOTE(review): unlike the exact-match branch, newlines are not
        # stripped here; kept as in the original, confirm which is intended.
        return dic_arxiv[best_key]
class OrcidWork:
    """A single work taken from an author's Orcid record.

    Wraps the raw json of one work group and exposes its title, path and
    doi. The doi and the bibtex entry are looked up on first access and
    cached on the instance.
    """

    def __init__(self, work_data):
        """Instantiate single work object.

        Args:
            work_data (nested lists/dictionaries): part of a loaded json
                data corresponding to a single work, as obtained from
                orcid's API.
        """
        self.raw_data = work_data
        self._doi = None
        self._doi_bibtex = None
        self._orcid_bibtex = None

    @property
    def path(self):
        """Orcid path to the data."""
        return self.raw_data["work-summary"][0]["path"]

    @property
    def title(self):
        """Work title."""
        return self.raw_data["work-summary"][0]["title"]["title"]["value"]

    def _set_doi(self):
        """Find the work's own doi among its external ids and cache it.

        Raises:
            KeyError: If no external id of type "doi" with relationship
                "SELF" is present.
        """
        summary = self.raw_data["work-summary"][0]
        # "external-ids" (or its "external-id" list) may be missing or
        # null; treat both as "no ids" so the KeyError below is raised
        # instead of a TypeError.
        ids = (summary.get("external-ids") or {}).get("external-id") or []
        for entry in ids:
            if (
                entry["external-id-relationship"] == "SELF"
                and entry["external-id-type"] == "doi"
            ):
                self._doi = entry["external-id-value"]
                break
        if self._doi is None:
            raise KeyError(
                "No entry found in the orcid record to provide the doi of the article."
            )

    @property
    def doi(self) -> str:
        """The Work's doi.

        Returns:
            str: the doi.

        Raises:
            KeyError: If the orcid record holds no doi for this work.
        """
        if self._doi is None:
            self._set_doi()
        return self._doi

    @property
    def url_in_journal(self):
        """The doi.org url built from the work's doi."""
        return "https://doi.org/%s" % self.doi

    def _set_bibtex_from_doi(self):
        """Request the bibtex entry from the doi url and cache it."""
        headers = {"Accept": "text/bibliography; style=bibtex"}
        my_request = request.Request(self.url_in_journal, headers=headers)
        # The context manager closes the HTTP response once it is read.
        with request.urlopen(my_request) as response:
            self._doi_bibtex = response.read().decode("utf-8")

    # We are not sure the method below gets any extra info on the work.
    def _work_details(self):
        """Download this work's record from the public Orcid API.

        Returns:
            The parsed json of the work, as returned by json.load.
        """
        headers = {
            "Authorization": "Bearer %s" % orcid_token,
            "Accept": "application/json",
        }
        # Uses the ``path`` property; ``self._path`` was never defined.
        url = "https://pub.orcid.org/v2.0%s" % self.path
        my_request = request.Request(url, headers=headers)
        with request.urlopen(my_request) as response:
            return json.load(response)

    @property
    def bibtex(self, source: str = "doi"):
        """Return the bibtex entry for self from source.

        Because this is a property, attribute access always uses the
        default source.

        Args:
            source (str, optional): Equals 'doi'. Defaults to 'doi'. Other
                sources might be available in the future.

        Raises:
            ValueError: If source is not 'doi'.
        """
        if source != "doi":
            raise ValueError(
                "Currently, the only admissible value for the source parameter is 'doi'."
            )
        if self._doi_bibtex is None:
            self._set_bibtex_from_doi()
        return self._doi_bibtex

    def scrape_abstract(self):
        """Scrape the work's summary from the editor/journal's site.

        Beware that you might need authorization from the editor/journal
        to use this functionality.

        Returns:
            The result of ``main_paragraph`` for the work's doi url
            (presumably the abstract text; verify against
            ``abstract_collector``).
        """
        main_par = main_paragraph(self.url_in_journal)
        logging.warning(
            "We got the abstract for %s scraping the editor/journal's website. "
            "Make sure the result is satisfactory and beware that you might need "
            "authorization from the editor/journal to use this functionality.",
            self.title,
        )
        # The scraped paragraph used to be computed and then discarded.
        return main_par