Source code for scibib.data_query
"""
-------------------------------
The data_query module
-------------------------------
This module defines two classes, AuthorData and OrcidWork, for parsing
author data from ORCID and arXiv.
"""
from __future__ import annotations
from urllib import request, parse
import logging
import json
import os
from .abstract_collector import main_paragraph
import feedparser
from rapidfuzz.distance import DamerauLevenshtein
from .user_config import orcid_token
class AuthorData:
    """A class to parse Orcid author entries.

    Remote data (the ORCID works list and the arXiv atom feed) is fetched
    lazily, the first time the corresponding property is read, and then
    cached on the instance.
    """

    def __init__(self, orcid_id: str):
        """Instantiator

        Args:
            orcid_id (str): The author's orcid id
        """
        self.orcid_id = orcid_id
        # Caches filled on demand by the _set_* helpers below.
        self._orcid_record = None
        self._arxiv_record = None
        self._orcid_id_is_on_arxiv = None
        self._arxiv_summaries_dic = None

    def _set_orcid_record(self):
        """Download the author's works from the ORCID public API and cache them."""
        headers = {
            "Authorization": "Bearer %s" % orcid_token,
            "Accept": "application/json",
        }
        url = "https://pub.orcid.org/v2.0/%s/works" % self.orcid_id
        my_request = request.Request(url, headers=headers)
        # The context manager closes the connection once the JSON is parsed.
        with request.urlopen(my_request) as response:
            self._orcid_record = json.load(response)

    def _set_arxiv_record(self):
        """Open the author's arXiv atom feed and record whether it exists.

        On success the still-open response object is cached (it is consumed
        later by feedparser). On a network/HTTP failure the record is set to
        an empty string and the author is flagged as not being on arXiv.
        """
        url = "https://arxiv.org/a/%s.atom" % self.orcid_id
        my_request = request.Request(url)
        try:
            response = request.urlopen(my_request)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError. Catching only
            # those keeps the best-effort behaviour without swallowing
            # KeyboardInterrupt, SystemExit or programming errors.
            logging.debug(
                "Could not fetch the arxiv feed for %s: %s", self.orcid_id, err
            )
            self._orcid_id_is_on_arxiv = False
            self._arxiv_record = ""
        else:
            self._arxiv_record = response
            self._orcid_id_is_on_arxiv = True

    @property
    def orcid_record(self) -> dict:
        """The raw orcid record as a parsed json.

        Returns:
            dict: The raw orcid record as a parsed json (using json.load).
        """
        if self._orcid_record is None:
            self._set_orcid_record()
        return self._orcid_record

    @property
    def arxiv_record(self):
        """The raw arxiv record as an atom feed.

        Returns:
            The open response object returned by ``urlopen`` for the atom
            feed, or an empty string when the feed could not be fetched.
        """
        if self._arxiv_record is None:
            self._set_arxiv_record()
        return self._arxiv_record

    @property
    def articles(self) -> "list[OrcidWork]":
        """list of article entries in the author's Orcid entry.

        Returns:
            list[OrcidWork]: list of article entries, formatted as OrcidWork instances.
        """
        return [
            OrcidWork(item)
            for item in self.orcid_record["group"]
            if item["work-summary"][0]["type"] == "JOURNAL_ARTICLE"
        ]

    @property
    def orcid_id_is_on_arxiv(self) -> bool:
        """Check if the author associated his/her Arxiv with Orcid.

        Returns:
            bool: True if yes, False if no!
        """
        if self._orcid_id_is_on_arxiv is None:
            self._set_arxiv_record()
        return self._orcid_id_is_on_arxiv

    def _set_arxiv_summaries_dic(self):
        """Parse the arXiv feed into a ``{title: summary}`` dict and cache it."""
        record = self.arxiv_record
        try:
            parsed = feedparser.parse(record)
        finally:
            # Release the HTTP connection if the record is a response object
            # (it is a plain string when the feed could not be fetched).
            close = getattr(record, "close", None)
            if close is not None:
                close()
        # below, we reset _arxiv_record to None because it was destroyed by
        # feedparser.parse
        self._arxiv_record = None
        self._arxiv_summaries_dic = {
            entry.title: entry.summary for entry in parsed.entries
        }

    @property
    def arxiv_summaries_dic(self) -> dict:
        """Return dict that maps arxiv_entries -> abstracts for the author."""
        if self._arxiv_summaries_dic is None:
            self._set_arxiv_summaries_dic()
        return self._arxiv_summaries_dic

    def work_summary_from_arxiv(self, orcid_work: "OrcidWork") -> str:
        """Match work with an arxiv entry to provide a summary.

        An exact title match is preferred; otherwise the arXiv entry whose
        lower-cased title is most similar (normalized Damerau-Levenshtein)
        to the work's title is used and a warning is logged.

        Args:
            orcid_work (OrcidWork): the work that needs summary.

        Returns:
            str: The guessed summary

        Raises:
            IndexError: if no arXiv entry is available for this author, so
                there is nothing to match against.
        """
        dic_arxiv = self.arxiv_summaries_dic
        title = orcid_work.title
        if title in dic_arxiv:
            logging.info(
                "Abstract for %s found through a perfect title match on the arxiv.",
                title,
            )
            return dic_arxiv[title].replace("\n", "")

        if not dic_arxiv:
            # Previously this surfaced as an opaque "list index out of range".
            raise IndexError(
                "No arxiv entries are available for orcid id %s; "
                "cannot guess a summary for %s." % (self.orcid_id, title)
            )

        lowered_title = title.lower()
        # Iterating in reverse makes max() return the *last* best-scoring key
        # in insertion order, which is what the former stable sort + [-1] did.
        best_key = max(
            reversed(list(dic_arxiv)),
            key=lambda s: DamerauLevenshtein.normalized_similarity(
                s.lower(), lowered_title
            ),
        )
        logging.warning(
            "We used string similarity to find the summary of %s."
            "We used the summary of the arxiv entry %s "
            ". Please check this is a correct choice.",
            title,
            best_key,
        )
        # NOTE(review): unlike the exact-match branch, newlines are not
        # stripped here; kept as-is to preserve existing output.
        return dic_arxiv[best_key]
class OrcidWork:
    """A single work entry taken from an author's ORCID record.

    Wraps one item of the ``"group"`` list returned by the ORCID works
    endpoint and exposes its title, path, doi and derived data.
    """

    def __init__(self, work_data):
        """Instantiate single work object.

        Args:
            work_data (nested lists/dictionaries): part of a loaded json
                data corresponding to a single work, as obtained from orcid's
                API.
        """
        self.raw_data = work_data
        # Caches filled on demand.
        self._doi = None
        self._doi_bibtex = None
        self._orcid_bibtex = None

    @property
    def path(self):
        """Orcid path to the data."""
        return self.raw_data["work-summary"][0]["path"]

    @property
    def title(self):
        """Work title."""
        return self.raw_data["work-summary"][0]["title"]["title"]["value"]

    def _set_doi(self):
        """Find the work's own doi among its external ids and cache it.

        Raises:
            KeyError: if no external id is both of type ``doi`` and in a
                ``SELF`` relationship with the work.
        """
        ids = self.raw_data["work-summary"][0]["external-ids"]["external-id"]
        for entry in ids:
            if (
                entry["external-id-relationship"] == "SELF"
                and entry["external-id-type"] == "doi"
            ):
                self._doi = entry["external-id-value"]
                break
        if self._doi is None:
            raise KeyError(
                "No entry found in the orcid record to provide the doi of the article."
            )

    @property
    def doi(self) -> str:
        """The Work's doi.

        Returns:
            str: the doi.

        Raises:
            KeyError: if the orcid record holds no doi for this work.
        """
        if self._doi is None:
            self._set_doi()
        return self._doi

    @property
    def url_in_journal(self):
        """The ``https://doi.org/`` URL built from the work's doi."""
        return "https://doi.org/%s" % self.doi

    def _set_bibtex_from_doi(self):
        """Fetch the bibtex entry from doi.org (content negotiation) and cache it."""
        headers = {"Accept": "text/bibliography; style=bibtex"}
        my_request = request.Request(self.url_in_journal, headers=headers)
        # The context manager closes the connection once the body is read.
        with request.urlopen(my_request) as response:
            self._doi_bibtex = response.read().decode("utf-8")

    # We are not sure the method below gets any extra info on the work.
    def _work_details(self):
        """Download the full ORCID record of this work as parsed json."""
        headers = {
            "Authorization": "Bearer %s" % orcid_token,
            "Accept": "application/json",
        }
        # Use the public ``path`` property: there is no ``_path`` attribute,
        # so the previous ``self._path`` always raised AttributeError.
        url = "https://pub.orcid.org/v2.0%s" % self.path
        my_request = request.Request(url, headers=headers)
        with request.urlopen(my_request) as response:
            return json.load(response)

    @property
    def bibtex(self, source: str = "doi"):
        """Return the bibtex entry for self from source.

        Because this is a property, it is read as ``work.bibtex`` and
        ``source`` always takes its default value.

        Args:
            source (str, optional): Equals 'doi'. Defaults to 'doi'.
                Other sources might be available in the future.

        Raises:
            ValueError: if ``source`` is anything other than 'doi'.
        """
        if source != "doi":
            raise ValueError(
                "Currently, the only admissible value for the source parameter is 'doi'."
            )
        if self._doi_bibtex is None:
            self._set_bibtex_from_doi()
        return self._doi_bibtex

    def scrape_abstract(self):
        """Scrape the work's summary from the editor/journal's site.

        Beware that you might need authorization from the editor/journal to use this functionality.

        Returns:
            Whatever ``main_paragraph`` extracts from the page behind the
            work's doi URL (presumably the abstract text).
        """
        main_par = main_paragraph(self.url_in_journal)
        logging.warning(
            "We got the abstract for %s scraping the editor/journal's website."
            "Make sure the result is satisfactory and beware that you might need "
            "authorization from the editor/journal to use this functionality.",
            self.title,
        )
        # The scraped text was previously computed and then discarded.
        return main_par