# -*- coding: utf-8 -*-
"""
Reference Search Webio
~~~~~~~~~~~~~~~~~~~~~~

Internet interfacing for reference_search.
"""
import time
import copy
import sys
import os
import re
import pymed
import scholarly
import habanero
import bs4
from . import helper_functions
from . import citation_parsing
from . import webio
from . import fileio
from . import user_input_checking
# Tool identifier shared with the webio module; passed as ``tool`` to
# pymed.PubMed whenever this module creates a PubMed API object.
TOOL = webio.TOOL
def build_pub_dict_from_PMID(PMID_list, from_email):
    """Query PubMed for each PMID and build a dictionary of the returned data.

    Args:
        PMID_list (list): A list of PMIDs as strings.
        from_email (str): An email address to use when querying PubMed.

    Returns:
        publication_dict (dict): keys are publication ids and values are a dictionary with publication attributes.
    """
    pubmed = pymed.PubMed(tool=TOOL, email=from_email)

    publication_dict = {}
    for PMID_to_search in PMID_list:
        for pub in pubmed.query(PMID_to_search, max_results=10):
            # Book articles are not handled as publications (same rule the
            # reference search applies through _pub_needs_skipped_PubMed).
            if _pub_needs_skipped_PubMed(pub):
                continue

            pub_id, pub_dict = helper_functions.create_pub_dict_for_saving_PubMed(pub)
            # A pub without a usable id cannot be keyed, so skip it rather
            # than indexing into a result that may not be a dictionary.
            if pub_id is None:
                continue

            # The query is a free text search, so only keep results whose
            # PMID is exactly the one that was asked for.
            if pub_dict["pubmed_id"] == PMID_to_search:
                publication_dict[pub_id] = pub_dict

    return publication_dict
# def search_references_on_PubMed(running_pubs, tokenized_citations, from_email, prev_query=None):
# """Searches PubMed for publications matching the citations.
# For each citation in tokenized_citations PubMed is queried for the publication.
# If the publication is already in running_pubs then missing information will be
# filled in if possible.
# Args:
# running_pubs (dict): dictionary of publications matching the JSON schema for publications.
# tokenized_citations (list): list of citations parsed from a source. Each citation is a dict {"authors", "title", "DOI", "PMID", "reference_line", "pub_dict_key"}.
# from_email (str): used in the query to PubMed.
# prev_query (list|None): a list of lists containing publications from a previous call to this function. [[pub1, ...], [pub1, ...], ...]
# Returns:
# running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
# matching_key_for_citation (list): list of keys to the publication matching the citation at the same index
# all_pubs (list): list of lists, each index is the pubs searched through after querying until the citation was matched
# """
# # initiate PubMed API
# pubmed = pymed.PubMed(tool=TOOL, email=from_email)
# all_pubs = []
# matching_key_for_citation = []
# for i, citation in enumerate(tokenized_citations):
# all_pubs.append([])
# if citation["PMID"]:
# query_string = citation["PMID"]
# elif citation["DOI"]:
# query_string = citation["DOI"]
# elif citation["title"]:
# query_string = citation["title"]
# else:
# matching_key_for_citation.append(None)
# continue
# publications = pubmed.query(query_string, max_results=10) if not prev_query else prev_query[i]
# citation_matched_to_pub = False
# for pub in publications:
# if not isinstance(pub, pymed.article.PubMedArticle):
# continue
# all_pubs[i].append(pub)
# pub_dict = helper_functions.create_pub_dict_for_saving_PubMed(pub)
# pub_id = pub_dict["doi"] if pub_dict["doi"] else pub_dict["pubmed_id"]
# ## Match publication to the citation.
# if citation["PMID"] == pub_dict["pubmed_id"]:
# citation_matched_to_pub = True
# elif pub.doi and citation["DOI"] and citation["DOI"].lower() == pub.doi.lower():
# citation_matched_to_pub = True
# else:
# has_matching_author = helper_functions.match_pub_authors_to_citation_authors(citation["authors"], pub_dict["authors"])
# if has_matching_author and helper_functions.do_strings_fuzzy_match(citation["title"], pub_dict["title"]):
# citation_matched_to_pub = True
# if matching_pub_id := helper_functions.get_pub_id_in_publication_dict(pub_id, pub.title, running_pubs):
# if "PubMed" in running_pubs[matching_pub_id]["queried_sources"]:
# if not citation_matched_to_pub:
# continue
# matching_key_for_citation.append(matching_pub_id)
# break
# helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
# running_pubs[matching_pub_id]["queried_sources"].append("PubMed")
# if citation_matched_to_pub:
# matching_key_for_citation.append(matching_pub_id)
# break
# else:
# if not citation_matched_to_pub:
# continue
# pub_dict["queried_sources"] = ["PubMed"]
# running_pubs[pub_id] = pub_dict
# matching_key_for_citation.append(pub_id)
# break
# if not citation_matched_to_pub:
# matching_key_for_citation.append(None)
# time.sleep(1)
# return running_pubs, matching_key_for_citation, all_pubs
# def search_references_on_Crossref(running_pubs, tokenized_citations, mailto_email, prev_query=None):
# """Searches Crossref for publications matching the citations.
# For each citation in tokenized_citations Crossref is queried for the publication.
# If the publication is already in running_pubs then missing information will be
# filled in if possible.
# Args:
# running_pubs (dict): dictionary of publications matching the JSON schema for publications.
# tokenized_citations (list): list of citations parsed from a source. Each citation is a dict {"authors", "title", "DOI", "PMID", "reference_line", "pub_dict_key"}.
# mailto_email (str): used in the query to Crossref.
# prev_query (list|None): a list of lists containing publications from a previous call to this function. [[pub1, ...], [pub1, ...], ...]
# Returns:
# running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
# matching_key_for_citation (list): list of keys to the publication matching the citation at the same index
# all_pubs (list): list of lists, each index is the pubs searched through after querying until the citation was matched
# """
# cr = habanero.Crossref(ua_string = "Academic Tracker (mailto:" + mailto_email + ")")
# all_pubs = []
# matching_key_for_citation = []
# for i, citation in enumerate(tokenized_citations):
# all_pubs.append([])
# if not prev_query:
# if citation["DOI"]:
# results = cr.works(ids = citation["DOI"])
# works = [results["message"]]
# elif citation["title"]:
# results = cr.works(query_bibliographic = citation["title"], filter = {"type":"journal-article"}, limit = 10)
# works = results["message"]["items"]
# else:
# matching_key_for_citation.append(None)
# continue
# else:
# works = prev_query[i]
# citation_matched_to_pub = False
# for work in works:
# all_pubs[i].append(work)
# pub_id, pub_dict = helper_functions.create_pub_dict_for_saving_Crossref(work, prev_query)
# if pub_id is None:
# continue
# if citation["DOI"] == pub_dict["doi"]:
# citation_matched_to_pub = True
# else:
# if "author" in work:
# has_matching_author = helper_functions.match_pub_authors_to_citation_authors(citation["authors"], pub_dict["authors"])
# if has_matching_author and helper_functions.do_strings_fuzzy_match(citation["title"], pub_dict["title"]):
# citation_matched_to_pub = True
# ## If the publication is already in running_pubs then try to update missing information.
# if matching_pub_id := helper_functions.get_pub_id_in_publication_dict(pub_id, pub_dict["title"], running_pubs):
# if "Crossref" in running_pubs[matching_pub_id]["queried_sources"]:
# if not citation_matched_to_pub:
# continue
# matching_key_for_citation.append(matching_pub_id)
# break
# helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
# running_pubs[matching_pub_id]["queried_sources"].append("Crossref")
# if citation_matched_to_pub:
# matching_key_for_citation.append(matching_pub_id)
# break
# else:
# if not citation_matched_to_pub:
# continue
# pub_dict["queried_sources"] = ["Crossref"]
# running_pubs[pub_id] = pub_dict
# matching_key_for_citation.append(pub_id)
# break
# if not citation_matched_to_pub:
# matching_key_for_citation.append(None)
# time.sleep(1)
# return running_pubs, matching_key_for_citation, all_pubs
def _citation_matches_pub(citation, pub_dict):
    """Decide whether a queried publication is the one the citation refers to.

    A match is declared on equal PMIDs, then on equal DOIs (compared case
    insensitively), and finally on a matching author plus a fuzzy title match.

    Args:
        citation (dict): tokenized citation with "PMID", "DOI", "authors", and "title" keys.
        pub_dict (dict): publication attributes with "pubmed_id", "doi", "authors", and "title" keys.

    Returns:
        (bool): True if the publication matches the citation, False otherwise.
    """
    if pub_dict["pubmed_id"] and citation["PMID"] and pub_dict["pubmed_id"] == citation["PMID"]:
        return True

    # DOIs are case insensitive, so normalize both sides before comparing.
    if pub_dict["doi"] and citation["DOI"] and citation["DOI"].lower() == pub_dict["doi"].lower():
        return True

    has_matching_author = helper_functions.match_pub_authors_to_citation_authors(citation["authors"], pub_dict["authors"])
    return bool(has_matching_author and helper_functions.do_strings_fuzzy_match(citation["title"], pub_dict["title"]))


def search_references_on_source(source, running_pubs, tokenized_citations, mailto_email, prev_query=None):
    """Search source for publications matching the citations.

    For each citation in tokenized_citations the source is queried for the publication.
    If the publication is already in running_pubs then missing information will be
    filled in if possible.

    Possible sources are "Crossref" or "PubMed". Any other value prints an
    error message and exits the program.

    Args:
        source (str): must be one of "Crossref" or "PubMed".
        running_pubs (dict): dictionary of publications matching the JSON schema for publications.
        tokenized_citations (list): list of citations parsed from a source. Each citation is a dict {"authors", "title", "DOI", "PMID", "reference_line", "pub_dict_key"}.
        mailto_email (str): email provided to the source when querying.
        prev_query (list|None): a list of lists containing publications from a previous call to this function. [[pub1, ...], [pub1, ...], ...]

    Returns:
        running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
        matching_key_for_citation (list): list of keys to the publication matching the citation at the same index
        all_pubs (list): list of lists, each index is the pubs searched through after querying until the citation was matched
    """
    # Initiate the API and pick the source specific helpers.
    if source == "PubMed":
        api = pymed.PubMed(tool=TOOL, email=mailto_email)
        query_function = _query_PubMed
        skip_pub_function = _pub_needs_skipped_PubMed
        create_pub_dict = helper_functions.create_pub_dict_for_saving_PubMed
    elif source == "Crossref":
        api = habanero.Crossref(ua_string = "Academic Tracker (mailto:" + mailto_email + ")")
        query_function = _query_Crossref
        skip_pub_function = _pub_needs_skipped_Crossref

        def create_pub_dict(pub):
            # The Crossref builder also receives prev_query, unlike the PubMed one.
            return helper_functions.create_pub_dict_for_saving_Crossref(pub, prev_query)
    else:
        helper_functions.vprint("Error: When searching references there was an attempt to query an unknown source, '" + source + "'.")
        # Nothing below can run without an API object, so stop here.
        sys.exit()

    all_pubs = []
    matching_key_for_citation = []
    for i, citation in enumerate(tokenized_citations):
        all_pubs.append([])

        if not prev_query:
            # None means the citation had nothing to query with.
            if not (publications := query_function(api, citation)):
                matching_key_for_citation.append(None)
                continue
        else:
            publications = prev_query[i]

        citation_matched_to_pub = False
        for pub in publications:
            if skip_pub_function(pub):
                continue

            all_pubs[i].append(pub)

            pub_id, pub_dict = create_pub_dict(pub)
            if pub_id is None:
                continue

            ## Match publication to the citation.
            citation_matched_to_pub = _citation_matches_pub(citation, pub_dict)

            ## If the publication is already in running_pubs then try to update missing information.
            if matching_pub_id := helper_functions.get_pub_id_in_publication_dict(pub_id, pub_dict["title"], running_pubs):
                if source in running_pubs[matching_pub_id]["queried_sources"]:
                    # Already merged from this source, so only record the match.
                    if not citation_matched_to_pub:
                        continue
                    matching_key_for_citation.append(matching_pub_id)
                    break

                helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
                running_pubs[matching_pub_id]["queried_sources"].append(source)
                if citation_matched_to_pub:
                    matching_key_for_citation.append(matching_pub_id)
                    break
            else:
                # Only publications that match the citation are added as new entries.
                if not citation_matched_to_pub:
                    continue
                pub_dict["queried_sources"] = [source]
                running_pubs[pub_id] = pub_dict
                matching_key_for_citation.append(pub_id)
                break

        # Keep matching_key_for_citation index aligned with tokenized_citations.
        if not citation_matched_to_pub:
            matching_key_for_citation.append(None)

        # Pause between citations so the source is not queried too rapidly.
        time.sleep(1)

    return running_pubs, matching_key_for_citation, all_pubs
def _query_PubMed(pubmed, citation):
    """Query PubMed with either the PMID, DOI, or title from citation.

    The first non-empty value among PMID, DOI, and title (in that order) is
    used as the query string.

    Args:
        pubmed (pymed.api.PubMed): api object from the pymed library.
        citation (dict): citation to query for.

    Returns:
        The result of pubmed.query() limited to 10 results, or None if the
        citation has no PMID, DOI, or title to query with.
    """
    if citation["PMID"]:
        query_string = citation["PMID"]
    elif citation["DOI"]:
        query_string = citation["DOI"]
    elif citation["title"]:
        query_string = citation["title"]
    else:
        # Nothing to search with.
        return None

    return pubmed.query(query_string, max_results=10)
def _query_Crossref(cr, citation):
    """Query Crossref with either the DOI or title from citation.

    A DOI is looked up directly and yields a single work. Otherwise the title
    is used for a bibliographic search restricted to journal articles.

    Args:
        cr (habanero.crossref.crossref.Crossref): api object from the habanero library.
        citation (dict): citation to query for.

    Returns:
        works (list|None): list of works returned by Crossref, or None if the
        citation has neither a DOI nor a title to query with.
    """
    if citation["DOI"]:
        results = cr.works(ids = citation["DOI"])
        # A lookup by id returns the work itself as the message.
        works = [results["message"]]
    elif citation["title"]:
        results = cr.works(query_bibliographic = citation["title"], filter = {"type":"journal-article"}, limit = 10)
        works = results["message"]["items"]
    else:
        # Nothing to search with.
        return None

    return works
def _pub_needs_skipped_PubMed(pub):
    """Determine whether the queried pub from PubMed should be skipped or not.

    Args:
        pub (pymed.article.PubMedArticle|pymed.book.PubMedBookArticle): publication queried from PubMed, book articles should be skipped.

    Returns:
        (bool): True if pub is not a pymed.article.PubMedArticle, False otherwise.
    """
    return not isinstance(pub, pymed.article.PubMedArticle)
def _pub_needs_skipped_Crossref(pub):
    """Determine whether the queried pub from Crossref should be skipped or not.

    This is just here to work with the function, publications from Crossref should never be skipped.

    Args:
        pub (dict): publication queried from Crossref.

    Returns:
        (bool): always False.
    """
    return False
def parse_myncbi_citations(url):
    """Tokenize the citations on a MyNCBI URL.

    Note that authors and title can be missing or empty from the webpage.
    This function assumes the url is the first page of the MyNCBI citations.
    The first page is tokenized and then each subsequent page is visited and
    tokenized.

    If any page cannot be accessed an error message is printed and the
    program exits.

    Args:
        url (str): the url of the MyNCBI page.

    Returns:
        parsed_pubs (list): the citations tokenized to match the tokenized citations JSON schema.
    """
    ## Get the first page, find out the total pages, and parse it.
    url_str = webio.get_url_contents_as_str(url)
    if not url_str:
        helper_functions.vprint("Error: Could not access the MYNCBI webpage. Make sure the address is correct.")
        # There is nothing to parse, so stop instead of parsing an empty page.
        sys.exit()

    soup = bs4.BeautifulSoup(url_str, "html.parser")
    # NOTE(review): assumes the page always has a "totalPages" span; find()
    # returns None otherwise and the attribute access raises - confirm.
    number_of_pages = int(soup.find("span", class_ = "totalPages").text)

    parsed_pubs = citation_parsing.tokenize_myncbi_citations(url_str)

    ## Parse the rest of the pages.
    new_url = url if url.endswith("/") else url + "/"
    for i in range(2, number_of_pages + 1):
        url_str = webio.get_url_contents_as_str(new_url + "?page=" + str(i))
        if not url_str:
            helper_functions.vprint("Error: Could not access page " + str(i) + " of the MYNCBI webpage. Aborting run.")
            sys.exit()

        parsed_pubs += citation_parsing.tokenize_myncbi_citations(url_str)

    return parsed_pubs