Source code for academic_tracker.athr_srch_webio

# -*- coding: utf-8 -*-
"""
Author Search Webio
~~~~~~~~~~~~~~~~~~~

Internet interfacing for author_search.
"""


import time
import copy
import traceback

import pymed
import orcid
import scholarly
import habanero

from . import helper_functions
from . import webio



# Aliases of values defined in the sibling webio module.
# TOOL is passed as the "tool" argument when the PubMed client is created.
TOOL = webio.TOOL
# DOI_URL is prepended to a DOI to form a publication id.
DOI_URL = webio.DOI_URL

# PUBLICATION_TEMPLATE is deep-copied as the starting point of each new publication dictionary.
PUBLICATION_TEMPLATE = webio.PUBLICATION_TEMPLATE


## TODO: work with the pymed maintainers to add grants and pmcid to the PubMedArticle class.

[docs]
def search_PubMed_for_pubs(running_pubs, authors_json, from_email, prev_query=None):
    """Search PubMed for publications by each author and fold them into running_pubs.
    
    For every author in authors_json, PubMed is queried with the author's "pubmed_name_search" 
    value (at most 500 results), unless prev_query is given, in which case the publications 
    stored there for the author are reused. Results that are not pymed PubMedArticle objects 
    are ignored. 
    
    A publication that already has an entry in running_pubs is merged into that entry, unless 
    PubMed was already recorded as a queried source for it. A publication without an entry is 
    only added if it has a publication date, the year is not before the author's "cutoff_year", 
    and at least one of its authors is matched to an author in authors_json.
    
    Args:
        running_pubs (dict): dictionary of publications matching the JSON schema for publications. Modified in place.
        authors_json (dict): keys are authors and values are author attributes. Matches Authors section of configuration JSON schema.
        from_email (str): used in the query to PubMed.
        prev_query (dict|None): a dictionary containing publications from a previous call to this function. {author1: [pub1, ...], ...}
        
    Returns:
        running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
        all_pubs (dict): a dictionary where the keys are the authors in authors_json and the values are a list of the publications queried for them.
    """
    
    ## Debugging aid for viewing a result's XML as text: 
    ## import xml.etree.ElementTree as ET    ET.tostring(Element)    ET.ElementTree(element).write('path')
    pubmed_client = pymed.PubMed(tool=TOOL, email=from_email)
    
    all_pubs = {}
    
    for author, author_attributes in authors_json.items():
        queried_pubs = []
        all_pubs[author] = queried_pubs
        
        if prev_query:
            publications = prev_query[author]
        else:
            publications = pubmed_client.query(author_attributes["pubmed_name_search"], max_results=500)
        
        ## Iterating publications appears to be the slowest part of the code.
        ## The iterator is fetched in batches and there is a noticeable pause at the start of each batch.
        for pub in publications:
            if not isinstance(pub, pymed.article.PubMedArticle):
                continue
            queried_pubs.append(pub)
            
            pub_id, pub_dict = helper_functions.create_pub_dict_for_saving_PubMed(pub)
            matching_pub_id = helper_functions.get_pub_id_in_publication_dict(pub_id, pub.title, running_pubs)
            
            ## Known publication: fill in missing information once per source, then move on.
            if matching_pub_id:
                if "PubMed" not in running_pubs[matching_pub_id]["queried_sources"]:
                    helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
                    running_pubs[matching_pub_id]["queried_sources"].append("PubMed")
                continue
            
            ## The publication_date can sometimes be None, and such publications are skipped.
            if not pub.publication_date:
                continue
            
            ## Skip anything published before the author's cutoff year.
            year_published = int(str(pub.publication_date)[:4])
            if year_published < author_attributes["cutoff_year"]:
                continue
            
            ## An empty match is not uncommon because PubMed also returns publications where the author was only a collaborator.
            matched_authors = helper_functions.match_pub_authors_to_config_authors(authors_json, pub_dict["authors"])
            if not matched_authors:
                continue
            
            pub_dict["authors"] = matched_authors
            pub_dict["queried_sources"] = ["PubMed"]
            running_pubs[pub_id] = pub_dict
        
        # Pause between authors to be polite to NCBI.
        time.sleep(1)
        
    return running_pubs, all_pubs



       
        
        

[docs]
def _parse_ORCID_work(work, authors_attributes):
    """Collect the identifying details of one ORCID work from its work summaries.
    
    The summaries in work["work-summary"] are read in order and the first non-empty value 
    found for each field is kept. Reading stops early once title, DOI, year, month, and day 
    are all known.
    
    Args:
        work (dict): one entry of the "group" list returned by ORCID for an author's works.
        authors_attributes (dict): attributes of the author being searched; "cutoff_year" is read when a year is known.
        
    Returns:
        details (dict): keys "title", "doi", "external_url", "pmid", "year", "month", "day" (None when not found), 
            "is_journal_article" (False if any summary read is not a JOURNAL_ARTICLE), and 
            "before_cutoff" (True if the year is before the author's cutoff_year).
    """
    details = {"title": None, 
               "doi": None, 
               "external_url": None, 
               "pmid": None, 
               "year": None, 
               "month": None, 
               "day": None, 
               "is_journal_article": True, 
               "before_cutoff": False}
    
    for work_summary in work["work-summary"]:
        
        if work_summary["type"] != "JOURNAL_ARTICLE":
            details["is_journal_article"] = False
        
        publication_date = work_summary["publication-date"]
        if publication_date:
            for date_part in ("year", "month", "day"):
                if not details[date_part] and publication_date[date_part]:
                    details[date_part] = int(publication_date[date_part]["value"])
            
            if details["year"] is not None and details["year"] < authors_attributes["cutoff_year"]:
                details["before_cutoff"] = True
        
        if work_summary["title"] and not details["title"]:
            details["title"] = work_summary["title"]["title"]["value"]
        
        if not details["doi"]:
            for external_id in work_summary["external-ids"]["external-id"]:
                id_type = external_id["external-id-type"]
                if id_type == "doi":
                    details["doi"] = external_id["external-id-value"].lower()
                    continue
                
                if external_id["external-id-url"]:
                    details["external_url"] = external_id["external-id-url"]["value"]
                ## Record the PMID whether or not the id also has a URL. Previously the PMID was 
                ## only reached when the URL was empty, so it was lost for ids that carry a URL.
                if id_type == "pmid":
                    details["pmid"] = external_id["external-id-value"]
        
        if all(details[key] for key in ("title", "doi", "year", "month", "day")):
            break
    
    return details



def search_ORCID_for_pubs(running_pubs, ORCID_key, ORCID_secret, authors_json, prev_query=None):
    """Search ORCID for publications by each author and fold them into running_pubs.
    
    For every author in authors_json that has an "ORCID" attribute, the author's public works 
    are read from ORCID, unless prev_query is given (is not None), in which case the works 
    stored there for the author are reused and ORCID is not contacted. 
    
    Each work is given a publication id from, in order of preference, its DOI, an external id URL, 
    or its PMID. Works with none of these, or without a title, are skipped. A work that already 
    has an entry in running_pubs is merged into that entry, unless ORCID was already recorded as 
    a queried source for it. A work without an entry is only added if it is a journal article 
    with a known publication year that is not before the author's "cutoff_year".
    
    Args:
        running_pubs (dict): dictionary of publications matching the JSON schema for publications. Modified in place.
        ORCID_key (str): string of the app key ORCID gives when you register the app with them
        ORCID_secret (str): string of the secret ORCID gives when you register the app with them
        authors_json (dict): keys are authors and values are author attributes. Matches authors JSON schema.
        prev_query (dict|None): a dictionary containing publications from a previous call to this function. {author1: [pub1, ...], ...}
        
    Returns:
        running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
        all_pubs (dict): a dictionary where the keys are the authors in authors_json and the values are a list of the publications queried for them.
    """
    
    ## Decide once whether ORCID is contacted so every later check agrees.
    querying_ORCID = prev_query is None
    
    if querying_ORCID:
        api = orcid.PublicAPI(ORCID_key, ORCID_secret)
        search_token = api.get_search_token_from_orcid()
    
    all_pubs = {}

    for author, authors_attributes in authors_json.items():
        all_pubs[author] = []
        
        if "ORCID" not in authors_attributes:
            continue
        
        if querying_ORCID:
            works = api.read_record_public(authors_attributes["ORCID"], 'works', search_token)["group"]
        else:
            works = prev_query[author]
        
        for work in works:
            all_pubs[author].append(work)
            
            details = _parse_ORCID_work(work, authors_attributes)
            title = details["title"]
            doi = details["doi"]
            pmid = details["pmid"]
            
            ## Try to find a way to give the publication an ID.
            if doi:
                pub_id = DOI_URL + doi
            elif details["external_url"]:
                pub_id = details["external_url"]
            elif pmid:
                pub_id = pmid
            else:
                ## Only warn on a fresh query, a previous query would have warned already.
                if querying_ORCID and title is not None:
                    helper_functions.vprint("Warning: Could not find a DOI, URL, or PMID for a publication when searching ORCID. It will not be in the publications", verbosity=1)
                    helper_functions.vprint("Title: " + title, verbosity=1)
                continue
            
            ## Pretty sure the title is never None, but if it is then there will be an error, so skip.
            if title is None:
                continue
            
            ## Pull out relevant information from ORCID.
            pub_dict = copy.deepcopy(PUBLICATION_TEMPLATE)
            if doi:
                pub_dict["doi"] = doi
            if title:
                pub_dict["title"] = title
            if details["year"]:
                pub_dict["publication_date"]["year"] = details["year"]
            if details["month"]:
                pub_dict["publication_date"]["month"] = details["month"]
            if details["day"]:
                pub_dict["publication_date"]["day"] = details["day"]
            if pmid:
                pub_dict["pubmed_id"] = pmid
            
            ## ORCID work summaries are not read for an author list here, so the searched author is the only author recorded.
            authors_dict = {}
            authors_dict["ORCID"] = authors_attributes.get("ORCID")
            authors_dict["author_id"] = author
            if "collective_name" in authors_attributes:
                authors_dict["collectivename"] = authors_attributes["collective_name"]
            else:
                authors_dict["affiliation"] = "\n".join(authors_attributes["affiliations"]) if authors_attributes["affiliations"] else None
                authors_dict["firstname"] = authors_attributes["first_name"]
                authors_dict["initials"] = None
                authors_dict["lastname"] = authors_attributes["last_name"]
            
            pub_dict["authors"] = [authors_dict]
            
            ## If the publication is already in running_pubs then try to update missing information.
            if matching_pub_id := helper_functions.get_pub_id_in_publication_dict(pub_id, title, running_pubs):
                if "ORCID" in running_pubs[matching_pub_id]["queried_sources"]:
                    continue
                
                helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
                running_pubs[matching_pub_id]["queried_sources"].append("ORCID")
            
            else:
                
                if details["before_cutoff"] or not details["is_journal_article"] or not details["year"]:
                    continue
                
                pub_dict["queried_sources"] = ["ORCID"]
                running_pubs[pub_id] = pub_dict
        
        ## Pause between authors.
        time.sleep(1)
        
    return running_pubs, all_pubs






[docs]
def search_Google_Scholar_for_pubs(running_pubs, authors_json, mailto_email, prev_query=None):
    """Search Google Scholar for publications by each author and fold them into running_pubs.
    
    For every author in authors_json that has a "scholar_id" attribute, the author's publications 
    are read from Google Scholar, unless a non-empty prev_query is given, in which case the 
    publications stored there for the author are reused and neither Google Scholar nor Crossref 
    is contacted. On a fresh query the DOI of each publication is looked up through Crossref 
    by title and stored on the publication under "doi", so that all_pubs can be passed back in 
    as prev_query later.
    
    Each publication is given a publication id from its DOI or, failing that, its "pub_url". 
    Publications with neither are skipped. A publication that already has an entry in running_pubs 
    is merged into that entry, unless Google Scholar was already recorded as a queried source for it. 
    A publication without an entry is only added if it has a publication year that is not before 
    the author's "cutoff_year".
    
    Args:
        running_pubs (dict): dictionary of publications matching the JSON schema for publications. Modified in place.
        authors_json (dict): keys are authors and values are author attributes. Matches authors JSON schema.
        mailto_email (str): used in the query to Crossref when trying to find DOIs for the articles.
        prev_query (dict|None): a dictionary containing publications from a previous call to this function. {author1: [pub1, ...], ...}
        
    Returns:
        running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
        all_pubs (dict): a dictionary where the keys are the authors in authors_json and the values are a list of the publications queried for them.
    """
    ## Decide once whether Google Scholar is queried so that every later check agrees. 
    ## Previously some checks used "not prev_query" and others "prev_query is None", which 
    ## disagreed for an empty prev_query.
    querying_scholar = not prev_query
    
    all_pubs = {}
    for author, authors_attributes in authors_json.items():
        all_pubs[author] = []
        
        if "scholar_id" not in authors_attributes:
            continue
        
        ## Either query Google Scholar or use the prev_query parameter.
        if querying_scholar:
            try:
                queried_author = scholarly.scholarly.search_author_id(authors_attributes["scholar_id"])
            ## Catch Exception rather than everything so KeyboardInterrupt and SystemExit still propagate.
            except Exception:
                message = "Warning: The \"scholar_id\" for author " + author + " is probably incorrect, an error occured when trying to query Google Scholar.\n"
                message += traceback.format_exc()
                helper_functions.vprint(message, verbosity=1)    
                continue
            
            ## Skip the author if Google Scholar answered with a different profile than the one asked for.
            if queried_author["scholar_id"] != authors_attributes["scholar_id"]:
                continue
            
            ## Note that fill modifies the passed dictionary directly, but this is easier to mock in unit tests.
            queried_author = scholarly.scholarly.fill(queried_author, sections=["publications"])
            publications = queried_author["publications"]
        else:
            publications = prev_query[author]
        
        ## Loop over queried publications.
        for pub in publications:
            all_pubs[author].append(pub)
            
            ## Determine the pub_id
            title = pub["bib"]["title"]
            doi = webio.get_DOI_from_Crossref(title, mailto_email) if querying_scholar else pub["doi"]
            ## pub is the object just appended to all_pubs, so this also stores the DOI there.
            pub["doi"] = doi
            if doi:
                pub_id = DOI_URL + doi
            else:
                ## Without a DOI, fetch the full record in the hope that it has a URL.
                if querying_scholar:
                    pub = scholarly.scholarly.fill(pub)
                
                if "pub_url" in pub:
                    pub_id = pub["pub_url"]
                else:
                    ## Only warn on a fresh query, a previous query would have warned already.
                    if querying_scholar:
                        helper_functions.vprint("Warning: Could not find a DOI, URL, or PMID for a publication when searching Google Scholar. It will not be in the publications.", verbosity=1)
                        helper_functions.vprint("Title: " + title, verbosity=1)
                    continue
            
            ## Build pub_dict
            publication_year = int(pub["bib"]["pub_year"]) if "pub_year" in pub["bib"] else None
            
            pub_dict = copy.deepcopy(PUBLICATION_TEMPLATE)
            if doi:
                pub_dict["doi"] = doi
            if title:
                pub_dict["title"] = title
            if publication_year:
                pub_dict["publication_date"]["year"] = publication_year
            
            ## Only the searched author is recorded as an author of the publication.
            authors_dict = {}
            authors_dict["ORCID"] = authors_attributes.get("ORCID")
            authors_dict["author_id"] = author
            if "collective_name" in authors_attributes:
                authors_dict["collectivename"] = authors_attributes["collective_name"]
            else:
                authors_dict["affiliation"] = "\n".join(authors_attributes["affiliations"]) if authors_attributes["affiliations"] else None
                authors_dict["firstname"] = authors_attributes["first_name"]
                authors_dict["initials"] = None
                authors_dict["lastname"] = authors_attributes["last_name"]
            
            pub_dict["authors"] = [authors_dict]
            
            
            ## If the publication is already in running_pubs then try to update missing information.
            if matching_pub_id := helper_functions.get_pub_id_in_publication_dict(pub_id, title, running_pubs):
                if "Google Scholar" in running_pubs[matching_pub_id]["queried_sources"]:
                    continue
                
                helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
                running_pubs[matching_pub_id]["queried_sources"].append("Google Scholar")
            
            else:
            
                ## Check if the publication year is in range.
                if not publication_year or publication_year < authors_attributes["cutoff_year"]:
                    continue
                
                pub_dict["queried_sources"] = ["Google Scholar"]
                running_pubs[pub_id] = pub_dict
        
        ## Pause between authors.
        time.sleep(1)
            
    return running_pubs, all_pubs







[docs]
def search_Crossref_for_pubs(running_pubs, authors_json, mailto_email, prev_query=None):
    """Search Crossref for publications by each author and fold them into running_pubs.
    
    For every author in authors_json, Crossref is queried with the author's "pubmed_name_search" 
    value for journal articles published from the author's "cutoff_year" onward (at most 300 results), 
    unless prev_query is given, in which case the works stored there for the author are reused. 
    
    A work that already has an entry in running_pubs is merged into that entry, unless Crossref 
    was already recorded as a queried source for it. A work without an entry is only added if it 
    has a publication year that is not before the author's "cutoff_year" and at least one of its 
    authors is matched to an author in authors_json. Works for which no publication id can be 
    created are skipped.
    
    Args:
        running_pubs (dict): dictionary of publications matching the JSON schema for publications. Modified in place.
        authors_json (dict): keys are authors and values are author attributes. Matches authors JSON schema.
        mailto_email (str): used in the query to Crossref.
        prev_query (dict|None): a dictionary containing publications from a previous call to this function. {author1: [pub1, ...], ...}
        
    Returns:
        running_pubs (dict): keys are publication ids and values are a dictionary with publication attributes
        all_pubs (dict): a dictionary where the keys are the authors in authors_json and the values are a list of the publications queried for them.
    """
    
    user_agent = "Academic Tracker (mailto:" + mailto_email + ")"
    crossref_client = habanero.Crossref(ua_string=user_agent)
    
    all_pubs = {}
    for author, authors_attributes in authors_json.items():
        queried_works = []
        all_pubs[author] = queried_works
        
        ## Reuse the previous query when there is one, otherwise ask Crossref.
        if prev_query:
            publications = prev_query[author]
        else:
            name_query = authors_attributes["pubmed_name_search"]
            search_filter = {"type":"journal-article", "from-pub-date":str(authors_attributes["cutoff_year"])}
            response = crossref_client.works(query_author=name_query, filter=search_filter, limit=300)
            publications = response["message"]["items"]
        
        for work in publications:
            queried_works.append(work)
            
            pub_id, pub_dict = helper_functions.create_pub_dict_for_saving_Crossref(work, prev_query)
            if pub_id is None:
                continue
            
            matching_pub_id = helper_functions.get_pub_id_in_publication_dict(pub_id, pub_dict["title"], running_pubs)
            
            ## Known publication: fill in missing information once per source, then move on.
            if matching_pub_id:
                if "Crossref" not in running_pubs[matching_pub_id]["queried_sources"]:
                    helper_functions._merge_pub_dicts(running_pubs[matching_pub_id], pub_dict)
                    running_pubs[matching_pub_id]["queried_sources"].append("Crossref")
                continue
            
            ## New publication: it needs a year that is not before the cutoff.
            year_published = pub_dict["publication_date"]["year"]
            if not year_published or year_published < authors_attributes["cutoff_year"]:
                continue
            
            ## An empty list means none of the publication's authors were matched, so skip it.
            matched_authors = helper_functions.match_pub_authors_to_config_authors(authors_json, pub_dict["authors"])
            if not matched_authors:
                continue
            
            pub_dict["authors"] = matched_authors
            pub_dict["queried_sources"] = ["Crossref"]
            running_pubs[pub_id] = pub_dict
        
        ## Pause between authors.
        time.sleep(1)
        
    return running_pubs, all_pubs