Source code for academic_tracker.athr_srch_modularized

# -*- coding: utf-8 -*-
"""
Author Search Modularized
~~~~~~~~~~~~~~~~~~~~~~~~~

Modularized pieces of author_search.
"""
import sys
import re
import datetime
import os

import deepdiff

from . import user_input_checking
from . import fileio
from . import helper_functions
from . import athr_srch_webio
from . import athr_srch_emails_and_reports
from . import webio


def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
    """Load the configuration JSON and run the user-input checks on it.

    A source whose section ("ORCID_search", "Crossref_search" or
    "PubMed_search") is missing from the configuration is treated as
    disabled when the configuration is checked. That adjustment is local to
    this function: only the configuration is returned, so the caller's own
    flag values are not changed.

    Args:
        config_json_filepath (str): filepath to the configuration JSON.
        no_ORCID (bool): If True ORCID is treated as disabled. Reduces checking on config JSON if True.
        no_GoogleScholar (bool): If True Google Scholar is treated as disabled. Reduces checking on config JSON if True.
            Passed to the checks unchanged; no configuration section is looked up for it here.
        no_Crossref (bool): If True Crossref is treated as disabled. Reduces checking on config JSON if True.
        no_PubMed (bool): If True PubMed is treated as disabled. Reduces checking on config JSON if True.

    Returns:
        config_dict (dict): Matches the Configuration file JSON schema.
    """
    ## Read in the config file.
    config_dict = fileio.load_json(config_json_filepath)

    ## A missing section forces the matching flag to True, otherwise the caller's value is kept as given.
    skip_ORCID = True if "ORCID_search" not in config_dict else no_ORCID
    skip_Crossref = True if "Crossref_search" not in config_dict else no_Crossref
    skip_PubMed = True if "PubMed_search" not in config_dict else no_PubMed

    ## Check the config file contents for errors.
    user_input_checking.config_file_check(config_dict, skip_ORCID, no_GoogleScholar, skip_Crossref, skip_PubMed)
    user_input_checking.config_report_check(config_dict)

    return config_dict
def generate_internal_data_and_check_authors(config_dict):
    """Create authors_by_project_dict and look for authors without projects.

    A warning listing the authors that appear in no project's "authors" list
    is printed through helper_functions.vprint, but only when every project
    has an "authors" list.

    Args:
        config_dict (dict): Matches the Configuration file JSON schema. Its "project_descriptions" and "Authors" sections are read here.

    Returns:
        authors_by_project_dict (dict): Keys are project names and values are a dictionary of authors and their attributes.
        config_dict (dict): same as input but with author information updated based on project information.
    """
    ## Create an authors_json for each project in the config_dict and update those authors attributes with the project attributes.
    authors_by_project_dict = helper_functions.create_authors_by_project_dict(config_dict)
    ## Find minimum cutoff_year, and take the union of affiliations and grants for each author.
    ## The return value is not used, so any update is presumably made in place - confirm against helper_functions.
    helper_functions.adjust_author_attributes(authors_by_project_dict, config_dict)

    ## Look for authors not in any projects and warn user.
    project_descriptions = config_dict["project_descriptions"]
    authors_in_projects = {author
                           for project_attributes in project_descriptions.values()
                           if "authors" in project_attributes
                           for author in project_attributes["authors"]}
    authors_not_in_projects = set(config_dict["Authors"]) - authors_in_projects
    any_project_without_authors = any("authors" not in project_attributes
                                      for project_attributes in project_descriptions.values())

    ## NOTE(review): assumes a project with no "authors" key applies to every author, so in that
    ## case no author is really project-less and the warning must be suppressed. The previous
    ## condition required such a project to exist, which made the warning fire in exactly the
    ## wrong case. Confirm against helper_functions.create_authors_by_project_dict.
    if authors_not_in_projects and not any_project_without_authors:
        helper_functions.vprint("Warning: The following authors in the Authors section of the configuration JSON file are not in any project.")
        ## Sorted so the listing is the same from run to run (set order is arbitrary).
        for author in sorted(authors_not_in_projects):
            helper_functions.vprint(author)

    return authors_by_project_dict, config_dict
def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
    """Query PubMed, ORCID, Google Scholar, and Crossref for publications for the authors.

    Every enabled source is searched once, in the order PubMed, ORCID,
    Google Scholar, Crossref, and then searched a second time with the
    queries saved from its first pass. Publications that are identical to
    their entry in prev_pubs are then dropped. If nothing is left the
    program exits through sys.exit().

    Args:
        config_dict (dict): Matches the Configuration file JSON schema.
        prev_pubs (dict): Matches the publication JSON schema. Publications unchanged from these are removed from the result.
        no_ORCID (bool): If True ORCID is not searched.
        no_GoogleScholar (bool): If True Google Scholar is not searched.
        no_Crossref (bool): If True Crossref is not searched.
        no_PubMed (bool): If True PubMed is not searched.

    Returns:
        running_pubs (dict): The dictionary matching the publication JSON schema.
        all_queries (dict): The pubs searched for each source and each author. {"PubMed":{"author1":[pub1, ...], ...}, "ORCID":{"author1":[pub1, ...], ...}, "Google Scholar":{"author1":[pub1, ...], ...}, "Crossref":{"author1":[pub1, ...], ...}}
    """
    ## Each wrapper reads its settings from config_dict only when it is called,
    ## so the configuration of a disabled source is never touched.
    def _search_PubMed(pubs, *saved_queries):
        return athr_srch_webio.search_PubMed_for_pubs(pubs,
                                                      config_dict["Authors"],
                                                      config_dict["PubMed_search"]["PubMed_email"],
                                                      *saved_queries)

    def _search_ORCID(pubs, *saved_queries):
        return athr_srch_webio.search_ORCID_for_pubs(pubs,
                                                     config_dict["ORCID_search"]["ORCID_key"],
                                                     config_dict["ORCID_search"]["ORCID_secret"],
                                                     config_dict["Authors"],
                                                     *saved_queries)

    def _search_Google_Scholar(pubs, *saved_queries):
        ## The Google Scholar search is given the Crossref mailto email.
        return athr_srch_webio.search_Google_Scholar_for_pubs(pubs,
                                                              config_dict["Authors"],
                                                              config_dict["Crossref_search"]["mailto_email"],
                                                              *saved_queries)

    def _search_Crossref(pubs, *saved_queries):
        return athr_srch_webio.search_Crossref_for_pubs(pubs,
                                                        config_dict["Authors"],
                                                        config_dict["Crossref_search"]["mailto_email"],
                                                        *saved_queries)

    ## (key in all_queries, progress message, search wrapper) for every source that is switched on.
    sources = [
        ("PubMed", "Searching PubMed.", _search_PubMed, no_PubMed),
        ("ORCID", "Searching ORCID.", _search_ORCID, no_ORCID),
        ("Google Scholar", "Searching Google Scholar.", _search_Google_Scholar, no_GoogleScholar),
        ("Crossref", "Searching Crossref.", _search_Crossref, no_Crossref),
    ]
    enabled_sources = [(source, message, search) for source, message, search, disabled in sources if not disabled]

    ## First pass, the queries of each source are saved.
    helper_functions.vprint("Finding author's publications. This could take a while.")
    running_pubs = {}
    all_queries = {}
    for source, message, search in enabled_sources:
        helper_functions.vprint(message)
        running_pubs, source_queries = search(running_pubs)
        all_queries[source] = source_queries

    ## Do a second pass using the saved queries.
    for source, _, search in enabled_sources:
        running_pubs, _ = search(running_pubs, all_queries[source])

    ## Compare current pubs with previous and only keep those that are new or updated.
    for pub_id, previous_pub in prev_pubs.items():
        if pub_id not in running_pubs:
            continue
        ## An empty DeepDiff means the publication has not changed.
        changes = deepdiff.DeepDiff(running_pubs[pub_id], previous_pub, ignore_order=True, report_repetition=True)
        if not changes:
            del running_pubs[pub_id]

    if not running_pubs:
        helper_functions.vprint("No new publications found.")
        sys.exit()

    ## Convert PubMed articles class to dicts so they can be saved as JSON.
    if not no_PubMed:
        pubmed_queries = all_queries["PubMed"]
        for author, pub_list in pubmed_queries.items():
            converted = (helper_functions.create_pub_dict_for_saving_PubMed(pub, True) for pub in pub_list)
            pubmed_queries[author] = [pub_dict for _, pub_dict in converted]

    return running_pubs, all_queries
def save_and_send_reports_and_emails(authors_by_project_dict, publication_dict, config_dict, test):
    """Build the summary report and project reports and email them.

    A new directory named after the current date and time (to the minute)
    is created in the working directory. The project and collaborator
    reports are built into it, the optional summary report is added, and
    any resulting emails are saved there and, unless test is True, sent.

    Args:
        authors_by_project_dict (dict): Keys are project names and values are a dictionary of authors and their attributes.
        publication_dict (dict): The dictionary matching the publication JSON schema.
        config_dict (dict): Matches the Configuration file JSON schema.
        test (bool): If True save_dir_name is tracker-test- instead of tracker- and emails are not sent.

    Returns:
        save_dir_name (str): Name of the directory where the emails and reports were saved.
    """
    ## Build the save directory name, prefix plus YYMMDDHHMM.
    timestamp = re.sub(r"\-| |\:", "", str(datetime.datetime.now())[2:16])
    prefix = "tracker-test-" if test else "tracker-"
    save_dir_name = prefix + timestamp
    os.mkdir(save_dir_name)

    ## Project reports and emails, followed by the collaborator ones.
    email_messages = athr_srch_emails_and_reports.create_project_reports_and_emails(authors_by_project_dict, publication_dict, config_dict, save_dir_name)
    project_emails = email_messages["emails"]
    collaborator_messages = athr_srch_emails_and_reports.create_collaborators_reports_and_emails(publication_dict, config_dict, save_dir_name)
    email_messages["emails"] = project_emails + collaborator_messages["emails"]

    if "summary_report" in config_dict:
        summary_attributes = config_dict["summary_report"]

        if "columns" in summary_attributes:
            ## NOTE(review): nothing is saved here for the tabular report, it is handed save_dir_name
            ## and presumably writes its own file - confirm against athr_srch_emails_and_reports.
            summary_report, summary_filename = athr_srch_emails_and_reports.create_tabular_summary_report(publication_dict, config_dict, authors_by_project_dict, save_dir_name)
        else:
            ## Text report, fall back to the default template and filename when none are configured.
            if "template" in summary_attributes:
                template = summary_attributes["template"]
            else:
                template = athr_srch_emails_and_reports.DEFAULT_SUMMARY_TEMPLATE
            summary_filename = summary_attributes["filename"] if "filename" in summary_attributes else "summary_report.txt"

            summary_report = athr_srch_emails_and_reports.create_summary_report(publication_dict, config_dict, authors_by_project_dict, template)
            fileio.save_string_to_file(save_dir_name, summary_filename, summary_report)

        ## The summary report is only emailed when a sender address is configured.
        if "from_email" in summary_attributes:
            summary_email = {"to":",".join(summary_attributes["to_email"]),
                             "from":summary_attributes["from_email"],
                             "cc":",".join(summary_attributes["cc_email"]) if "cc_email" in summary_attributes else "",
                             "subject":summary_attributes["email_subject"],
                             "body":summary_attributes["email_body"],
                             "attachment":summary_report,
                             "attachment_filename": summary_filename}
            email_messages["emails"].append(summary_email)

    if email_messages["emails"]:
        ## save email messages to file
        fileio.save_emails_to_file(email_messages, save_dir_name)

        ## send emails
        ## NOTE(review): nesting rebuilt from collapsed source, sending is taken to happen only when there are emails.
        if not test:
            webio.send_emails(email_messages)

    return save_dir_name