Source code for academic_tracker.ref_srch_modularized

# -*- coding: utf-8 -*-
"""
Reference Search Modularized
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Modularized pieces of reference_search.
"""

import re
import datetime
import os
import sys

from . import user_input_checking
from . import fileio
from . import helper_functions
from . import ref_srch_webio
from . import ref_srch_emails_and_reports
from . import webio



def input_reading_and_checking(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, prev_pub_filepath, remove_duplicates):
    """Load the user's inputs for a reference search and validate them.

    Args:
        config_json_filepath (str): filepath to the configuration JSON.
        ref_path_or_URL (str): either a filepath to a file to tokenize or a URL to tokenize.
        MEDLINE_reference (bool): If True, ref_path_or_URL is a filepath to a MEDLINE formatted file.
        no_Crossref (bool): If True, Crossref is not searched. Reduces checking on config JSON if True.
            Treated as True when the config has no "Crossref_search" section.
        no_PubMed (bool): If True, PubMed is not searched. Reduces checking on config JSON if True.
            Treated as True when the config has no "PubMed_search" section.
        prev_pub_filepath (str or None): filepath to the publication JSON to read in.
            An empty value or the word "ignore" (any letter case) means no previous publications.
        remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.

    Returns:
        config_dict (dict): Matches the Configuration file JSON schema.
        tokenized_citations (list): list of dicts. Matches the tokenized citations JSON schema.
        has_previous_pubs (bool): True if a prev_pub file was input, False otherwise.
        prev_pubs (dict): The contents of the prev_pub file input by the user if provided, otherwise an empty dict.
    """
    config_dict = fileio.load_json(config_json_filepath)

    # A source with no section in the config is switched off before the
    # config is validated, so the checker is told not to expect it.
    if "Crossref_search" not in config_dict:
        no_Crossref = True
    if "PubMed_search" not in config_dict:
        no_PubMed = True

    user_input_checking.ref_config_file_check(config_dict, no_Crossref, no_PubMed)
    user_input_checking.config_report_check(config_dict)

    # Previous publications are optional; they are only loaded and checked
    # when a real filepath was given.
    has_previous_pubs = bool(prev_pub_filepath) and prev_pub_filepath.lower() != "ignore"
    if has_previous_pubs:
        prev_pubs = fileio.load_json(prev_pub_filepath)
        user_input_checking.prev_pubs_file_check(prev_pubs)
    else:
        prev_pubs = {}

    tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)

    return config_dict, tokenized_citations, has_previous_pubs, prev_pubs
def build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed):
    """Query PubMed and Crossref for publications matching the citations in tokenized_citations.

    Each enabled source is searched once, then searched a second time with the
    queries saved from the first pass. PubMed is always handled before Crossref.

    Args:
        config_dict (dict): Matches the Configuration file JSON schema.
        tokenized_citations (list): list of dicts. Matches the tokenized citations JSON schema.
        no_Crossref (bool): If True, Crossref is not searched.
        no_PubMed (bool): If True, PubMed is not searched.

    Returns:
        running_pubs (dict): The dictionary matching the publication JSON schema.
        tokenized_citations (list): Same list as the input but with the pub_dict_key updated to match the publication found.
        all_queries (dict): for each source searched a list of lists, each index is the pubs searched through after querying until the citation was matched, {"PubMed":[[pub1, ...], ...], "Crossref":[[pub1, ...], ...]}
    """
    helper_functions.vprint("Finding publications. This could take a while.")

    # (source name, config section, email field, progress message) for every
    # enabled source, in the order they are searched.
    search_plan = []
    if not no_PubMed:
        search_plan.append(("PubMed", "PubMed_search", "PubMed_email", "Searching PubMed."))
    if not no_Crossref:
        search_plan.append(("Crossref", "Crossref_search", "mailto_email", "Searching Crossref."))

    running_pubs = {}
    all_queries = {}

    # First pass: query each source and keep what it returned.
    for source, section, email_field, progress_message in search_plan:
        helper_functions.vprint(progress_message)
        running_pubs, _, source_queries = ref_srch_webio.search_references_on_source(
            source, running_pubs, tokenized_citations, config_dict[section][email_field])
        all_queries[source] = source_queries

    # Second pass: repeat the search, handing back the saved queries. The
    # matching keys used below are the ones produced by this pass.
    keys_by_source = {}
    for source, section, email_field, _ in search_plan:
        running_pubs, source_keys, _ = ref_srch_webio.search_references_on_source(
            source, running_pubs, tokenized_citations, config_dict[section][email_field], all_queries[source])
        keys_by_source[source] = source_keys

    # Merge the per-source keys; an earlier source's key wins over a later one.
    matching_key_for_citation = [None] * len(tokenized_citations)
    for source, _, _, _ in search_plan:
        source_keys = keys_by_source[source]
        matching_key_for_citation = [key or source_keys[index] for index, key in enumerate(matching_key_for_citation)]

    for citation, matched_key in zip(tokenized_citations, matching_key_for_citation):
        if matched_key:
            citation["pub_dict_key"] = matched_key

    ## Convert PubMed articles class to dicts so they can be saved as JSON.
    if not no_PubMed:
        pubmed_queries = all_queries["PubMed"]
        for index, queried_pubs in enumerate(pubmed_queries):
            converted_pubs = []
            for pub in queried_pubs:
                _, pub_dict = helper_functions.create_pub_dict_for_saving_PubMed(pub, True)
                converted_pubs.append(pub_dict)
            pubmed_queries[index] = converted_pubs

    return running_pubs, tokenized_citations, all_queries
def save_and_send_reports_and_emails(config_dict, tokenized_citations, publication_dict, prev_pubs, has_previous_pubs, test):
    """Build the summary report and email it.

    A new save directory is always created. A report is only built when the
    config has a "summary_report" section, and an email is only built when that
    section has a "from_email" entry.

    Args:
        config_dict (dict): Matches the Configuration file JSON schema.
        tokenized_citations (list): list of dicts. Matches the tokenized citations JSON schema.
        publication_dict (dict): The dictionary matching the publication JSON schema.
        prev_pubs (dict): The contents of the prev_pub file input by the user if provided.
        has_previous_pubs (bool): True if a prev_pub file was input, False otherwise.
        test (bool): If True save_dir_name is tracker-test- instead of tracker- and emails are not sent.

    Returns:
        save_dir_name (str): Name of the directory where the emails and report were saved.
    """
    ## Compare citations to prev_pubs
    if has_previous_pubs:
        is_citation_in_prev_pubs_list = helper_functions.are_citations_in_pub_dict(tokenized_citations, prev_pubs)
    else:
        is_citation_in_prev_pubs_list = []

    ## Build the save directory name.
    # Characters 2-15 of the current time, e.g. "24-01-02 03:04", with the
    # separators stripped out to give "2401020304".
    timestamp = re.sub(r"\-| |\:", "", str(datetime.datetime.now())[2:16])
    save_dir_name = ("tracker-test-" if test else "tracker-") + timestamp
    os.mkdir(save_dir_name)

    if "summary_report" not in config_dict:
        return save_dir_name

    report_config = config_dict["summary_report"]

    if "columns" in report_config:
        summary_report, summary_filename = ref_srch_emails_and_reports.create_tabular_report(
            publication_dict, config_dict, is_citation_in_prev_pubs_list, tokenized_citations, save_dir_name)
    else:
        template = report_config["template"] if "template" in report_config else ref_srch_emails_and_reports.DEFAULT_SUMMARY_TEMPLATE
        summary_filename = report_config["filename"] if "filename" in report_config else "summary_report.txt"

        summary_report = ref_srch_emails_and_reports.create_report_from_template(
            publication_dict, is_citation_in_prev_pubs_list, tokenized_citations, template)

        # NOTE(review): the templated report is written here; the tabular
        # report is presumably written by create_tabular_report itself, since
        # it is the only builder given save_dir_name - confirm against the
        # original file's indentation.
        fileio.save_string_to_file(save_dir_name, summary_filename, summary_report)

    if "from_email" in report_config:
        email_messages = {"creation_date": str(datetime.datetime.now())[0:16]}
        email_messages["emails"] = [{
            "to": ",".join(report_config["to_email"]),
            "from": report_config["from_email"],
            "cc": ",".join(report_config["cc_email"]) if "cc_email" in report_config else "",
            "subject": report_config["email_subject"],
            "body": report_config["email_body"],
            "attachment": summary_report,
            "attachment_filename": summary_filename,
        }]
        fileio.save_emails_to_file(email_messages, save_dir_name)

        ## send emails
        if not test:
            webio.send_emails(email_messages)

    return save_dir_name