Source code for academic_tracker.ref_srch_emails_and_reports

# -*- coding: utf-8 -*-
"""
Reference Search Emails and Reports
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Functions to create emails and reports for reference_search.
"""

import re
import copy
import os

import pandas

from . import helper_functions
from . import fileio
from . import emails_and_reports_helpers


DEFAULT_SUMMARY_TEMPLATE = "<pub_loop>Reference Line:\n\t<ref_line>\nTokenized Reference:\n\tAuthors: <tok_authors>\n\tTitle: <tok_title>\n\tPMID: <tok_PMID>\n\tDOI: <tok_DOI>\nQueried Information:\n\tDOI: <DOI>\n\tPMID: <PMID>\n\tPMCID: <PMCID>\n\tGrants: <grants>\n\n</pub_loop>"

simple_publication_keywords_map = emails_and_reports_helpers.simple_publication_keywords_map

pub_authors_keyword_map = emails_and_reports_helpers.pub_authors_keyword_map

references_keyword_map = emails_and_reports_helpers.references_keyword_map

publication_date_keywords_map = emails_and_reports_helpers.publication_date_keywords_map

tokenized_keywords_map = emails_and_reports_helpers.tokenized_keywords_map


[docs]def convert_tokenized_authors_to_str(authors): """Combine authors into a comma separated string. Try to do first_name last_name for each author, but if first name isn't there then last_name initials. ex. first_name1 last_name1, last_name2 initials2 Args: authors (list): a list of dictionaries [{"last":last_name, "initials":initials}, {"last":last_name, "first":first_name}] Returns: authors_string (str): comma separated list of authors. """ authors_string = "" for author in authors: if "first" in author: if author["first"]: authors_string += author["first"] authors_string += " " + author["last"] + ", " if author["last"] else ", " elif author["last"]: authors_string += author["last"] + ", " else: if author["last"]: authors_string += author["last"] authors_string += " " + author["initials"] + ", " if author["initials"] else ", " else: authors_string += author["initials"] + ", " authors_string = authors_string[:-2] if not authors_string: authors_string = str(None) return authors_string
[docs]def create_report_from_template(publication_dict, is_citation_in_prev_pubs_list, tokenized_citations, template_string = DEFAULT_SUMMARY_TEMPLATE): """Create project report based on template_string. Loop over each publication in publication_dict and build a report based on the tags in the template_string. Details about reports are in the documentation. Args: publication_dict (dict): keys and values match the publications JSON file. is_citation_in_prev_pubs_list (list): list of bools that indicate whether or not the citation at the same index in tokenized_citations is in the prev_pubs tokenized_citations (list): list of dicts. Matches the JSON schema for tokenized citations. template_string (str): string with tags indicated what information to put in the report. Returns: report (str): text of the created report. """ matching_key_for_citation = [citation["pub_dict_key"] for citation in tokenized_citations] pub_template = helper_functions.regex_group_return(helper_functions.regex_match_return(r"(?s).*<pub_loop>(.*)</pub_loop>.*", template_string), 0) pub_author_template = helper_functions.regex_group_return(helper_functions.regex_match_return(r"(?s).*<pub_author_loop>(.*)</pub_author_loop>.*", template_string), 0) reference_template = helper_functions.regex_group_return(helper_functions.regex_match_return(r"(?s).*<reference_loop>(.*)</reference_loop>.*", template_string), 0) report_string = "" for pub_id, pub_values in publication_dict.items(): pub_template_copy = pub_template tok_index = matching_key_for_citation.index(pub_id) tokenized_citation = tokenized_citations[tok_index] is_citation_in_prev_pubs = is_citation_in_prev_pubs_list[tok_index] if is_citation_in_prev_pubs_list else None pub_template_copy = emails_and_reports_helpers._replace_pub_author_and_reference_loops(publication_dict, pub_id, pub_template_copy, pub_author_template, reference_template) pub_template_copy = emails_and_reports_helpers._replace_keywords({"1":pub_template_copy}, publication_dict, {}, pub=pub_id, tokenized_citation=tokenized_citation, is_citation_in_prev_pubs=is_citation_in_prev_pubs)["1"] report_string += pub_template_copy report = re.sub(r"(?s)<pub_loop>.*</pub_loop>", report_string, template_string) return report
[docs]def create_tabular_report(publication_dict, config_dict, is_citation_in_prev_pubs_list, tokenized_citations, save_dir_name): """Create a pandas DataFrame and save it as Excel or CSV. Args: publication_dict (dict): keys and values match the publications JSON file. config_dict (dict): keys and values match the project tracking configuration JSON file. is_citation_in_prev_pubs_list (list): list of bools that indicate whether or not the citation at the same index in tokenized_citations is in the prev_pubs tokenized_citations (list): list of dicts. Matches the JSON schema for tokenized citations. save_dir_name (str): directory to save the report in. Returns: report (str): Either the text of the report if csv or a relative filepath to where the Excel file is saved. filename (str): Filename of the report. Made have had an .xlsx added to the end. """ row_template = copy.deepcopy(config_dict["summary_report"]["columns"]) matching_key_for_citation = [citation["pub_dict_key"] for citation in tokenized_citations] separator = config_dict["summary_report"]["separator"] if "separator" in config_dict["summary_report"] else "," sort = config_dict["summary_report"]["sort"] if "sort" in config_dict["summary_report"] else [] if "column_order" in config_dict["summary_report"]: column_order = config_dict["summary_report"]["column_order"] else: column_order = list(row_template.keys()) file_format = config_dict["summary_report"]["file_format"] if "file_format" in config_dict["summary_report"] else "csv" if "filename" in config_dict["summary_report"]: filename = config_dict["summary_report"]["filename"] else: filename = "summary_report.csv" if file_format == "csv" else "summary_report.xlsx" row_string = "".join(row_template.values()) has_pub_author_keywords = False if any([pub_author_keyword in row_string for pub_author_keyword in pub_authors_keyword_map.keys()]): has_pub_author_keywords = True has_reference_keywords = False if any([reference_keyword in row_string for reference_keyword in references_keyword_map.keys()]): has_reference_keywords = True rows = [] for pub, pub_values in publication_dict.items(): tok_index = matching_key_for_citation.index(pub) is_citation_in_prev_pubs = is_citation_in_prev_pubs_list[tok_index] if is_citation_in_prev_pubs_list else None if has_reference_keywords or has_pub_author_keywords: rows += emails_and_reports_helpers._build_pub_author_and_reference_rows(publication_dict, config_dict, has_pub_author_keywords, has_reference_keywords, row_template, None, None, pub, tokenized_citations[tok_index], is_citation_in_prev_pubs) else: rows.append(emails_and_reports_helpers._replace_keywords(row_template, publication_dict, None, pub=pub, tokenized_citation=tokenized_citations[tok_index], is_citation_in_prev_pubs=is_citation_in_prev_pubs)) report, filename = emails_and_reports_helpers._save_rows_to_file(rows, filename, sort, column_order, file_format, separator, save_dir_name) return report, filename
# def replace_keywords(template, publication_dict, pub, tokenized_citation, is_citation_in_prev_pubs, pub_author={}): # """Replace keywords in the values of the template dictionary. # Args: # template (dict): keys are column names and values are what the elements of the column should be. # publication_dict (dict): keys and values match the publications JSON file. # pub (str): the key to the pub in publication_dict. # tokenized_citation (dict): The tokenized citation from the reference for the publication. # is_citation_in_prev_pubs (bool or None): Whether this publication is in the previous publications or not. If None then it isn't applicable. # pub_author (dict): The author in pub. # Returns: # template_copy (dict): template with the keywords replaced in its values. # """ # template_copy = copy.deepcopy(template) # for key in template_copy: # for keyword, pub_key in simple_publication_keywords_map.items(): # template_copy[key] = template_copy[key].replace(keyword, str(publication_dict[pub][pub_key])) # ## build first and last author # first_author = str(publication_dict[pub]["authors"][0]["lastname"]) + ", " + str(publication_dict[pub]["authors"][0]["firstname"]) # template_copy[key] = template_copy[key].replace("<first_author>", first_author) # last_author = str(publication_dict[pub]["authors"][-1]["lastname"]) + ", " + str(publication_dict[pub]["authors"][-1]["firstname"]) # template_copy[key] = template_copy[key].replace("<last_author>", last_author) # authors = ", ".join([str(author["firstname"]) + " " + str(author["lastname"]) for author in publication_dict[pub]["authors"]]) # template_copy[key] = template_copy[key].replace("<authors>", authors) # grants = ", ".join(publication_dict[pub]["grants"]) if publication_dict[pub]["grants"] else "None Found" # template_copy[key] = template_copy[key].replace("<grants>", grants) # for keyword, date_key in publication_date_keywords_map.items(): # template_copy[key] = template_copy[key].replace(keyword, str(publication_dict[pub]["publication_date"][date_key])) # ## Pub authors keywords # if pub_author: # for keyword, pub_author_key in pub_authors_keyword_map.items(): # template_copy[key] = template_copy[key].replace(keyword, str(pub_author[pub_author_key])) # ## tokenized keywords # for keyword, tok_key in tokenized_keywords_map.items(): # replacement = str(tokenized_citation[tok_key]) # if not replacement: # replacement = "None" # template_copy[key] = template_copy[key].replace(keyword, replacement) # tok_authors = convert_tokenized_authors_to_str(tokenized_citation["authors"]) # template_copy[key] = template_copy[key].replace("<tok_authors>", tok_authors) # if tokenized_citation["reference_line"]: # pretty_print = tokenized_citation["reference_line"].split("\n") # pretty_print = " ".join([line.strip() for line in pretty_print]) # template_copy[key] = template_copy[key].replace("<ref_line>", pretty_print) # else: # template_copy[key] = template_copy[key].replace("<ref_line>", "N/A") # if type(is_citation_in_prev_pubs) == bool: # template_copy[key] = template_copy[key].replace("<is_in_comparison_file>", str(is_citation_in_prev_pubs)) # else: # template_copy[key] = template_copy[key].replace("<is_in_comparison_file>", "N/A") # return template_copy
[docs]def create_tokenization_report(tokenized_citations): """Create a report that details all the information about how a reference was tokenized. Intended as a troubleshooting report. Args: tokenized_citations (list): list of dicts. Matches the JSON schema for tokenized citations. Returns: report_string (str): report text built from tokenized_citations. """ report_string = "" for count, citation in enumerate(tokenized_citations): if tokenized_citations[count]["reference_line"]: pretty_print = tokenized_citations[count]["reference_line"].split("\n") pretty_print = " ".join([line.strip() for line in pretty_print]) report_string += "Reference Line: \n\t" + pretty_print + "\n" else: report_string += "Reference Line: \n\tN/A\n" report_string += "Tokenized Reference: \n\tAuthors: " + convert_tokenized_authors_to_str(citation["authors"]) report_string += "\n\tTitle: " + citation["title"] if citation["title"] else "\n\tTitle: None" report_string += "\n\tPMID: " + str(citation["PMID"]) if citation["PMID"] else "\n\tPMID: None" report_string += "\n\tDOI: " + citation["DOI"] if citation["DOI"] else "\n\tDOI: None" report_string += "\n\n" return report_string
############### ## Unused ############### # def create_reference_search_diagnostic(publication_dict, is_citation_in_prev_pubs_list, tokenized_citations): # """""" # report_string = "" # for count, citation in enumerate(tokenized_citations): # if tokenized_citations[count]["reference_line"]: # pretty_print = tokenized_citations[count]["reference_line"].split("\n") # pretty_print = " ".join([line.strip() for line in pretty_print]) # report_string += "Reference Line: " + pretty_print + "\n" # report_string += "Tokenized Reference: \n\tAuthors: " + convert_tokenized_authors_to_str(citation["authors"]) + " \n\tTitle: " + citation["title"] # if citation["PMID"]: # report_string += " \n\tPMID: " + str(citation["PMID"]) # if citation["DOI"]: # report_string += " \n\tDOI: " + citation["DOI"] # report_string += "\n" # if tokenized_citations[count]["pub_dict_key"]: # doi = publication_dict[tokenized_citations[count]["pub_dict_key"]]["doi"] # pmid = publication_dict[tokenized_citations[count]["pub_dict_key"]]["pubmed_id"] # pmcid = publication_dict[tokenized_citations[count]["pub_dict_key"]]["PMCID"] # if publication_dict[tokenized_citations[count]["pub_dict_key"]]["grants"]: # grants = ", ".join(publication_dict[tokenized_citations[count]["pub_dict_key"]]["grants"]) # if not doi: # doi = "Not Found" # if not pmid: # pmid = "Not Found" # if not pmcid: # pmcid = "Not Found" # if not grants: # grants = "None Found" # report_string += "Queried Information: \n\tDOI: " + doi + \ # " \n\tPMID: " + pmid + \ # " \n\tPMCID: " + pmcid +\ # " \n\tGrants: " + grants # if is_citation_in_prev_pubs_list: # report_string += " \n\tIs In Comparison File: " + str(is_citation_in_prev_pubs_list[count]) # report_string += "\n\n\n" # return report_string