Source code for academic_tracker.ref_srch_emails_and_reports

# -*- coding: utf-8 -*-
"""
Reference Search Emails and Reports
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Functions to create emails and reports for reference_search.
"""

import re
import copy
import os

import pandas

from . import helper_functions
from . import fileio
from . import emails_and_reports_helpers


# Default value of the template_string parameter of create_report_from_template.
# Everything between <pub_loop> and </pub_loop> is repeated once per publication,
# and the <...> tags inside it are the keywords that get replaced.
DEFAULT_SUMMARY_TEMPLATE = "<pub_loop>Reference Line:\n\t<ref_line>\nTokenized Reference:\n\tAuthors: <tok_authors>\n\tTitle: <tok_title>\n\tPMID: <tok_PMID>\n\tDOI: <tok_DOI>\nQueried Information:\n\tDOI: <DOI>\n\tPMID: <PMID>\n\tPMCID: <PMCID>\n\tGrants: <grants>\n\n</pub_loop>"

# Module-level aliases of the keyword maps defined in emails_and_reports_helpers.
# NOTE(review): presumably each maps a template keyword (e.g. "<DOI>") to the key
# holding its replacement value -- confirm against emails_and_reports_helpers.
simple_publication_keywords_map = emails_and_reports_helpers.simple_publication_keywords_map

# Its keys are searched for in the column templates by create_tabular_report.
pub_authors_keyword_map = emails_and_reports_helpers.pub_authors_keyword_map

# Its keys are searched for in the column templates by create_tabular_report.
references_keyword_map = emails_and_reports_helpers.references_keyword_map

publication_date_keywords_map = emails_and_reports_helpers.publication_date_keywords_map

tokenized_keywords_map = emails_and_reports_helpers.tokenized_keywords_map



[docs]
def convert_tokenized_authors_to_str(authors):
    """Combine authors into a comma separated string.
    
    Try to do first_name last_name for each author, but if the author has no 
    "first" key then last_name initials. ex. first_name1 last_name1, last_name2 initials2
    
    Missing or empty name parts are left out, and an author with no usable name 
    parts at all is skipped, so the result never contains an empty entry or a 
    dangling separator.
    
    Args:
        authors (list): a list of dictionaries [{"last":last_name, "initials":initials}, {"last":last_name, "first":first_name}]. 
            None is treated like an empty list.
        
    Returns:
        authors_string (str): comma separated list of authors, or "None" if there are no authors to list.
    """
    
    formatted_authors = []
    for author in authors or []:
        last_name = author.get("last")
        ## The presence of the "first" key, not its value, selects the name format.
        if "first" in author:
            name_parts = [author["first"], last_name]
        else:
            name_parts = [last_name, author.get("initials")]
        
        ## Drop None and empty parts so they can't raise or leave stray separators.
        name = " ".join(part for part in name_parts if part)
        if name:
            formatted_authors.append(name)
    
    if not formatted_authors:
        return str(None)
            
    return ", ".join(formatted_authors)




    

[docs]
def create_report_from_template(publication_dict, is_citation_in_prev_pubs_list, tokenized_citations, template_string = DEFAULT_SUMMARY_TEMPLATE):
    """Create project report based on template_string.
    
    Loop over each publication in publication_dict and build a report based on 
    the tags in the template_string. Details about reports are in the documentation.
    
    The text between <pub_loop> and </pub_loop> is filled in once per publication, 
    and the filled in copies replace the whole <pub_loop> section of template_string.
    
    Args:
        publication_dict (dict): keys and values match the publications JSON file.
        is_citation_in_prev_pubs_list (list): list of bools that indicate whether or not the citation at the same index in tokenized_citations is in the prev_pubs
        tokenized_citations (list): list of dicts. Matches the JSON schema for tokenized citations.
        template_string (str): string with tags indicating what information to put in the report.
        
    Returns:
        report (str): text of the created report.
        
    Raises:
        ValueError: if a key in publication_dict is not the "pub_dict_key" of any citation in tokenized_citations.
    """
    
    ## Index of the first citation for each pub_dict_key, built once so the 
    ## lookup inside the loop doesn't rescan the citation list for every publication.
    citation_index_for_key = {}
    for index, citation in enumerate(tokenized_citations):
        citation_index_for_key.setdefault(citation["pub_dict_key"], index)
    
    pub_template = helper_functions.regex_group_return(helper_functions.regex_match_return(r"(?s).*<pub_loop>(.*)</pub_loop>.*", template_string), 0)
    pub_author_template = helper_functions.regex_group_return(helper_functions.regex_match_return(r"(?s).*<pub_author_loop>(.*)</pub_author_loop>.*", template_string), 0)
    reference_template = helper_functions.regex_group_return(helper_functions.regex_match_return(r"(?s).*<reference_loop>(.*)</reference_loop>.*", template_string), 0)

    
    report_sections = []
    for pub_id in publication_dict:
        if pub_id not in citation_index_for_key:
            raise ValueError(repr(pub_id) + " is not the pub_dict_key of any tokenized citation")
        tok_index = citation_index_for_key[pub_id]
        tokenized_citation = tokenized_citations[tok_index]
        is_citation_in_prev_pubs = is_citation_in_prev_pubs_list[tok_index] if is_citation_in_prev_pubs_list else None
        
        pub_section = emails_and_reports_helpers._replace_pub_author_and_reference_loops(publication_dict, 
                                                                                         pub_id, 
                                                                                         pub_template, 
                                                                                         pub_author_template, 
                                                                                         reference_template)
                
        ## _replace_keywords works on a dictionary of templates, so wrap the 
        ## single template under a throwaway key and unwrap the result.
        pub_section = emails_and_reports_helpers._replace_keywords({"1":pub_section}, 
                                                                   publication_dict, {}, 
                                                                   pub=pub_id, 
                                                                   tokenized_citation=tokenized_citation, 
                                                                   is_citation_in_prev_pubs=is_citation_in_prev_pubs)["1"]
                
        report_sections.append(pub_section)
        
    report_string = "".join(report_sections)
    
    ## Pass the replacement as a function so it is inserted literally. A plain string 
    ## replacement has its backslash escapes and group references processed by re.sub, 
    ## so a backslash in the publication data could raise an error or alter the report.
    report = re.sub(r"(?s)<pub_loop>.*</pub_loop>", lambda _match: report_string, template_string)

    return report





[docs]
def create_tabular_report(publication_dict, config_dict, is_citation_in_prev_pubs_list, tokenized_citations, save_dir_name):
    """Build the rows of the tabular summary report and save them to file.
    
    Every publication contributes one or more rows made by filling in the column 
    templates in config_dict["summary_report"]["columns"]. If any column template 
    contains a publication author or reference keyword, the rows for a publication 
    are built by _build_pub_author_and_reference_rows, otherwise a single row is 
    built with _replace_keywords. The rows are then handed to _save_rows_to_file.
    
    Optional settings read from config_dict["summary_report"] and their defaults:
        separator - ","
        sort - []
        column_order - the keys of "columns"
        file_format - "csv"
        filename - "summary_report.csv" for csv, otherwise "summary_report.xlsx"
    
    Args:
        publication_dict (dict): keys and values match the publications JSON file.
        config_dict (dict): keys and values match the project tracking configuration JSON file.
        is_citation_in_prev_pubs_list (list): list of bools that indicate whether or not the citation at the same index in tokenized_citations is in the prev_pubs
        tokenized_citations (list): list of dicts. Matches the JSON schema for tokenized citations.
        save_dir_name (str): directory to save the report in.
        
    Returns:
        report (str): Either the text of the report if csv or a relative filepath to where the Excel file is saved.
        filename (str): Filename of the report. May have had an .xlsx added to the end.
    """
    
    report_settings = config_dict["summary_report"]
    row_template = copy.deepcopy(report_settings["columns"])
    
    citation_keys = [citation["pub_dict_key"] for citation in tokenized_citations]
    
    ## Optional settings fall back to their defaults.
    separator = report_settings.get("separator", ",")
    sort = report_settings.get("sort", [])
    column_order = report_settings.get("column_order", list(row_template.keys()))
    file_format = report_settings.get("file_format", "csv")
    
    default_filename = "summary_report.csv" if file_format == "csv" else "summary_report.xlsx"
    filename = report_settings.get("filename", default_filename)
    
    ## Search all of the column templates at once for the keywords.
    combined_columns = "".join(row_template.values())
    has_pub_author_keywords = any(keyword in combined_columns for keyword in pub_authors_keyword_map.keys())
    has_reference_keywords = any(keyword in combined_columns for keyword in references_keyword_map.keys())
    needs_multiple_rows = has_reference_keywords or has_pub_author_keywords
    
    rows = []
    for pub in publication_dict:
        citation_position = citation_keys.index(pub)
        matched_citation = tokenized_citations[citation_position]
        if is_citation_in_prev_pubs_list:
            is_citation_in_prev_pubs = is_citation_in_prev_pubs_list[citation_position]
        else:
            is_citation_in_prev_pubs = None
        
        if needs_multiple_rows:
            pub_rows = emails_and_reports_helpers._build_pub_author_and_reference_rows(publication_dict, 
                                                                                       config_dict, 
                                                                                       has_pub_author_keywords, 
                                                                                       has_reference_keywords,
                                                                                       row_template, 
                                                                                       None, 
                                                                                       None, 
                                                                                       pub, 
                                                                                       matched_citation, 
                                                                                       is_citation_in_prev_pubs)
            rows.extend(pub_rows)
            continue
        
        single_row = emails_and_reports_helpers._replace_keywords(row_template, 
                                                                  publication_dict, 
                                                                  None,
                                                                  pub=pub, 
                                                                  tokenized_citation=matched_citation, 
                                                                  is_citation_in_prev_pubs=is_citation_in_prev_pubs)
        rows.append(single_row)
            
    report, filename = emails_and_reports_helpers._save_rows_to_file(rows, 
                                                                     filename, 
                                                                     sort, 
                                                                     column_order, 
                                                                     file_format, 
                                                                     separator, 
                                                                     save_dir_name)
            
    return report, filename





# def replace_keywords(template, publication_dict, pub, tokenized_citation, is_citation_in_prev_pubs, pub_author={}):
#     """Replace keywords in the values of the template dictionary.
    
#     Args:
#         template (dict): keys are column names and values are what the elements of the column should be.
#         publication_dict (dict): keys and values match the publications JSON file.
#         pub (str): the key to the pub in publication_dict.
#         tokenized_citation (dict): The tokenized citation from the reference for the publication.
#         is_citation_in_prev_pubs (bool or None): Whether this publication is in the previous publications or not. If None then it isn't applicable.
#         pub_author (dict): The author in pub.
        
#     Returns:
#         template_copy (dict): template with the keywords replaced in its values.
#     """
    
#     template_copy = copy.deepcopy(template)
    
#     for key in template_copy:
                
#         for keyword, pub_key in simple_publication_keywords_map.items():
#             template_copy[key] = template_copy[key].replace(keyword, str(publication_dict[pub][pub_key]))
            
#         ## build first and last author
#         first_author = str(publication_dict[pub]["authors"][0]["lastname"]) + ", " + str(publication_dict[pub]["authors"][0]["firstname"])
#         template_copy[key] = template_copy[key].replace("<first_author>", first_author)
        
#         last_author = str(publication_dict[pub]["authors"][-1]["lastname"]) + ", " + str(publication_dict[pub]["authors"][-1]["firstname"])
#         template_copy[key] = template_copy[key].replace("<last_author>", last_author)
        
#         authors = ", ".join([str(author["firstname"]) + " " + str(author["lastname"]) for author in publication_dict[pub]["authors"]])
#         template_copy[key] = template_copy[key].replace("<authors>", authors)
        
#         grants = ", ".join(publication_dict[pub]["grants"]) if publication_dict[pub]["grants"] else "None Found"
#         template_copy[key] = template_copy[key].replace("<grants>", grants)
        
#         for keyword, date_key in publication_date_keywords_map.items():
#             template_copy[key] = template_copy[key].replace(keyword, str(publication_dict[pub]["publication_date"][date_key]))
        
#         ## Pub authors keywords
#         if pub_author:
#             for keyword, pub_author_key in pub_authors_keyword_map.items():
#                 template_copy[key] = template_copy[key].replace(keyword, str(pub_author[pub_author_key]))
                
#         ## tokenized keywords
#         for keyword, tok_key in tokenized_keywords_map.items():
#             replacement = str(tokenized_citation[tok_key])
#             if not replacement:
#                 replacement = "None"
#             template_copy[key] = template_copy[key].replace(keyword, replacement)
            
#         tok_authors = convert_tokenized_authors_to_str(tokenized_citation["authors"])
#         template_copy[key] = template_copy[key].replace("<tok_authors>", tok_authors)
        
#         if tokenized_citation["reference_line"]:
#             pretty_print = tokenized_citation["reference_line"].split("\n")
#             pretty_print = " ".join([line.strip() for line in pretty_print])
#             template_copy[key] = template_copy[key].replace("<ref_line>", pretty_print)
#         else:
#             template_copy[key] = template_copy[key].replace("<ref_line>", "N/A")
        
#         if type(is_citation_in_prev_pubs) == bool:
#             template_copy[key] = template_copy[key].replace("<is_in_comparison_file>", str(is_citation_in_prev_pubs))
#         else:
#             template_copy[key] = template_copy[key].replace("<is_in_comparison_file>", "N/A")
            
#     return template_copy





[docs]
def create_tokenization_report(tokenized_citations):
    """Describe how each reference was tokenized, for troubleshooting.
    
    Each citation gets a section with its reference line collapsed onto a single 
    line (or N/A if there isn't one), followed by the authors, title, PMID, and DOI 
    that were tokenized from it. Empty title, PMID, and DOI values are shown as None.
    
    Args:
        tokenized_citations (list): list of dicts. Matches the JSON schema for tokenized citations.
        
    Returns:
        report_string (str): report text built from tokenized_citations.
    """
    
    pieces = []
    for entry in tokenized_citations:
        reference_line = entry["reference_line"]
        if reference_line:
            ## Strip each line of the reference and put them all on one line.
            single_line = " ".join(segment.strip() for segment in reference_line.split("\n"))
        else:
            single_line = "N/A"
        pieces.append("Reference Line: \n\t" + single_line + "\n")
        
        pieces.append("Tokenized Reference: \n\tAuthors: " + convert_tokenized_authors_to_str(entry["authors"]))
        pieces.append("\n\tTitle: " + (entry["title"] or "None"))
        pieces.append("\n\tPMID: " + (str(entry["PMID"]) if entry["PMID"] else "None"))
        pieces.append("\n\tDOI: " + (entry["DOI"] or "None"))
        pieces.append("\n\n")
        
    report_string = "".join(pieces)
    return report_string




###############
## Unused
###############
    

# def create_reference_search_diagnostic(publication_dict, is_citation_in_prev_pubs_list, tokenized_citations):
#     """"""
    
#     report_string = ""
#     for count, citation in enumerate(tokenized_citations):
#         if tokenized_citations[count]["reference_line"]:
#             pretty_print = tokenized_citations[count]["reference_line"].split("\n")
#             pretty_print = " ".join([line.strip() for line in pretty_print])
#             report_string += "Reference Line: " + pretty_print + "\n"
        
#         report_string += "Tokenized Reference: \n\tAuthors: " + convert_tokenized_authors_to_str(citation["authors"]) + " \n\tTitle: " + citation["title"]
#         if citation["PMID"]:
#             report_string += " \n\tPMID: " + str(citation["PMID"])
#         if citation["DOI"]:
#             report_string += " \n\tDOI: " + citation["DOI"]
#         report_string += "\n"
        
#         if tokenized_citations[count]["pub_dict_key"]:
#             doi = publication_dict[tokenized_citations[count]["pub_dict_key"]]["doi"]
#             pmid = publication_dict[tokenized_citations[count]["pub_dict_key"]]["pubmed_id"]
#             pmcid = publication_dict[tokenized_citations[count]["pub_dict_key"]]["PMCID"]
#             if publication_dict[tokenized_citations[count]["pub_dict_key"]]["grants"]:
#                 grants = ", ".join(publication_dict[tokenized_citations[count]["pub_dict_key"]]["grants"])
        
                
#         if not doi:
#             doi = "Not Found"
#         if not pmid:
#             pmid = "Not Found"
#         if not pmcid:
#             pmcid = "Not Found"
#         if not grants:
#             grants = "None Found"
        
#         report_string += "Queried Information: \n\tDOI: " + doi + \
#                          " \n\tPMID: " + pmid + \
#                          " \n\tPMCID: " + pmcid +\
#                          " \n\tGrants: " + grants
#         if is_citation_in_prev_pubs_list:
#             report_string += " \n\tIs In Comparison File: " + str(is_citation_in_prev_pubs_list[count])
        
#         report_string += "\n\n\n"
        
#     return report_string