Source code for academic_tracker.helper_functions

# -*- coding: utf-8 -*-
"""
Helper Functions
~~~~~~~~~~~~~~~~

This module contains helper functions for tasks such as printing and regex searching.
"""

import re
import copy
import collections.abc
import xml.etree.ElementTree as ET

import fuzzywuzzy.fuzz

from . import __main__
from . import webio

DOI_URL = webio.DOI_URL
PUBLICATION_TEMPLATE = webio.PUBLICATION_TEMPLATE


def vprint(*args, verbosity=0):
    """Print the arguments, subject to the program-wide SILENT and VERBOSE flags.

    Nothing is printed when __main__.SILENT is True. Otherwise a call with
    verbosity 0 always prints, and a call with verbosity 1 prints only when
    __main__.VERBOSE is True. Any other verbosity value prints nothing.

    Args:
        *args: values passed through unchanged to print().
        verbosity (int): Either 0 or 1 for different levels of verbosity.
    """
    ## The flags are read on every call so changes made after import are honored.
    verbose_enabled = __main__.VERBOSE
    if __main__.SILENT:
        return

    always_shown = verbosity == 0
    if always_shown or (verbosity == 1 and verbose_enabled):
        print(*args)
def create_authors_by_project_dict(config_dict):
    """Build a per-project mapping of authors from the configuration.

    Every project in config_dict["project_descriptions"] becomes a key. If the
    project lists "authors", only those authors found in config_dict["Authors"]
    are included (a warning is printed for each one that is missing); if it does
    not, every author in config_dict["Authors"] is included. Author attributes
    are deep copied, and any project attribute the author does not already have
    is then added to the copy.

    Args:
        config_dict (dict): schema matches the JSON Project Tracking Configuration file.

    Returns:
        authors_by_project_dict (dict): keys are the projects in the config_dict and the values
        are the authors associated with that project from config_dict["Authors"].
    """
    authors_by_project_dict = {}
    for project, project_attributes in config_dict["project_descriptions"].items():
        if "authors" not in project_attributes:
            ## No explicit author list means the project applies to everyone.
            project_authors = copy.deepcopy(config_dict["Authors"])
        else:
            project_authors = {}
            for author in project_attributes["authors"]:
                if author not in config_dict["Authors"]:
                    vprint("Warning: The author, " + author + ", in the " + project + " project of the project tracking configuration file could not be found in the Authors section of the Configuration JSON file.")
                    continue
                project_authors[author] = copy.deepcopy(config_dict["Authors"][author])

        ## Author level settings win; project settings only fill in the gaps.
        for author_attr in project_authors.values():
            for key, value in project_attributes.items():
                author_attr.setdefault(key, value)

        authors_by_project_dict[project] = project_authors

    return authors_by_project_dict
def adjust_author_attributes(authors_by_project_dict, config_dict):
    """Fold per-project author settings back into config_dict["Authors"].

    For each author in config_dict["Authors"], look at that author's entry in
    every project of authors_by_project_dict and:
        take the lowest cutoff_year seen (9999 if none is given anywhere),
        take the union of all affiliations and of all grants, stored as sorted lists,
        copy over collaborator_report when a project entry has one.
    config_dict is modified in place and also returned.

    Args:
        authors_by_project_dict (dict): keys are the projects in the config_dict and the values
            are the authors associated with that project from config_dict["Authors"].
        config_dict (dict): schema matches the JSON Project Tracking Configuration file.

    Returns:
        config_dict (dict): schema matches the JSON Project Tracking Configuration file.
    """
    for author, author_attr in config_dict["Authors"].items():
        lowest_cutoff = author_attr.get("cutoff_year", 9999)
        all_affiliations = set(author_attr.get("affiliations", ()))
        all_grants = set(author_attr.get("grants", ()))

        for authors_dict in authors_by_project_dict.values():
            if author not in authors_dict:
                continue
            project_author_attr = authors_dict[author]

            if "cutoff_year" in project_author_attr and project_author_attr["cutoff_year"] < lowest_cutoff:
                lowest_cutoff = project_author_attr["cutoff_year"]
            all_affiliations.update(project_author_attr.get("affiliations", ()))
            all_grants.update(project_author_attr.get("grants", ()))
            if "collaborator_report" in project_author_attr:
                author_attr["collaborator_report"] = project_author_attr["collaborator_report"]

        ## Sort so the stored lists do not depend on set iteration order.
        sorted_grants = sorted(all_grants)
        sorted_affiliations = sorted(all_affiliations)

        author_attr["cutoff_year"] = lowest_cutoff
        author_attr["affiliations"] = sorted_affiliations
        author_attr["grants"] = sorted_grants

    return config_dict
def regex_match_return(regex, string_to_match):
    """Run re.match() and return the captured groups, or an empty tuple.

    The regex is only tested at the start of string_to_match, as re.match()
    does. A regex without capture groups also yields an empty tuple even when
    it matches.

    Args:
        regex (str): A string with a regular expression to be delivered to re.match().
        string_to_match (str): The string to match with the regex.

    Returns:
        (tuple): either the tuple of the matched groups in the regex or an empty tuple if a match wasn't found.
    """
    found = re.match(regex, string_to_match)
    if found is None:
        return ()
    return found.groups()
def regex_group_return(regex_groups, group_index):
    """Return the group at group_index, or an empty string if there is none.

    An empty string is returned when group_index is out of range of
    regex_groups, in either direction, or when the group at that index is
    empty or None (an optional group that did not participate in the match).
    Negative indexes that are in range count from the end as usual.

    Args:
        regex_groups (tuple): A tuple returned from a matched regex.groups() call.
        group_index (int): The index of the regex_groups to return.

    Returns:
        (str): Either empty string or the group string matched by the regex.
    """
    group_count = len(regex_groups)
    ## Bound the index on both sides so an out of range negative index
    ## returns "" instead of raising IndexError.
    if -group_count <= group_index < group_count:
        return regex_groups[group_index] or ""
    return ""
def regex_search_return(regex, string_to_search):
    """Run re.search() and return the captured groups, or an empty tuple.

    Unlike regex_match_return the regex may match anywhere in
    string_to_search. A regex without capture groups also yields an empty
    tuple even when it matches.

    Args:
        regex (str): A string with a regular expression to be delivered to re.search().
        string_to_search (str): The string to match with the regex.

    Returns:
        (tuple): either the tuple of the matched groups in the regex or an empty tuple if a match wasn't found.
    """
    found = re.search(regex, string_to_search)
    if found is None:
        return ()
    return found.groups()
def _generate_first_name_match_regex(name):
    """Generate a regular expression to match the given first name.

    The pattern accepts the name preceded by other text and a space, the name
    followed by a space and other text, or the name at the start of the
    string. Because the last alternative is used with re.match() it is a
    prefix match, so "hu" also matches "hubert".

    The name is escaped so that characters with a special meaning in regular
    expressions, such as "(", "+" or ".", are matched literally.

    Args:
        name (str): first name to generate regex for.

    Returns:
        (str) regular expression to use with re.match().
    """
    literal_name = re.escape(name)
    return ".* " + literal_name + "|" + literal_name + " .*|" + literal_name
def do_strings_fuzzy_match(string1, string2, match_ratio=90):
    """Report whether two strings fuzzy match at or above match_ratio.

    Both strings are lowercased and compared with fuzzywuzzy.fuzz.ratio in
    both argument orders; a score greater than or equal to match_ratio in
    either order counts as a match. If either string is None the result is
    False.

    Args:
        string1 (str|None): a string to fuzzy match.
        string2 (str|None): a string to fuzzy match.
        match_ratio (int): the ratio (0-100) that the match must be greater than or equal to, to return True.

    Returns:
        (bool): True if strings match, False otherwise.
    """
    if string1 is None or string2 is None:
        return False

    lowered1 = string1.lower()
    lowered2 = string2.lower()

    ## The second comparison is only evaluated when the first one falls short.
    forward_matches = fuzzywuzzy.fuzz.ratio(lowered1, lowered2) >= match_ratio
    return forward_matches or fuzzywuzzy.fuzz.ratio(lowered2, lowered1) >= match_ratio
# def match_authors_in_pub_PubMed(authors_json, author_list): # """Look for matching authors in PubMed pub data. # Goes through the author list from PubMed and tries matching to an author in # authors_json using firstname, lastname, and affiliations. # Args: # authors_json (dict): keys are authors and values are author attributes. Matches authors JSON schema. # author_list (list): list of dicts where each dict is attributes of an author. # Returns: # author_list (list): either the author list with matched authors containing an additional author_id attribute, or an empty list if no authors were matched. # """ # ## pub.authors are dictionaries with lastname, firstname, initials, and affiliation. firstname can have initials at the end ex "Andrew P" # publication_has_affiliated_author = False # for author_items in author_list: # author_items_affiliation = str(author_items.get("affiliation")).lower() # author_items_first_name = str(author_items.get("firstname")).replace(".","").lower() # author_items_last_name = str(author_items.get("lastname")).replace(".","").lower() # for author, author_attributes in authors_json.items(): # author_json_first_name = author_attributes["first_name"].replace(".","").lower() # author_json_last_name = author_attributes["last_name"].replace(".","").lower() # ## Match the author's first and last name and then match affiliations. # ## The first name is matched with an additional .* to try and allow for the addition of initials, but this could cause bad matches. # ## For example the name Hu will match Hubert. Counting on the last name to reduce errors. # ## Note that the PubMed query will return publications where the author is just a collaborater, so not finding a match in the authors isn't uncommon. 
# if re.match(_generate_first_name_match_regex(author_json_first_name), author_items_first_name) and \ # author_json_last_name == author_items_last_name: # ## affiliations are sets of strings so the intersection should have at least 1 string for a match. # if any([affiliation.lower() in author_items_affiliation for affiliation in author_attributes["affiliations"]]): # publication_has_affiliated_author = True # author_items["author_id"] = author # break # return author_list if publication_has_affiliated_author else [] # def match_authors_in_pub_Crossref(authors_json, author_list): # """Look for matching authors in Crossref pub data. # Goes through the author list from Crossref and tries matching to an author in # authors_json using firstname, lastname, and affiliations, or ORCID if ORCID # is present. # Args: # authors_json (dict): keys are authors and values are author attributes. Matches authors JSON schema. # author_list (list): list of dicts where each dict is attributes of an author. # Returns: # author_list (list): either the author list with matched authors containing an additional author_id attribute, or an empty list if no authors were matched. # """ # publication_has_affiliated_author = False # for author_items in author_list: # ## If the author has no affiliation or ORCID id then go to the next. 
# if not author_items["affiliation"] and not "ORCID" in author_items: # continue # can_name_match = True # if author_items.get("given") is None: # can_name_match = False # else: # author_items_first_name = author_items["given"].replace(".","").lower() # if author_items.get("family") is None: # can_name_match = False # else: # author_items_last_name = author_items["family"].replace(".","").lower() # if author_items["affiliation"] and "name" in author_items["affiliation"][0]: # author_items_affiliation = author_items["affiliation"][0]["name"].lower() # else: # author_items_affiliation = [] # if "ORCID" in author_items: # ORCID_id = regex_search_return(r"(\d{4}-\d{4}-\d{4}-\d{3}[0,1,2,3,4,5,6,7,8,9,X])", author_items["ORCID"])[0] # else: # ORCID_id = "" # for author, author_attributes in authors_json.items(): # author_json_first_name = author_attributes["first_name"].replace(".","").lower() # author_json_last_name = author_attributes["last_name"].replace(".","").lower() # ## Match the author's first and last name and then match affiliations. # ## The first name is matched with an additional .* to try and allow for the addition of initials, but this could cause bad matches. # ## For example the name Hu will match Hubert. Counting on the last name to reduce errors. # ## Note that the PubMed query will return publications where the author is just a collaborater, so not finding a match in the authors isn't uncommon. # if can_name_match and re.match(_generate_first_name_match_regex(author_json_first_name), author_items_first_name) and \ # author_json_last_name == author_items_last_name: # ## affiliations are sets of strings so the intersection should have at least 1 string for a match. 
# if any([affiliation.lower() in author_items_affiliation for affiliation in author_attributes["affiliations"]]): # publication_has_affiliated_author = True # author_items["author_id"] = author # break # elif ORCID_id and "ORCID" in author_attributes and ORCID_id == author_attributes["ORCID"]: # publication_has_affiliated_author = True # author_items["author_id"] = author # break # if publication_has_affiliated_author: # return author_list # else: # return []
def match_pub_authors_to_citation_authors(citation_authors, author_list):
    """Try to match authors in pub data to authors in citation data.

    Goes through the author_list from a publication and tries matching to an author in
    citation_authors. For each pair the checks are, in order:
        ORCID equality, when the publication author has an ORCID,
        fuzzy match on collective name, when the publication author is a collective author,
        exact match on last name (periods removed, case ignored) otherwise.
    The first successful check ends the search.

    Citation authors that lack the field needed for a comparison are skipped,
    for example a collective citation author (no "last" key) when the
    publication author is an individual.

    Args:
        citation_authors (list): list of dicts where each dict can have last name and ORCID or collective_name and ORCID.
        author_list (list): list of dicts where each dict is the attributes of an author.
            Each dict is expected to have an "ORCID" key and either "collectivename" or "lastname".

    Returns:
        (bool): True if an author was matched, False otherwise.
    """
    for author_items in author_list:
        is_collective_author = "collectivename" in author_items
        author_items_collective_name = None
        author_items_last_name = None
        if is_collective_author:
            author_items_collective_name = author_items["collectivename"]
        elif author_items["lastname"] is not None:
            author_items_last_name = author_items["lastname"].replace(".", "").lower()

        for author_attributes in citation_authors:
            ## Try to match on ORCID.
            if author_items["ORCID"] and "ORCID" in author_attributes and author_items["ORCID"] == author_attributes["ORCID"]:
                return True

            ## If it is a collective author then match on collectivename, else match on last name.
            if is_collective_author:
                if author_items_collective_name is None or "collective_name" not in author_attributes:
                    continue
                if do_strings_fuzzy_match(author_attributes["collective_name"], author_items_collective_name):
                    return True
            else:
                ## Collective citation authors have no last name, so they
                ## can never name match an individual publication author.
                citation_author_last_name = author_attributes.get("last")
                if author_items_last_name is None or citation_author_last_name is None:
                    continue
                if citation_author_last_name.replace(".", "").lower() == author_items_last_name:
                    return True

    return False
def match_pub_authors_to_config_authors(authors_json, author_list):
    """Try to match authors in pub data to authors in config data.

    Goes through the author_list from a publication and tries matching to an author in
    authors_json. For each pair the checks are, in order:
        ORCID equality, when the publication author has an ORCID,
        fuzzy match on collective name, when the publication author is a collective author,
        first name, last name, and affiliation otherwise.
    The first config author that matches is recorded on the publication author
    as "author_id", and its ORCID is copied over when the publication author
    has none. The dicts in author_list are modified in place.

    Config authors that lack the fields needed for a comparison are skipped,
    for example a collective config author (no "first_name" or "last_name")
    when the publication author is an individual.

    Args:
        authors_json (dict): keys are authors and values are author attributes. Matches authors JSON schema.
        author_list (list): list of dicts where each dict is the attributes of an author.
            Each dict is expected to have an "ORCID" key and either "collectivename" or
            "firstname", "lastname", and "affiliation".

    Returns:
        author_list (list): either the author list with matched authors containing an additional
        author_id and/or ORCID attribute, or an empty list if no authors were matched.
    """
    publication_has_affiliated_author = False
    for author_items in author_list:
        is_collective_author = "collectivename" in author_items
        author_items_collective_name = None
        author_items_first_name = None
        author_items_last_name = None
        author_items_affiliation = None
        can_name_match = False
        if is_collective_author:
            author_items_collective_name = author_items["collectivename"]
        else:
            raw_first_name = author_items["firstname"]
            raw_last_name = author_items["lastname"]
            raw_affiliation = author_items["affiliation"]
            ## All three are needed to match an individual by name.
            can_name_match = raw_first_name is not None and \
                             raw_last_name is not None and \
                             raw_affiliation is not None
            if can_name_match:
                author_items_first_name = raw_first_name.replace(".", "").lower()
                author_items_last_name = raw_last_name.replace(".", "").lower()
                author_items_affiliation = raw_affiliation.lower()

        for author, author_attributes in authors_json.items():
            ## Try to match on ORCID.
            orcid_matched = bool(author_items["ORCID"]) and \
                            "ORCID" in author_attributes and \
                            author_items["ORCID"] == author_attributes["ORCID"]

            if not orcid_matched:
                ## If it is a collective author then match on collectivename, else match on first and last.
                if is_collective_author:
                    if author_items_collective_name is None or "collective_name" not in author_attributes:
                        continue
                    if not do_strings_fuzzy_match(author_attributes["collective_name"], author_items_collective_name):
                        continue
                else:
                    ## Collective config authors have no first or last name,
                    ## so they can never name match an individual.
                    author_json_first_name = author_attributes.get("first_name")
                    author_json_last_name = author_attributes.get("last_name")
                    if not can_name_match or author_json_first_name is None or author_json_last_name is None:
                        continue
                    author_json_first_name = author_json_first_name.replace(".", "").lower()
                    author_json_last_name = author_json_last_name.replace(".", "").lower()

                    ## Match the author's first and last name and then match affiliations.
                    ## The first name is matched with an additional .* to try and allow for the addition of initials, but this could cause bad matches.
                    ## For example the name Hu will match Hubert. Counting on the last name to reduce errors.
                    ## Note that the PubMed query will return publications where the author is just a collaborator, so not finding a match in the authors isn't uncommon.
                    if not re.match(_generate_first_name_match_regex(author_json_first_name), author_items_first_name) or \
                       author_json_last_name != author_items_last_name:
                        continue
                    ## See if any of the config author's affiliations appear in the publication author's affiliation string.
                    if not any(affiliation.lower() in author_items_affiliation for affiliation in author_attributes["affiliations"]):
                        continue

            publication_has_affiliated_author = True
            author_items["author_id"] = author
            ## Only fill in a missing ORCID, never overwrite one from the publication.
            if author_items["ORCID"] is None and "ORCID" in author_attributes:
                author_items["ORCID"] = author_attributes["ORCID"]
            break

    if publication_has_affiliated_author:
        return author_list
    return []
def match_authors_in_prev_pub(prev_author_list, new_author_list):
    """Look for matching authors in previous pub data.

    Goes through the new_author_list and tries to find a match for each author in
    the prev_author_list. Any authors that aren't matched are added to a combined list.
    Both lists are expected to be a list of a dicts.
    {"firstname": author's first name,
     "lastname": author's last name,
     "author_id" : author's ID,
     "ORCID": ORCID ID}

    {"collectivename": collective name,
     "author_id": author's ID,
     "ORCID": ORCID ID}

    For each pair the checks are, in order:
        author_id equality, when the new author has an author_id that is not None,
        ORCID equality, when the new author has an ORCID,
        fuzzy match on collective name, when the new author is a collective author,
        first name and last name otherwise.
    A missing or None author_id never counts as a match, and previous authors
    that lack the fields needed for a comparison are skipped.

    If author_id is missing or None in the prev_author_list, it will be updated in the
    combined_author_list if the matched author in new_author_list has it. The same can be said
    for the ORCID attribute. prev_author_list itself is not modified.

    Args:
        prev_author_list (list): list of dicts where each dict is the attributes of an author.
        new_author_list (list): list of dicts where each dict is the attributes of an author.

    Returns:
        combined_author_list (list): a deep copy of prev_author_list, updated with "author_id" and
        "ORCID" for entries that matched a new author, followed by the new authors that
        did not match any previous author.
    """
    combined_author_list = copy.deepcopy(prev_author_list)

    for new_author_attributes in new_author_list:
        new_author_id = new_author_attributes.get("author_id")
        is_collective_author = "collectivename" in new_author_attributes
        new_collective_name = None
        new_firstname_adjusted = None
        new_lastname_adjusted = None
        if is_collective_author:
            new_collective_name = new_author_attributes["collectivename"]
        else:
            if new_author_attributes["firstname"] is not None:
                new_firstname_adjusted = new_author_attributes["firstname"].replace(".", "").lower()
            if new_author_attributes["lastname"] is not None:
                new_lastname_adjusted = new_author_attributes["lastname"].replace(".", "").lower()
        can_name_match = new_firstname_adjusted is not None and new_lastname_adjusted is not None

        new_author_matched = False
        for i, prev_author_attributes in enumerate(prev_author_list):
            ## Try to match with author_id. A None author_id means "unknown",
            ## so two unknown IDs must not be treated as the same author.
            if new_author_id is not None and new_author_id == prev_author_attributes.get("author_id"):
                pair_matches = True
            ## Try to match with ORCID.
            elif new_author_attributes["ORCID"] and new_author_attributes["ORCID"] == prev_author_attributes["ORCID"]:
                pair_matches = True
            ## If it is a collective author then match on collectivename, else match on first and last.
            elif is_collective_author:
                prev_collective_name = prev_author_attributes.get("collectivename")
                pair_matches = new_collective_name is not None and \
                               prev_collective_name is not None and \
                               do_strings_fuzzy_match(prev_collective_name, new_collective_name)
            else:
                ## If the first and last names are missing for either author then they can't be matched.
                ## A previous collective author has neither key.
                prev_firstname = prev_author_attributes.get("firstname")
                prev_lastname = prev_author_attributes.get("lastname")
                if not can_name_match or prev_firstname is None or prev_lastname is None:
                    continue
                prev_firstname_adjusted = prev_firstname.replace(".", "").lower()
                prev_lastname_adjusted = prev_lastname.replace(".", "").lower()
                pair_matches = bool(re.match(_generate_first_name_match_regex(new_firstname_adjusted), prev_firstname_adjusted)) and \
                               new_lastname_adjusted == prev_lastname_adjusted

            if not pair_matches:
                continue

            new_author_matched = True
            ## Fill in identifiers the previous entry was missing.
            if new_author_id and not prev_author_attributes.get("author_id"):
                combined_author_list[i]["author_id"] = new_author_id
            if prev_author_attributes["ORCID"] is None:
                combined_author_list[i]["ORCID"] = new_author_attributes["ORCID"]
            break

        if not new_author_matched:
            combined_author_list.append(new_author_attributes)

    return combined_author_list
def _match_references_in_prev_pub(prev_reference_list, new_reference_list):
    """Look for matching references in previous pub data.

    Goes through new_reference_list and tries to find a match for each reference
    in prev_reference_list. A matched previous reference is merged with the new
    one through _update, and any new reference that isn't matched is appended
    to the combined list.

    Both lists are expected to be lists of dicts such as:
        {"citation": reference citation,
         "title": reference title,
         "doi": reference DOI,
         "pubmed_id": reference PMID,
         "PMCID": reference PMCID}
    Keys are read with .get(), so any of them may be missing.

    A new reference matches a previous one when any of these holds:
        * their DOIs are equal, ignoring case
        * their PMIDs are equal
        * their PMCIDs are equal
        * their titles match according to do_strings_fuzzy_match
        * the title of one is contained, ignoring case, in the citation of the other
        * phrases common to both citations make up at least 85% of either one
          (see _compute_common_phrase_percent, called with a minimum phrase length of 4)

    Args:
        prev_reference_list (list): list of dicts where each dict is the attributes of a reference.
        new_reference_list (list): list of dicts where each dict is the attributes of a reference.

    Returns:
        combined_reference_list (list): a deep copy of prev_reference_list whose matched entries have been
            updated, followed by the unmatched dicts of new_reference_list (appended as they are, not copied).
    """
    characters_to_remove = ['.', ',', ';', '(', ')', '[', ']', '{', '}']

    combined_reference_list = copy.deepcopy(prev_reference_list)
    ## A set keeps the "already matched" check constant time.
    matched_indexes = set()
    for new_reference_attributes in new_reference_list:
        ## These only depend on the new reference, so look them up once per reference.
        new_doi = new_reference_attributes.get("doi")
        new_pmid = new_reference_attributes.get("pubmed_id")
        new_pmcid = new_reference_attributes.get("PMCID")
        new_citation = new_reference_attributes.get("citation")
        new_title = new_reference_attributes.get("title")

        new_reference_matched = False
        for i, prev_reference_attributes in enumerate(prev_reference_list):
            ## There is a rare case where 2 different publications have the same title and
            ## are referenced in the same paper. Need to disallow matching to the same reference
            ## twice to avoid duplicates. Example paper: https://pubmed.ncbi.nlm.nih.gov/24404440/
            ## reference 1 and 3 have the same title, but are not the same reference.
            if i in matched_indexes:
                continue

            prev_doi = prev_reference_attributes.get("doi")
            prev_pmid = prev_reference_attributes.get("pubmed_id")
            prev_pmcid = prev_reference_attributes.get("PMCID")
            prev_citation = prev_reference_attributes.get("citation")
            prev_title = prev_reference_attributes.get("title")

            ## Ordered from cheapest to most expensive check so the "or" chain short circuits early.
            if ((new_doi and prev_doi and new_doi.lower() == prev_doi.lower()) or
                    (new_pmid and prev_pmid and new_pmid == prev_pmid) or
                    (new_pmcid and prev_pmcid and new_pmcid == prev_pmcid) or
                    (new_title and prev_title and do_strings_fuzzy_match(new_title, prev_title)) or
                    (new_title and prev_citation and new_title.lower() in prev_citation.lower()) or
                    (new_citation and prev_title and prev_title.lower() in new_citation.lower()) or
                    ((percentages := _compute_common_phrase_percent(prev_citation, new_citation, characters_to_remove, 4)) and
                     (percentages[0] >= 85 or percentages[1] >= 85))):
                _update(combined_reference_list[i], new_reference_attributes)
                new_reference_matched = True
                matched_indexes.add(i)
                break

        if not new_reference_matched:
            combined_reference_list.append(new_reference_attributes)

    return combined_reference_list


def _compute_common_phrase_percent(prev_citation, new_citation, characters_to_remove, min_len=2):
    """Find common phrases between prev_citation and new_citation and return percentage.

    Both citations are lower cased and have characters_to_remove deleted. The common
    subphrases between them are found with find_common_subphrases and removed from each
    citation. For each citation the percentage is then:

        len(common phrases) / (len(common phrases) + len(what is left of the citation)) * 100

    If either citation is None or empty, then return None. If nothing is left of a citation
    to compare (for example it only contained characters_to_remove) its percentage is 0.0.

    Example:
        prev_citation = 'open js foundation 2019 accessed on 1 january 2023 available online: https://openjsforg/'
        new_citation = '2023 january 01 open js foundation available online: https://openjsforg/'

        common_base_string = ' https://openjsforg/open js foundation available online: january 2023 '
        prev_common_plus_uncommon = ' https://openjsforg/open js foundation available online: january 2023 2019 accessed on 1'
        new_common_plus_uncommon = ' https://openjsforg/open js foundation available online: january 2023 01'

        prev_percent = 78.88888888888889
        new_percent = 97.26027397260275

    Args:
        prev_citation (str|None): a string to find common subphrases with another string.
        new_citation (str|None): a string to find common subphrases with another string.
        characters_to_remove (list): a list of characters or strings to remove from each citation before finding common subphrases.
        min_len (int): the minimum length of a subphrase.

    Returns:
        ((float, float)|None): if either citation is None or empty, then return None, else the percentage
            of common to common plus uncommon phrase length for prev_citation and new_citation respectively.
    """
    if not prev_citation or not new_citation:
        return None

    ## re.escape quotes every regex metacharacter and, unlike prefixing a single backslash,
    ## also works for entries longer than 1 character.
    citation_strip_regex = "|".join(re.escape(char) for char in characters_to_remove)
    stripped_prev_citation = re.sub(citation_strip_regex, "", prev_citation.lower())
    stripped_new_citation = re.sub(citation_strip_regex, "", new_citation.lower())

    ## Nothing can be in common with an empty string.
    if not stripped_prev_citation or not stripped_new_citation:
        return 0.0, 0.0

    common_subphrases = find_common_subphrases(stripped_prev_citation, stripped_new_citation, min_len)

    prev_citation_common_phrases_removed = stripped_prev_citation
    new_citation_common_phrases_removed = stripped_new_citation
    for phrase in common_subphrases:
        trimmed_phrase = phrase.strip()
        prev_citation_common_phrases_removed = prev_citation_common_phrases_removed.replace(trimmed_phrase, "")
        new_citation_common_phrases_removed = new_citation_common_phrases_removed.replace(trimmed_phrase, "")

    common_length = sum(len(phrase) for phrase in common_subphrases)

    def _percent_common(uncommon_remainder):
        """Percentage of common characters, guarding against an empty denominator."""
        total_length = common_length + len(uncommon_remainder.strip())
        return common_length / total_length * 100 if total_length else 0.0

    return (_percent_common(prev_citation_common_phrases_removed),
            _percent_common(new_citation_common_phrases_removed))
def _pubmed_xml_text(element, path, lowercase=False):
    """Return the text of the first sub-element of element matching path, or None.

    None is returned both when nothing matches path and when the matched element is empty.
    """
    found = element.find(path)
    if found is None or found.text is None:
        return None
    return found.text.lower() if lowercase else found.text


def create_pub_dict_for_saving_PubMed(pub, include_xml=False):
    """Convert pymed.PubMedArticle to a dictionary and modify it for saving.

    Converts a pymed.PubMedArticle to a dictionary with pub.toDict(), then fills in
    "pubmed_id", "doi", "PMCID", "grants", "references", and "authors" from the article's XML,
    deletes the "xml" key if include_xml is False, and converts the "publication_date" key
    to a dictionary with "year", "month", and "day" keys. DOIs are lower cased.

    Args:
        pub (pymed.PubMedArticle): publication to convert to a dictionary.
        include_xml (bool): if True, include the raw XML query as a string in the key "xml".

    Returns:
        pub_id (str|None): the ID of the publication, DOI_URL + DOI if there is a DOI, otherwise the PMID.
        pub_dict (dict): pub converted to a dictionary.
    """
    pub_dict = pub.toDict()
    xml_root = pub_dict["xml"]

    article_id_path = "PubmedData/ArticleIdList/ArticleId[@IdType='{}']"
    pub_dict["pubmed_id"] = _pubmed_xml_text(xml_root, article_id_path.format("pubmed"))
    pub_dict["doi"] = _pubmed_xml_text(xml_root, article_id_path.format("doi"), lowercase=True)
    pub_dict["PMCID"] = _pubmed_xml_text(xml_root, article_id_path.format("pmc"))

    pub_dict["grants"] = [grant.text for grant in pub.xml.findall(".//GrantID")]

    reference_id_path = "ArticleIdList/ArticleId[@IdType='{}']"
    references = []
    for reference in xml_root.findall(".//Reference"):
        temp_dict = {"citation": _pubmed_xml_text(reference, "Citation"),
                     "title": None,
                     "PMCID": _pubmed_xml_text(reference, reference_id_path.format("pmc")),
                     "pubmed_id": _pubmed_xml_text(reference, reference_id_path.format("pubmed")),
                     "doi": _pubmed_xml_text(reference, reference_id_path.format("doi"), lowercase=True)}
        ## Only add references that have at least 1 non-null value.
        if any(value is not None for value in temp_dict.values()):
            references.append(temp_dict)
    pub_dict["references"] = references

    authors = []
    for author in xml_root.findall(".//Author"):
        last_name = _pubmed_xml_text(author, "LastName")
        collective_name = _pubmed_xml_text(author, "CollectiveName")
        first_name = _pubmed_xml_text(author, "ForeName")
        initials = _pubmed_xml_text(author, "Initials")

        ## Empty Affiliation elements have no text and can't be joined, so skip them.
        affiliation_texts = [element.text for element in author.findall("AffiliationInfo/Affiliation")
                             if element.text is not None]
        affiliation = '\n'.join(affiliation_texts) if affiliation_texts else None

        orcid_text = _pubmed_xml_text(author, "Identifier[@Source='ORCID']")
        orcid = extract_ORCID_from_string(orcid_text) if orcid_text is not None else None

        if collective_name:
            temp_dict = {"collectivename": collective_name,
                         "ORCID": orcid,
                         "author_id": None}
        else:
            temp_dict = {"lastname": last_name if last_name else collective_name,
                         "firstname": first_name,
                         "initials": initials,
                         "affiliation": affiliation,
                         "ORCID": orcid,
                         "author_id": None}
        ## Only add authors that have at least 1 non-null value.
        if any(value is not None for value in temp_dict.values()):
            authors.append(temp_dict)
    pub_dict["authors"] = authors

    if include_xml:
        pub_dict["xml"] = str(ET.tostring(xml_root), encoding="utf-8")
    else:
        del pub_dict["xml"]

    publication_date = pub_dict["publication_date"]
    if publication_date:
        pub_dict["publication_date"] = {"year": publication_date.year,
                                        "month": publication_date.month,
                                        "day": publication_date.day}
    else:
        pub_dict["publication_date"] = {"year": None, "month": None, "day": None}

    pub_id = DOI_URL + pub_dict["doi"] if pub_dict["doi"] else pub_dict["pubmed_id"]

    return pub_id, pub_dict
def create_pub_dict_for_saving_Crossref(work, prev_query):
    """Create the standard pub_dict from a Crossref query work dict.

    The publication's ID is DOI_URL + DOI (lower cased) when the work has a DOI,
    otherwise the work's "URL", otherwise the first non-empty "URL" in the work's "link" list.
    Works without a title, or without anything usable as an ID, are rejected.

    Args:
        work (dict): the dictionary for a publication returned in a Crossref query.
        prev_query (dict|None): a dictionary containing publications from a previous query, used for message printing.
            The warning about a missing ID is only printed when this is empty or None.

    Returns:
        pub_id (str|None): the ID of the publication (DOI or URL). If None, an ID couldn't be determined.
        pub_dict (dict|None): a copy of PUBLICATION_TEMPLATE with values filled in from the Crossref publication.
            If None, an ID couldn't be determined.
    """
    ## The "title" list can be present but empty, which is as useless as it being missing.
    titles = work.get("title")
    if not titles:
        return None, None
    title = titles[0]

    ## Look for DOI
    doi = work["DOI"].lower() if "DOI" in work else None

    ## Look for external URL
    if "URL" in work:
        external_url = work["URL"]
    elif "link" in work:
        ## Take the first usable URL, there may not be one.
        external_url = next((link["URL"] for link in work["link"] if link.get("URL")), None)
    else:
        external_url = None

    if doi:
        pub_id = DOI_URL + doi
    elif external_url:
        pub_id = external_url
    else:
        if not prev_query:
            vprint("Warning: Could not find a DOI or external URL for a publication when searching Crossref. It will not be in the publications.", verbosity=1)
            vprint("Title: " + title, verbosity=1)
        return None, None

    ## Determine authors and put them in unified form.
    new_author_list = []
    for cr_author_dict in work.get("author", []):
        orcid = None
        if "ORCID" in cr_author_dict:
            orcid = extract_ORCID_from_string(cr_author_dict["ORCID"])
        temp_dict = {"ORCID": orcid, "author_id": None}

        ## If "name" is in dict then it is a collective author and not an individual.
        if "name" in cr_author_dict:
            temp_dict["collectivename"] = cr_author_dict["name"]
        else:
            temp_dict["lastname"] = cr_author_dict.get("family")
            temp_dict["firstname"] = cr_author_dict.get("given")
            temp_dict["initials"] = None

        ## "affiliation" is read defensively so an author without the key is not fatal.
        affiliations = [aff_text for affiliation in cr_author_dict.get("affiliation") or []
                        if (aff_text := affiliation.get("name"))]
        temp_dict["affiliation"] = '\n'.join(affiliations) if affiliations else None

        ## Only add authors that have at least 1 non-null value.
        if any(value is not None for value in temp_dict.values()):
            new_author_list.append(temp_dict)

    ## Find published date
    publication_year = publication_month = publication_day = None
    date_key = next((key for key in ("published", "published-online", "published-print") if key in work), None)
    if date_key is not None:
        ## date-parts is [year, month, day], but month and day may be left off, so pad with None.
        date_parts = list(work[date_key]["date-parts"][0][:3])
        publication_year, publication_month, publication_day = (date_parts + [None] * 3)[:3]

    ## look for grants in results
    found_grants = []
    for funder in work.get("funder", []):
        if awards := funder.get("award"):
            found_grants += awards

    ## Look for journal
    journal = work["publisher"] if "publisher" in work else None

    ## Look for references and put them in unified form.
    references = []
    for reference in work.get("reference", []):
        reference_dict = {"citation": reference.get("unstructured"),
                          "title": reference.get("article-title") or reference.get("series-title"),
                          "PMCID": None,
                          "pubmed_id": None,
                          "doi": reference.get("DOI")}
        ## Only add references that have at least 1 non-null value.
        if any(value is not None for value in reference_dict.values()):
            references.append(reference_dict)

    ## Build the pub_dict from what we were able to collect.
    pub_dict = copy.deepcopy(PUBLICATION_TEMPLATE)
    pub_dict["doi"] = doi
    pub_dict["publication_date"]["year"] = publication_year
    pub_dict["publication_date"]["month"] = publication_month
    pub_dict["publication_date"]["day"] = publication_day
    pub_dict["journal"] = journal
    pub_dict["grants"] = found_grants
    pub_dict["authors"] = new_author_list
    pub_dict["title"] = title
    pub_dict["references"] = references

    return pub_id, pub_dict
## An ORCID ID is 16 characters, normally written as 4 dash separated groups of 4.
## The first 15 characters are digits and the last is a checksum that is a digit or "X".
ORCID_regex = r"(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])"
## The same ID written without the dashes.
alt_ORCID_regex = r"(\d{15}[0-9X])"


def extract_ORCID_from_string(string):
    """Extract an ORCID ID from a string.

    Looks for the dashed form first (0000-0000-0000-0000) and then for 16 characters
    in a row without dashes, which are returned with the dashes put back in.

    Args:
        string (str|None): the string to extract the ID from.

    Returns:
        (str|None): either the extracted ID as a string in dashed form, or None if
            string is empty, None, or doesn't contain an ID.
    """
    if not string:
        return None

    if re_match := re.search(ORCID_regex, string):
        return re_match.group(1)

    if re_match := re.search(alt_ORCID_regex, string):
        captured_string = re_match.group(1)
        return "-".join(captured_string[start:start + 4] for start in range(0, 16, 4))

    return None
def is_fuzzy_match_to_list(str_to_match, list_to_match):
    """True if string is a fuzzy match to any string in list, False otherwise.

    Whether 2 strings match is decided by do_strings_fuzzy_match.

    Args:
        str_to_match (str): string to compare with list
        list_to_match (list): list of strings to compare with str_to_match

    Returns:
        (bool): True if str_to_match is a match to any string in list_to_match, False otherwise.
    """
    ## A generator lets any() stop at the first match instead of comparing against every string.
    return any(do_strings_fuzzy_match(str_to_match, list_string) for list_string in list_to_match)
def fuzzy_matches_to_list(str_to_match, list_to_match):
    """Return the index and string of every string in list_to_match that fuzzy matches str_to_match.

    Whether 2 strings match is decided by do_strings_fuzzy_match.

    Args:
        str_to_match (str): string to compare with list
        list_to_match (list): list of strings to compare with str_to_match

    Returns:
        (list): list of matches (tuples) in list order, each one being the index of the matching
            string in list_to_match followed by the string itself. [(9, "title 1"), ...]
    """
    matches = []
    for position, candidate in enumerate(list_to_match):
        if do_strings_fuzzy_match(str_to_match, candidate):
            matches.append((position, candidate))
    return matches
def is_pub_in_publication_dict(pub_id, title, publication_dict, titles=None):
    """True if pub_id is in publication_dict or title is a fuzzy match to a known title.

    The lower cased pub_id is looked up in publication_dict first. Only when that fails is
    title compared, with is_fuzzy_match_to_list, against titles. If titles is None or empty
    the non-empty titles found in publication_dict are used instead.

    Args:
        pub_id (str): pub_id to check against in publication_dict to see if it already exists.
        title (str): title corresponding to pub_id to check against titles in publication_dict.
        publication_dict (dict): keys are pub_ids and values are pub attributes.
        titles (list|None): list of strings that should be titles to fuzzy match to title.

    Returns:
        (bool): True if the pub_id is in publication_dict or title is fuzzy matched in titles, False otherwise
    """
    if pub_id.lower() in publication_dict:
        return True

    candidate_titles = titles
    if not candidate_titles:
        candidate_titles = []
        for pub_attr in publication_dict.values():
            stored_title = pub_attr["title"]
            if stored_title:
                candidate_titles.append(stored_title)

    return bool(is_fuzzy_match_to_list(title, candidate_titles))
def get_pub_id_in_publication_dict(pub_id, title, publication_dict):
    """Get the key in publication_dict of the publication that matches pub_id or fuzzy matches title.

    The lower cased pub_id is looked up in publication_dict first. If it isn't there, the key of
    the first publication whose "title" matches title according to do_strings_fuzzy_match is used.
    It is assumed every dictionary in publication_dict will have a "title" key with a string value.

    Args:
        pub_id (str): pub_id to check against in publication_dict to see if it already exists.
        title (str): title corresponding to pub_id to check against titles in publication_dict.
        publication_dict (dict): keys are pub_ids and values are pub attributes.

    Returns:
        (str|None): the pub_id matched in publication_dict or None if nothing was found.
    """
    lowered_pub_id = pub_id.lower()
    if lowered_pub_id in publication_dict:
        return lowered_pub_id

    return next((key for key, value in publication_dict.items()
                 if do_strings_fuzzy_match(title, value["title"])),
                None)
def find_duplicate_citations(tokenized_citations):
    """Find citations that are duplicates of each other in tokenized_citations.

    Citations can be duplicates in 3 ways. Same PMID, same DOI (ignoring case), or similar enough
    titles (as decided by fuzzy_matches_to_list). The function goes through each citation and groups
    it with the others on these criteria. Then the groups are merged to create unique sets.
    For instance if citation 1 matches the PMID in citation 2, and citation 2 matches the DOI in
    citation 3, but citation 1 and 3 don't match, a duplicate set containing all 3 is created.
    This holds for chains of any length.

    Args:
        tokenized_citations (list): list of dictionaries where each dictionary is a citation. Matches the tokenized_reference.json schema.
            Each one must have the keys "PMID", "DOI", and "title".

    Returns:
        unique_duplicate_sets (list): list of lists where each element is a list of indexes in tokenized_citations that match each other.
            Each list of indexes is sorted in ascending order, no index is in more than 1 list, and the outer list is sorted as well.
    """
    titles_list = [citation["title"] for citation in tokenized_citations if citation["title"]]

    pmids = {}
    dois = {}
    titles = {}
    for count, citation in enumerate(tokenized_citations):
        if citation["PMID"]:
            pmids.setdefault(citation["PMID"], []).append(count)

        if citation["DOI"]:
            dois.setdefault(citation["DOI"].lower(), []).append(count)

        if citation["title"]:
            fuzzy_matches = fuzzy_matches_to_list(citation["title"], titles_list)
            ## A title always matches itself, so more than 1 match means another citation has a similar title.
            ## Group under the first similar title if that one has already been seen.
            if len(fuzzy_matches) > 1 and fuzzy_matches[0][1] in titles:
                titles[fuzzy_matches[0][1]].append(count)
            else:
                titles[citation["title"]] = [count]

    ## Matches need to be merged transitively to find true sets of matches.
    ## For example if 1 and 2 are found to match on title and 2 and 3 are found to match
    ## on PMID then we need 1 set of (1,2,3) instead of 2 sets of (1,2) (2,3).
    ## This is done by linking every index of a group to the same root (union-find).
    parents = {}

    def _find_root(index):
        """Follow parent links up to the root, shortening the path on the way."""
        while parents[index] != index:
            parents[index] = parents[parents[index]]
            index = parents[index]
        return index

    for groups in (pmids, dois, titles):
        for indexes in groups.values():
            ## A group of 1 is not a duplicate of anything.
            if len(indexes) < 2:
                continue
            for index in indexes:
                parents.setdefault(index, index)
            root = _find_root(indexes[0])
            for index in indexes[1:]:
                parents[_find_root(index)] = root

    duplicate_sets = {}
    for index in parents:
        duplicate_sets.setdefault(_find_root(index), []).append(index)

    unique_duplicate_sets = sorted(sorted(duplicate_set) for duplicate_set in duplicate_sets.values())

    return unique_duplicate_sets
def are_citations_in_pub_dict(tokenized_citations, pub_dict):
    """Determine which citations in tokenized_citations are in pub_dict.

    For each citation in tokenized_citations see if it is in pub_dict. Will be True for a citation
    if its PMID is the "pubmed_id" of a publication, its DOI (passed through normalize_DOI and
    lower cased) is the DOI of a publication, or its title matches a publication's title
    according to is_fuzzy_match_to_list. The checks are made in that order and stop at the first hit.

    Args:
        tokenized_citations (list): list of dictionaries where each dictionary is a citation. Matches the tokenized_reference.json schema.
        pub_dict (dict): schema matches the publication.json schema.

    Returns:
        (list): list of bools, True if the citation at that index is in pub_dict, False otherwise.
    """
    known_titles = [attributes["title"] for attributes in pub_dict.values() if attributes["title"]]

    known_dois = []
    for attributes in pub_dict.values():
        normalized = normalize_DOI(attributes["doi"])
        if normalized:
            known_dois.append(normalized.lower())

    known_pmids = [attributes["pubmed_id"] for attributes in pub_dict.values() if attributes["pubmed_id"]]

    in_pub_dict = []
    for citation in tokenized_citations:
        if citation["PMID"] in known_pmids:
            in_pub_dict.append(True)
            continue

        citation_doi = normalize_DOI(citation["DOI"])
        ## Stays None (or empty) when there is no DOI, which is never in known_dois.
        doi_to_find = citation_doi and citation_doi.lower()
        if doi_to_find in known_dois:
            in_pub_dict.append(True)
            continue

        in_pub_dict.append(bool(is_fuzzy_match_to_list(citation["title"], known_titles)))

    return in_pub_dict
def normalize_DOI(doi_string):
    """Reduce a DOI that may be written as a doi.org URL to the bare DOI.

    doi_string is tested against the doi.org URL pattern with regex_match_return. If that
    gives back a non-empty result its first element, the part captured after "doi.org/",
    is returned. Otherwise doi_string is returned unchanged.

    Args:
        doi_string (str|None): a DOI, possibly with a leading http(s)://doi.org/.

    Returns:
        (str|None): the normalized DOI, or None if doi_string is None or empty.
    """
    if not doi_string:
        return None

    doi_match = regex_match_return(r'https?://doi.org/(.*)', doi_string)
    return doi_match[0] if doi_match else doi_string
def _update(original_dict, upgrade_dict):
    """Update a dictionary in a nested fashion.

    Goes through the keys of upgrade_dict and copies a value over to original_dict only if
    the key is not in original_dict yet, or if original_dict has None for it and upgrade_dict
    does not. When both dictionaries have a dictionary for a key, this is called recursively
    on those nested dictionaries. original_dict is modified in place.

    Args:
        original_dict (dict): the dictionary to update.
        upgrade_dict (dict): the dictionary to update values from.

    Returns:
        original_dict (dict): the updated original_dict
    """
    for key, upgrade_value in upgrade_dict.items():
        if key not in original_dict:
            original_dict[key] = upgrade_value
            continue

        current_value = original_dict[key]
        both_are_mappings = isinstance(upgrade_value, collections.abc.Mapping) and \
                            isinstance(current_value, collections.abc.Mapping)
        if both_are_mappings:
            original_dict[key] = _update(current_value, upgrade_value)
        elif current_value is None and upgrade_value is not None:
            original_dict[key] = upgrade_value

    return original_dict


def _merge_pub_dicts(prev_dict, new_dict):
    """Merge information from 2 unified pub_dicts.

    Missing and None values of prev_dict are filled in from new_dict with _update, then
    the "authors" and "references" lists are combined with match_authors_in_prev_pub and
    _match_references_in_prev_pub, and the grants of new_dict that prev_dict doesn't have
    yet are added to its "grants". prev_dict is modified in place.

    Args:
        prev_dict (dict): the dictionary whose values will be updated.
        new_dict (dict): the dictionary whose values will be used to update prev_dict.

    Returns:
        prev_dict (dict): prev_dict with updated values
    """
    _update(prev_dict, new_dict)

    combined_authors = match_authors_in_prev_pub(prev_dict["authors"], new_dict["authors"])
    prev_dict["authors"] = combined_authors

    combined_references = _match_references_in_prev_pub(prev_dict["references"], new_dict["references"])
    prev_dict["references"] = combined_references

    ## Collect first and add after, so every new grant is only compared with the grants prev_dict started with.
    grants_to_add = []
    for grant in new_dict["grants"]:
        if grant not in prev_dict["grants"]:
            grants_to_add.append(grant)
    prev_dict["grants"] += grants_to_add

    return prev_dict
def find_common_subphrases(str1, str2, min_len=2):
    """Find all common subphrases between str1 and str2 that are at least min_len long.

    Modified from https://stackoverflow.com/a/63337541/19957088.

    Find all common subphrases between the 2 strings, but filter out common subphrases
    between subphrases. For example, if "sand" is common between the 2 strings the
    function will not return "and" unless there is another instance of "and" between the 2 strings.
    Candidates are taken from the shorter string, longest first, and a candidate that starts
    inside an already found subphrase is dropped, so the returned subphrases don't overlap.

    A candidate is only considered if it touches a phrase boundary, meaning it starts at the
    start of the shorter string, ends at its end, starts with a space, or ends with a space.

    Spaces are expected to be meaningful. It is recommended to remove punctuation from the strings.

    Args:
        str1 (str): one of the 2 strings to look for common substrings in.
        str2 (str): one of the 2 strings to look for common substrings in.
        min_len (int): if the length of a substring is less than this, then ignore it. Values below 1 are treated as 1.

    Returns:
        cs_array (list): a list of the common substrings, longest first.
    """
    ## Always take the candidates from the shorter string.
    if len(str1) > len(str2):
        str1, str2 = str2, str1
    len1 = len(str1)

    ## An empty candidate has no first or last character to check and would match everywhere.
    shortest_candidate = max(min_len, 1)

    cs_array = []
    cs_array_index = []
    for i in range(len1, shortest_candidate - 1, -1):
        for k in range(len1 - i + 1):
            end_index = k + i
            sub_string = str1[k:end_index]

            touches_boundary = (k == 0 or
                                end_index >= len1 or
                                sub_string[0] == " " or
                                sub_string[-1] == " ")
            if not touches_boundary or sub_string not in str2:
                continue

            already_covered = any(
                ## Have to make sure the sub_string is in the same index range as what it matched.
                ## Otherwise, "and" won't get matched between 'sand asdf and' and 'sand and'.
                (sub_string in found and start <= k and end_index <= stop) or
                ## We don't want to return overlapping substrings, without the below check 'sand asdf and'
                ## and 'sand and' will return 'sand a' and 'and' instead of 'sand a' and 'nd'.
                ## This is undesirable for us because we plan to remove the found substrings
                ## after this function, and overlapping substrings make that harder.
                (start <= k < stop)
                for found, (start, stop) in zip(cs_array, cs_array_index))
            if not already_covered:
                cs_array.append(sub_string)
                cs_array_index.append([k, end_index])

    return cs_array
## Good test phrases to understand how the function works. # find_common_subphrases('sandc cand','sandb asdf and', 2) # find_common_subphrases('sand and','sand asdf and', 2) # find_common_subphrases('sand and','sand qsdf and', 2)