Source code for academic_tracker.fileio

"""
Fileio
~~~~~~

This module contains the functions that read and write files.
"""


import re
import os
import sys
import json

import docx
import pandas

from . import helper_functions



def load_json(filepath):
    """Load a JSON file, exiting the program if the file does not exist.

    Args:
        filepath (str): filepath to the json file

    Returns:
        internal_data (dict): the parsed JSON content. Whatever top-level value the
            file holds is returned as parsed; no type check is performed here.

    Raises:
        Exception: Any error raised while opening, decoding, or parsing the file is
            propagated unchanged.
        SystemExit: If filepath does not exist (after reporting it through
            helper_functions.vprint).
    """
    if not os.path.exists(filepath):
        helper_functions.vprint("No such file: " + filepath)
        sys.exit()

    ## JSON text is UTF-8, so decode it explicitly instead of relying on the
    ## platform default encoding, which differs between operating systems.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)
def read_previous_publications(filepath):
    """Read in the previous publication json file.

    If filepath is given it is loaded with load_json, unless it is the word "ignore"
    (case-insensitive), in which case no previous publications are used.
    If filepath is not given then the current working directory is searched for
    directories named "tracker-<10 digit timestamp>". Starting from the largest
    timestamp, the first one containing a publications.json file is loaded.
    If no previous publications are found then an empty dict is returned for prev_pubs.

    Args:
        filepath (str or None): path to the publications JSON to read in.

    Returns:
        has_previous_pubs (bool): True means that a previous publications file was found
        prev_pubs (dict): content of the previous publications file, or an empty dict.
            Presumably keys are publication ids and values are dicts of publication
            attributes; the content is not validated here.
    """
    if filepath:
        if filepath.lower() == "ignore":
            return False, {}
        return True, load_json(filepath)

    ## Keep the directory names exactly as found on disk. Rebuilding the name from
    ## int(timestamp) would drop leading zeros and point at a directory that does not exist.
    tracker_dirs = [folder for folder in os.listdir() if re.fullmatch(r"tracker-\d{10}", folder)]

    ## Largest (most recent) timestamp first.
    tracker_dirs.sort(key=lambda folder: int(folder[len("tracker-"):]), reverse=True)

    for tracker_dir in tracker_dirs:
        prev_publication_filepath = os.path.join(os.getcwd(), tracker_dir, "publications.json")
        if os.path.exists(prev_publication_filepath):
            return True, load_json(prev_publication_filepath)

    return False, {}
def save_emails_to_file(email_messages, save_dir_name):
    """Write email_messages as JSON to "emails.json" inside save_dir_name.

    save_dir_name is joined onto the current working directory. The JSON is written
    with an indent of 2, keys in their existing order, followed by a newline.

    Args:
        email_messages (dict): the email data to serialize. Presumably keys are author
            names and values are the emails; the structure is not checked here.
        save_dir_name (str): directory name to append to the current working directory
            to save the emails.json file in
    """
    target_path = os.path.join(os.getcwd(), save_dir_name, "emails.json")

    with open(target_path, 'w') as handle:
        handle.write(json.dumps(email_messages, indent=2, sort_keys=False))
        handle.write("\n")
def save_publications_to_file(save_dir_name, publication_dict, prev_pubs):
    """Merge publication_dict into prev_pubs and write the result to "publications.json".

    The file is written inside save_dir_name, which is joined onto the current working
    directory. The JSON is written with an indent of 2 and sorted keys, followed by a newline.

    Note that prev_pubs is updated in place, so after the call it also contains every
    entry of publication_dict.

    Args:
        save_dir_name (str): directory name to append to the current working directory
            to save the publications.json file in
        publication_dict (dict): dictionary with publication ids as the keys to the dict
        prev_pubs (dict): previously found publications; entries from publication_dict
            are added to it, overwriting entries that share a key.
    """
    target_path = os.path.join(os.getcwd(), save_dir_name, "publications.json")

    ## Entries from publication_dict win over existing prev_pubs entries with the same key.
    prev_pubs.update(publication_dict)

    with open(target_path, 'w') as handle:
        handle.write(json.dumps(prev_pubs, indent=2, sort_keys=True))
        handle.write("\n")
def read_text_from_docx(doc_path):
    """Read the text of the docx file at doc_path into a single string.

    Args:
        doc_path (str): path to docx file.

    Returns:
        (str): The text of every paragraph in the document, with paragraphs separated
            by a newline character.

    Raises:
        Exception: Any error raised while opening or reading the document is propagated.
        SystemExit: If doc_path does not exist (after reporting it through
            helper_functions.vprint).
    """
    if not os.path.exists(doc_path):
        helper_functions.vprint("No such file: " + doc_path)
        sys.exit()

    ## https://stackoverflow.com/questions/25228106/how-to-extract-text-from-an-existing-docx-file-using-python-docx
    document = docx.Document(doc_path)

    paragraph_texts = []
    for paragraph in document.paragraphs:
        ## Gather the text of every w:t element found anywhere under the paragraph element.
        text_nodes = paragraph._element.xpath(".//w:t")
        paragraph_texts.append(u"".join([node.text for node in text_nodes]))

    return u"\n".join(paragraph_texts)
def read_text_from_txt(doc_path):
    """Read the txt or csv file at doc_path into a single string.

    The file is decoded as UTF-8.

    Args:
        doc_path (str): path to txt or csv file.

    Returns:
        (str): The full contents of the file. Lines keep the line endings they were
            read with; nothing is added between them.

    Raises:
        Exception: Any error raised while opening or decoding the file is propagated.
        SystemExit: If doc_path does not exist (after reporting it through
            helper_functions.vprint).
    """
    if not os.path.exists(doc_path):
        helper_functions.vprint("No such file: " + doc_path)
        sys.exit()

    with open(doc_path, encoding="utf-8") as document:
        return document.read()
def read_csv(doc_path):
    """Load the csv file at doc_path into a pandas dataframe.

    The file is read with pandas.read_csv using its default options.

    Args:
        doc_path (str): path to the csv file to read in.

    Returns:
        df (DataFrame): Pandas dataframe of the csv contents.

    Raises:
        Exception: Any error raised by pandas while reading the file is propagated.
        SystemExit: If doc_path does not exist (after reporting it through
            helper_functions.vprint).
    """
    if not os.path.exists(doc_path):
        helper_functions.vprint("No such file: " + doc_path)
        sys.exit()

    return pandas.read_csv(doc_path)
def save_string_to_file(save_dir_name, file_name, text_to_save):
    """Write text_to_save to file_name inside save_dir_name as UTF-8.

    save_dir_name is joined onto the current working directory. The file is opened in
    binary mode, so the encoded text is written exactly as given with no newline
    translation.

    Args:
        save_dir_name (str): directory in the current working directory to save the string to.
        file_name (str): string to name the file.
        text_to_save (str): the string to put in the file contents.
    """
    target_path = os.path.join(os.getcwd(), save_dir_name, file_name)

    with open(target_path, 'wb') as handle:
        encoded_text = text_to_save.encode("utf-8")
        handle.write(encoded_text)
def save_json_to_file(save_dir_name, file_name, json_dict, sort_keys=True):
    """Write json_dict as JSON to file_name inside save_dir_name.

    save_dir_name is joined onto the current working directory. The JSON is written
    with an indent of 2, followed by a newline.

    Args:
        save_dir_name (str): directory name to append to the current working directory
            to save the json_dict in.
        file_name (str): the name to give the file, should have '.json' as the extension.
        json_dict (dict or list): data to save to file.
        sort_keys (bool): passed to json.dumps, if True sort the dictionary keys before saving.
    """
    target_path = os.path.join(os.getcwd(), save_dir_name, file_name)

    with open(target_path, 'w') as handle:
        handle.write(json.dumps(json_dict, indent=2, sort_keys=sort_keys))
        handle.write("\n")