Source code for academic_tracker.fileio

"""
Fileio
~~~~~~

This module contains the functions that read and write files.
"""


import re
import os
import sys
import json

import docx
import pandas

from . import helper_functions




[docs]
def load_json(filepath):
    """Load a json file, exiting the program if the file does not exist.

    The file is always decoded as UTF-8 (the encoding JSON is specified in)
    rather than the platform default, so files containing non-ASCII text load
    the same way on every operating system.

    Args:
        filepath (str): filepath to the json file

    Returns:
        internal_data (dict): json read from file in a dictionary

    Raises:
        Exception: Any problem opening, decoding, or parsing the file is propagated unchanged.
        SystemExit: If filepath does not exist, after a message is sent to helper_functions.vprint.
    """
    if not os.path.exists(filepath):
        helper_functions.vprint("No such file: " + filepath)
        sys.exit()

    with open(filepath, "r", encoding="utf-8") as f:
        internal_data = json.load(f)

    return internal_data






[docs]
def read_previous_publications(filepath):
    """Read in the previous publication json file.

    If filepath is given it is loaded with load_json, unless it is the word "ignore"
    (case-insensitive), in which case no previous publications are used.
    If filepath is not given, the current working directory is searched for directories
    named "tracker-<10 digit timestamp>". They are tried from the largest timestamp to the
    smallest, and the first one containing a publications.json file is loaded.
    If no previous publications are found then an empty dict is returned for prev_pubs.

    No validation is done on the structure of the loaded json.

    Args:
        filepath (str or None): path to the publications JSON to read in.

    Returns:
        has_previous_pubs (bool): True means that a previous publications file was found
        prev_pubs (dict): dict where keys are publication ids and values are a dict of publication attributes
    """

    if filepath:
        if filepath.lower() == "ignore":
            return False, {}

        return True, load_json(filepath)

    ## Find all directories matching the tracker directory structure. The real directory name is
    ## kept alongside the integer timestamp so the path is built from what is actually on disk
    ## (rebuilding it from the int would drop any leading zeros in the timestamp).
    tracker_pattern = re.compile(r"tracker-(\d{10})")
    tracker_dirs = []
    for folder in os.listdir():
        match = tracker_pattern.fullmatch(folder)
        if match:
            tracker_dirs.append((int(match.group(1)), folder))

    ## Newest timestamp first; fall back to older directories that have a publications.json.
    for _, folder in sorted(tracker_dirs, reverse=True):
        prev_publication_filepath = os.path.join(os.getcwd(), folder, "publications.json")
        if os.path.exists(prev_publication_filepath):
            return True, load_json(prev_publication_filepath)

    return False, {}

    
    
    
    


[docs]
def save_emails_to_file(email_messages, save_dir_name):
    """Write email_messages as json to "emails.json" inside save_dir_name.

    save_dir_name is joined onto the current working directory. The json is
    written with an indent of 2, keys in their existing order, and a trailing newline.

    Args:
        email_messages (dict): the emails to save, keys are author names and values are the email for that author
        save_dir_name (str): directory name to append to the current working directory to save the emails.json file in
    """

    destination = os.path.join(os.getcwd(), save_dir_name, "emails.json")

    with open(destination, 'w') as email_file:
        email_file.write(json.dumps(email_messages, indent=2, sort_keys=False))
        email_file.write("\n")

        
            



[docs]
def save_publications_to_file(save_dir_name, publication_dict, prev_pubs):
    """Merge publication_dict into prev_pubs and write the result to "publications.json".

    The file is written inside save_dir_name, which is joined onto the current working directory.
    Note that prev_pubs is updated in place, so after the call it also contains every entry of
    publication_dict (entries with the same key are overwritten by publication_dict).

    Args:
        save_dir_name (str): directory name to append to the current working directory to save the publications.json file in
        publication_dict (dict): dictionary with publication ids as the keys to the dict
        prev_pubs (dict): previously found publications, must support update() with publication_dict
    """

    destination = os.path.join(os.getcwd(), save_dir_name, "publications.json")

    ## In place merge, the caller's prev_pubs object is modified.
    prev_pubs.update(publication_dict)

    with open(destination, 'w') as publications_file:
        publications_file.write(json.dumps(prev_pubs, indent=2, sort_keys=True))
        publications_file.write("\n")


        
        


[docs]
def read_text_from_docx(doc_path):
    """Read the text of the docx file at doc_path into a single string.

    For every paragraph of the document the text of all "w:t" elements under it
    is concatenated, and the paragraphs are then joined with newline characters.

    Args:
        doc_path (str): path to docx file.

    Returns:
        (str): A string of the contents of the docx file. Each paragraph concatenated with a newline character.

    Raises:
        Exception: Any problem opening or reading the file is propagated unchanged.
        SystemExit: If doc_path does not exist, after a message is sent to helper_functions.vprint.
    """

    if not os.path.exists(doc_path):
        helper_functions.vprint("No such file: " + doc_path)
        sys.exit()

    ## https://stackoverflow.com/questions/25228106/how-to-extract-text-from-an-existing-docx-file-using-python-docx
    document = docx.Document(doc_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        text_nodes = paragraph._element.xpath(".//w:t")
        paragraph_texts.append(u"".join([node.text for node in text_nodes]))

    return u"\n".join(paragraph_texts)





[docs]
def read_text_from_txt(doc_path):
    """Read the whole txt or csv file at doc_path into a single string.

    The file is decoded as utf-8.

    Args:
        doc_path (str): path to txt or csv file.

    Returns:
        (str): A string of the full contents of the txt or csv file, line endings included.

    Raises:
        Exception: Any problem opening or decoding the file is propagated unchanged.
        SystemExit: If doc_path does not exist, after a message is sent to helper_functions.vprint.
    """

    if not os.path.exists(doc_path):
        helper_functions.vprint("No such file: " + doc_path)
        sys.exit()

    with open(doc_path, encoding="utf-8") as document:
        contents = document.read()

    return contents

    


[docs]
def read_csv(doc_path):
    """Load the csv file at doc_path into a pandas dataframe.

    The file is read with pandas.read_csv using its default options.

    Args:
        doc_path (str): path to the csv file to read in.

    Returns:
        df (DataFrame): Pandas dataframe of the csv contents.

    Raises:
        Exception: Any problem reading or parsing the file is propagated unchanged.
        SystemExit: If doc_path does not exist, after a message is sent to helper_functions.vprint.
    """

    if not os.path.exists(doc_path):
        helper_functions.vprint("No such file: " + doc_path)
        sys.exit()

    return pandas.read_csv(doc_path)


         
    


[docs]
def save_string_to_file(save_dir_name, file_name, text_to_save):
    """Write text_to_save to file_name inside save_dir_name, encoded as utf-8.

    save_dir_name is joined onto the current working directory. The file is
    opened in binary mode, so the text is written exactly as given with no
    newline translation.

    Args:
        save_dir_name (str): directory in the current working directory to save the string to.
        file_name (str): string to name the file.
        text_to_save (str): the string to put in the file contents.
    """

    destination = os.path.join(os.getcwd(), save_dir_name, file_name)

    with open(destination, 'wb') as text_file:
        encoded_text = text_to_save.encode("utf-8")
        text_file.write(encoded_text)

    
    


[docs]
def save_json_to_file(save_dir_name, file_name, json_dict, sort_keys=True):
    """Write json_dict as json to file_name inside save_dir_name.

    save_dir_name is joined onto the current working directory. The json is
    written with an indent of 2 and a trailing newline.

    Args:
        save_dir_name (str): directory name to append to the current working directory to save the json_dict in.
        file_name (str): the name to give the file, should have '.json' as the extension.
        json_dict (dict or list): data to save to file.
        sort_keys (bool): passed to json.dumps, if True sort the dictionary keys before saving.
    """

    destination = os.path.join(os.getcwd(), save_dir_name, file_name)

    with open(destination, 'w') as json_file:
        json_file.write(json.dumps(json_dict, indent=2, sort_keys=sort_keys))
        json_file.write("\n")