# Source code for md_harmonize.MetaCyc_parser

#!/usr/bin/python3

"""

md_harmonize.MetaCyc_parser
~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module provides functions to parse MetaCyc text data.

Note: The atom mappings of all MetaCyc reactions are stored in a single text file.

"""

import collections
import copy

from . import tools
from . import reaction


def reaction_side_parser(reaction_side: str) -> dict:
    """
    Parse the FROM-SIDE or TO-SIDE description of a reaction with atom mappings.

    eg: FROM-SIDE - (CPD-9147 0 8) (OXYGEN-MOLECULE 9 10)

    Every top-level parenthesised group describes one compound together with the start and end atom index
    it occupies in the atom mappings. A group may wrap the compound name in a second pair of parentheses
    with one extra token, eg: ((WATER 1) 0 0); the extra token is ignored.
    The order of the atoms is the order in the compound molfile.

    :param reaction_side: the text description of reaction side.
    :return: the dictionary (a defaultdict of lists) mapping each compound name to the list of
        (start, end) atom index pairs it occupies in the atom mappings.
    :raises ValueError: if a parenthesis is never closed, or a group does not have the expected number of tokens.
    """
    compounds = collections.defaultdict(list)
    length = len(reaction_side)
    i = 0
    while i < length:
        if reaction_side[i] != "(":
            i += 1
            continue
        # Scan forward to the parenthesis that closes this top-level group.
        start_point = i + 1
        depth = 1
        i = start_point
        while depth > 0:
            if i >= length:
                raise ValueError(f"Unbalanced parentheses in reaction side: {reaction_side!r}")
            char = reaction_side[i]
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
            i += 1
        # i now points one past the closing parenthesis, so the group content ends at i - 1.
        inner_substring = reaction_side[start_point: i - 1]
        if "(" in inner_substring:
            # Nested form "(NAME n) start end": drop the leading "(" of the name and ignore n.
            compound, _, s, e = inner_substring.split()
            compounds[compound[1:]].append((int(s), int(e)))
        else:
            compound, s, e = inner_substring.split()
            compounds[compound].append((int(s), int(e)))
    return compounds


def _index_atoms(side: dict) -> dict:
    """
    Map every atom index used on one reaction side to (compound name, atom index within that compound).

    :param side: the dictionary of compounds with their list of (start, end) atom index pairs (end inclusive).
    :return: the dictionary from side-wide atom index to (compound name, index relative to the start index).
    """
    return {
        index: (cpd_name, index - start_index)
        for cpd_name, ranges in side.items()
        for start_index, end_index in ranges
        for index in range(start_index, end_index + 1)
    }


def generate_one_to_one_mappings(from_side: dict, to_side: dict, indices: str) -> list:
    """
    To generate the one to one atom mappings between the two sides of a metabolic reaction.

    The i-th number in indices is the from_side atom index that the to_side atom i is mapped to.

    :param from_side: the dictionary of compounds with their corresponding start and end atom indices in the from_side.
    :param to_side: the dictionary of compounds with their corresponding start and end atom indices in the to_side.
    :param indices: the string representation of mapped atoms (whitespace separated integers).
    :return: the list of mapped atoms between the two sides; each item is a pair
        ((from compound, atom index in compound), (to compound, atom index in compound)).
    :raises KeyError: if an index is not covered by the atom ranges of the corresponding side.
    """
    # split() without argument tolerates repeated spaces, tabs and trailing whitespace.
    mappings = [int(num) for num in indices.split()]
    from_index_dict = _index_atoms(from_side)
    to_index_dict = _index_atoms(to_side)
    return [
        (from_index_dict[from_index], to_index_dict[to_index])
        for to_index, from_index in enumerate(mappings)
    ]


def atom_mappings_parser(atom_mapping_text: list) -> dict:
    """
    This is to parse the MetaCyc reactions with atom mappings.

    eg:

    REACTION - RXN-11981

    NTH-ATOM-MAPPING - 1

    MAPPING-TYPE - NO-HYDROGEN-ENCODING

    FROM-SIDE - (CPD-12950 0 23) (WATER 24 24)

    TO-SIDE - (CPD-12949 0 24)

    INDICES - 0 1 2 3 5 4 7 6 9 10 11 13 12 14 15 16 17 8 18 19 21 20 22 24 23

    note: the INDICES are atom mappings between two sides of the reaction.
    TO-SIDE[i] is mapped to FROM-SIDE[idx] for i, idx in enumerate(INDICES).
    Pay attention to the direction!

    Lines starting with "#" and blank lines are ignored; a line starting with "//" ends the current record.
    Records are keyed by their REACTION value, so a later record with the same REACTION replaces an earlier one.

    :param atom_mapping_text: the text descriptions of reactions with atom mappings, one line per item.
    :return: the dictionary of reactions with atom mappings. Each value holds the raw "KEY - value" fields,
        the parsed FROM-SIDE and TO-SIDE, and the derived "ONE_TO_ONE_MAPPINGS".
    :raises KeyError: if a non-empty record lacks REACTION, FROM-SIDE, TO-SIDE or INDICES.
    """
    reaction_dicts = {}
    current_reaction = {}
    for line in atom_mapping_text:
        if not line.strip() or line.startswith("#"):
            continue
        if line.startswith("//"):
            # A terminator with nothing before it (eg. a doubled "//") carries no record to store.
            if current_reaction:
                reaction_name = current_reaction["REACTION"]
                current_reaction["ONE_TO_ONE_MAPPINGS"] = generate_one_to_one_mappings(
                    current_reaction["FROM-SIDE"], current_reaction["TO-SIDE"], current_reaction["INDICES"])
                # A fresh dict is started below, so the record can be stored without copying.
                reaction_dicts[reaction_name] = current_reaction
            current_reaction = {}
            continue
        # Only the first " - " separates key and value; the value may itself contain " - ".
        key, _, value = line.partition(" - ")
        if key in ("FROM-SIDE", "TO-SIDE"):
            current_reaction[key] = reaction_side_parser(value)
        else:
            current_reaction[key] = value
    return reaction_dicts


def reaction_parser(reaction_text: list) -> dict:
    """
    This is used to parse MetaCyc reactions.

    eg:

    UNIQUE-ID - RXN-13583

    TYPES - Redox-Half-Reactions

    ATOM-MAPPINGS - (:NO-HYDROGEN-ENCODING (1 0 2) (((WATER 0 0) (HYDROXYLAMINE 1 2)) ((NITRITE 0 2))))

    CREDITS - SRI

    CREDITS - caspi

    IN-PATHWAY - HAONITRO-RXN

    LEFT - NITRITE

    ^COMPARTMENT - CCO-IN

    LEFT - PROTON

    ^COEFFICIENT - 5

    ^COMPARTMENT - CCO-IN

    LEFT - E-

    ^COEFFICIENT - 4

    ORPHAN? - :NO

    PHYSIOLOGICALLY-RELEVANT? - T

    REACTION-BALANCE-STATUS - :BALANCED

    REACTION-DIRECTION - LEFT-TO-RIGHT

    RIGHT - HYDROXYLAMINE

    ^COMPARTMENT - CCO-IN

    RIGHT - WATER

    ^COMPARTMENT - CCO-IN

    STD-REDUCTION-POTENTIAL - 0.1

    //

    Every "KEY - value" line appends value to the list stored under KEY. The "^COEFFICIENT" and
    "^COMPARTMENT" lists are padded with " " so that their i-th item belongs to the i-th LEFT/RIGHT
    compound in order of appearance; " " marks a compound without that annotation.
    Lines starting with "#" and blank lines are ignored; a line starting with "//" ends the current record.

    :param reaction_text: the text descriptions of MetaCyc reactions, one line per item.
    :return: the dict of parsed MetaCyc reactions, keyed by the first UNIQUE-ID of each record.
        Each value is a defaultdict of lists.
    :raises IndexError: if a non-empty record has no UNIQUE-ID.
    """
    annotation_keys = ("^COEFFICIENT", "^COMPARTMENT")
    reaction_dicts = {}
    current_reaction = collections.defaultdict(list)
    compound_count = 0
    for line in reaction_text:
        if not line.strip() or line.startswith("#"):
            continue
        if line.startswith("//"):
            # A terminator with nothing before it carries no record to store.
            if current_reaction:
                # Trailing compounds without annotation still need their " " placeholder.
                for key in annotation_keys:
                    annotations = current_reaction[key]
                    annotations.extend([" "] * (compound_count - len(annotations)))
                # A fresh defaultdict is started below, so the record can be stored without copying.
                reaction_dicts[current_reaction["UNIQUE-ID"][0]] = current_reaction
            current_reaction = collections.defaultdict(list)
            compound_count = 0
            continue
        # Only the first " - " separates key and value; the value may itself contain " - ".
        key, _, value = line.partition(" - ")
        if key in ("LEFT", "RIGHT"):
            compound_count += 1
        elif key in annotation_keys:
            # The annotation belongs to the most recent compound: fill the slots of the
            # earlier compounds that had no annotation of this kind.
            annotations = current_reaction[key]
            annotations.extend([" "] * (compound_count - 1 - len(annotations)))
        current_reaction[key].append(value)
    return reaction_dicts


def create_reactions(reaction_file: str, atom_mapping_file: str) -> list:
    """
    To construct MetaCyc reaction entities.

    The reaction file is read with cp1252 encoding and parsed by :func:`reaction_parser`; the atom mapping
    file is parsed by :func:`atom_mappings_parser`. For every parsed reaction this collects:

    - the coefficient of each compound ("1" when the reaction file gives none). Coefficients are keyed by
      compound name, LEFT compounds first, so a compound listed on both sides keeps its RIGHT coefficient.
    - the EC numbers, grouped by how many dot-separated fields they have. The leading "EC-" (or the
      surrounding "|EC-" ... "|") is stripped.
    - the one to one atom mappings when the atom mapping file has an entry for the reaction, else [].

    :param reaction_file: the path to the reaction file.
    :param atom_mapping_file: the path to the atom mapping file.
    :return: the list of constructed :class:`~md_harmonize.reaction.Reaction` entities.
    """
    reaction_dict = reaction_parser(tools.open_text(reaction_file, encoding='cp1252').split("\n"))
    atom_mappings = atom_mappings_parser(tools.open_text(atom_mapping_file).split("\n"))
    reactions = []
    for reaction_name, this_reaction in reaction_dict.items():
        left = this_reaction.get("LEFT", [])
        right = this_reaction.get("RIGHT", [])

        # "^COEFFICIENT" holds one entry per compound in order of appearance, " " meaning "not given".
        coefficient_list = collections.deque(this_reaction["^COEFFICIENT"])
        coefficients = {}
        for cpd_name in left + right:
            cpd_coefficient = coefficient_list.popleft()
            coefficients[cpd_name] = cpd_coefficient if cpd_coefficient != " " else "1"

        ecs = collections.defaultdict(list)
        for ec in this_reaction.get("EC-NUMBER", []):
            # "|EC-1.2.3.4|" -> "1.2.3.4"; "EC-1.2.3.4" -> "1.2.3.4"
            ec_number = ec[4:-1] if "|" in ec else ec[3:]
            ecs[len(ec_number.split("."))].append(ec_number)

        this_mappings = atom_mappings.get(reaction_name, {}).get("ONE_TO_ONE_MAPPINGS", [])
        reactions.append(reaction.Reaction(reaction_name, left, right, ecs, this_mappings, coefficients))
    return reactions
# Source code for md_harmonize.MetaCyc_parser
#
# md_harmonize
#
# Navigation
#
# Related Topics