Source code for md_harmonize.MetaCyc_parser

#!/usr/bin/python3

"""

md_harmonize.MetaCyc_parser
~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module provides functions to parse MetaCyc text data.

Note: All MetaCyc reactions atom_mappings are stored in a single text file.

"""

import collections
import copy

from . import tools
from . import reaction


[docs]def reaction_side_parser(reaction_side: str) -> dict: """ This is to parse FROM_SIDE or TO_SIDE in the reaction. eg: FROM-SIDE - (CPD-9147 0 8) (OXYGEN-MOLECULE 9 10) Information includes compound name and the start and end atom index in this compound used for atom mappings. The order of the atoms are the orders in the compound molfile. :param reaction_side: the text description of reaction side. :return: the dictionary of compounds and the corresponding start and end atom index in the atom mappings. """ i = 0 compounds = collections.defaultdict(list) while i < len(reaction_side): if reaction_side[i] == "(": i += 1 count = 1 start_point = i while count > 0: if reaction_side[i] == "(": count += 1 if reaction_side[i] == ")": count -= 1 i += 1 i -= 1 inner_substring = reaction_side[start_point: i] if "(" in inner_substring: compound, n, s, e = inner_substring.split() compounds[compound[1:]].append((int(s), int(e))) else: compound, s, e = inner_substring.split() compounds[compound].append((int(s), int(e))) i += 1 return compounds
[docs]def generate_one_to_one_mappings(from_side: dict, to_side: dict, indices: str) -> list: """ To generate the one to one atom mappings between two the sides of a metabolic reaction. :param from_side: the dictionary of compounds with their corresponding start and end atom indices in the from_side. :param to_side: the dictionary of compounds with their corresponding start and end atom indices in the to_side. :param indices: the string representation of mapped atoms. :return: the list of mapped atoms between the two sides (from_index, to_index). """ from_index_dict = {} mappings = [int(num) for num in indices.split(" ") if num != ""] for cpd_name in from_side: for (start_index, end_index) in from_side[cpd_name]: for i in range(start_index, end_index+1): atom_index = i - start_index from_index_dict[i] = (cpd_name, atom_index) to_index_dict = {} for cpd_name in to_side: for (start_index, end_index) in to_side[cpd_name]: for i in range(start_index, end_index+1): atom_index = i - start_index to_index_dict[i] = (cpd_name, atom_index) one_to_one_mappings = [] for to_index, from_index in enumerate(mappings): one_to_one_mappings.append((from_index_dict[from_index], to_index_dict[to_index])) return one_to_one_mappings
[docs]def atom_mappings_parser(atom_mapping_text: list) -> dict: """ This is to parse the MetaCyc reaction with atom mappings. eg: REACTION - RXN-11981 NTH-ATOM-MAPPING - 1 MAPPING-TYPE - NO-HYDROGEN-ENCODING FROM-SIDE - (CPD-12950 0 23) (WATER 24 24) TO-SIDE - (CPD-12949 0 24) INDICES - 0 1 2 3 5 4 7 6 9 10 11 13 12 14 15 16 17 8 18 19 21 20 22 24 23 note: the INDICES are atom mappings between two sides of the reaction. TO-SIDE[i] is mapped to FROM-SIDE[idx] for i, idx in enumerate(INDICES). Pay attention to the direction! :param atom_mapping_text: the text descriptions of reactions with atom mappings. :return: the dictionary of reactions with atom mappings. """ reaction_dicts = {} current_reaction = {} for line in atom_mapping_text: if line.startswith("#"): continue elif line.startswith("//"): reaction_dicts[current_reaction['REACTION']] = copy.deepcopy(current_reaction) reaction_dicts[current_reaction['REACTION']]["ONE_TO_ONE_MAPPINGS"] = \ generate_one_to_one_mappings(current_reaction["FROM-SIDE"], current_reaction["TO-SIDE"], current_reaction["INDICES"]) current_reaction = {} else: key = line.split(" - ")[0] value = " - ".join(line.split(" - ")[1:]) if key == "FROM-SIDE" or key == "TO-SIDE": current_reaction[key] = reaction_side_parser(value) else: current_reaction[key] = value return reaction_dicts
[docs]def reaction_parser(reaction_text: list) -> dict: """ This is used to parse MetaCyc reaction. eg: UNIQUE-ID - RXN-13583 TYPES - Redox-Half-Reactions ATOM-MAPPINGS - (:NO-HYDROGEN-ENCODING (1 0 2) (((WATER 0 0) (HYDROXYLAMINE 1 2)) ((NITRITE 0 2)))) CREDITS - SRI CREDITS - caspi IN-PATHWAY - HAONITRO-RXN LEFT - NITRITE ^COMPARTMENT - CCO-IN LEFT - PROTON ^COEFFICIENT - 5 ^COMPARTMENT - CCO-IN LEFT - E- ^COEFFICIENT - 4 ORPHAN? - :NO PHYSIOLOGICALLY-RELEVANT? - T REACTION-BALANCE-STATUS - :BALANCED REACTION-DIRECTION - LEFT-TO-RIGHT RIGHT - HYDROXYLAMINE ^COMPARTMENT - CCO-IN RIGHT - WATER ^COMPARTMENT - CCO-IN STD-REDUCTION-POTENTIAL - 0.1 // :param reaction_text: the text descriptions of MetaCyc reactions. :return: the dict of parsed MetaCyc reactions. """ reaction_dicts = {} current_reaction = collections.defaultdict(list) count_left, count_right, previous_key = 0, 0, "" for line in reaction_text: if line.startswith("#"): continue if line.startswith("//"): while len(current_reaction["^COEFFICIENT"]) < count_left + count_right: current_reaction["^COEFFICIENT"].append(" ") while len(current_reaction["^COMPARTMENT"]) < count_left + count_right: current_reaction["^COMPARTMENT"].append(" ") reaction_dicts[current_reaction['UNIQUE-ID'][0]] = copy.deepcopy(current_reaction) # print(reaction_dicts[current_reaction['UNIQUE-ID'][0]]) current_reaction = collections.defaultdict(list) count_left, count_right, previous_key = 0, 0, "" continue # if line.startswith('/'): # current_reaction[previous_key].append(line) # continue key = line.split(" - ")[0] value = " - ".join(line.split(" - ")[1:]) if key == 'LEFT': count_left += 1 if key == 'RIGHT': count_right += 1 if key == "^COEFFICIENT" or key == "^COMPARTMENT": while len(current_reaction[key]) < count_left + count_right - 1: current_reaction[key].append(" ") current_reaction[key].append(value) previous_key = key return reaction_dicts
[docs]def create_reactions(reaction_file: str, atom_mapping_file: str) -> list: """ To molfile_name MetaCyc reaction entities. :param reaction_file: the path to the reaction file. :param atom_mapping_file: the path to the atom mapping file. :return: the list of constructed :class:`~md_harmonize.reaction.Reaction` entities. """ reaction_dict = reaction_parser(tools.open_text(reaction_file, encoding='cp1252').split("\n")) atom_mappings = atom_mappings_parser(tools.open_text(atom_mapping_file).split("\n")) reactions = [] for reaction_name in reaction_dict: this_reaction = reaction_dict[reaction_name] coefficient_list = collections.deque(this_reaction["^COEFFICIENT"]) coefficients = {} if "LEFT" in this_reaction: for cpd_name in this_reaction["LEFT"]: cpd_coefficient = coefficient_list.popleft() coefficients[cpd_name] = cpd_coefficient if cpd_coefficient != " " else "1" if "RIGHT" in this_reaction: for cpd_name in this_reaction["RIGHT"]: cpd_coefficient = coefficient_list.popleft() coefficients[cpd_name] = cpd_coefficient if cpd_coefficient != " " else "1" ecs = collections.defaultdict(list) if "EC-NUMBER" in this_reaction: for ec in this_reaction["EC-NUMBER"]: if "|" not in ec: numbers = ec[3:].split(".") ecs[len(numbers)].append(ec[3:]) else: numbers = ec[4:-1].split(".") ecs[len(numbers)].append(ec[4:-1]) this_mappings = atom_mappings[reaction_name]["ONE_TO_ONE_MAPPINGS"] if reaction_name in atom_mappings and \ "ONE_TO_ONE_MAPPINGS" in \ atom_mappings[reaction_name] else [] reactions.append(reaction.Reaction(reaction_name, this_reaction["LEFT"], this_reaction["RIGHT"], ecs, this_mappings, coefficients)) return reactions