Source code for kegg_pull.pathway_organizer

"""
Flattening A Pathways Brite Hierarchy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|Functionality| for flattening a pathways Brite hierarchy (ID: 'br:br08901') into a collection of its nodes, mapping a node ID to information about it, enabling combinations with other KEGG data.
"""
from __future__ import annotations
import json
import logging as log
import typing as t
from . import rest as r
from . import _utils as u


[docs] class HierarchyNode(t.TypedDict): """A dictionary with the following keys:""" name: str """The name of the node obtained directly from the Brite hierarchy.""" level: int """The level that the node appears in the hierarchy.""" parent: str | None """The key (not the name) of the parent node (None if top level node).""" children: list[str] | None """The keys (not the names) of the node's children (None if leaf node).""" entry_id: str | None """The entry ID of the node (None if the node does not correspond to a KEGG entry)."""
HierarchyNodes = dict[str, HierarchyNode] _RawHierarchyNode = t.TypedDict('_RawHierarchyNode', {'name': str, 'children': list[dict] | None})
[docs] class PathwayOrganizer(u.NonInstantiable): """ Contains methods for managing a mapping of node keys to node information, these nodes coming from a pathways Brite hierarchy. An instantiated ``PathwayOrganizer`` object must be returned from either ``PathwayOrganizer.load_from_kegg`` or ``PathwayOrganizer.load_from_json``. The ``__init__`` is not meant to be called directly. The ``__str__`` method returns a JSON string of ``hierarchy_nodes``. :ivar dict[str, HierarchyNode] hierarchy_nodes: The mapping of node keys to node information managed by the PathwayOrganizer. """ def __init__(self) -> None: super(PathwayOrganizer, self).__init__() self.hierarchy_nodes: HierarchyNodes | None = None self._filter_nodes: set[str] | None = None
[docs] @staticmethod def load_from_kegg( top_level_nodes: set[str] | None = None, filter_nodes: set[str] | None = None, kegg_rest: r.KEGGrest | None = None) -> PathwayOrganizer: """ Pulls the Brite hierarchy from the KEGG REST API and converts it to the ``hierarchy_nodes`` mapping. :param top_level_nodes: Node names in the highest level of the hierarchy to select from. If None, all top level nodes are traversed to create the ``hierarchy_nodes``. :param filter_nodes: Names (not keys) of nodes to exclude from the ``hierarchy_nodes`` mapping. Neither these nodes nor any of their children will be included. :param kegg_rest: Optional KEGGrest object for obtaining the Brite hierarchy. A new KEGGrest object is created by default. :returns: The resulting PathwayOrganizer object. """ pathway_org = PathwayOrganizer() pathway_org.hierarchy_nodes = HierarchyNodes() pathway_org._filter_nodes = filter_nodes hierarchy = PathwayOrganizer._get_hierarchy(kegg_rest=kegg_rest) valid_top_level_nodes = sorted(top_level_node['name'] for top_level_node in hierarchy) if top_level_nodes is not None: for top_level_node in list(top_level_nodes): if top_level_node not in valid_top_level_nodes: log.warning( f'Top level node name "{top_level_node}" is not recognized and will be ignored. Valid values are: ' f'"{", ".join(valid_top_level_nodes)}"') top_level_nodes.remove(top_level_node) hierarchy = [top_level_node for top_level_node in hierarchy if top_level_node['name'] in top_level_nodes] pathway_org._parse_hierarchy(level=1, raw_hierarchy_nodes=hierarchy, parent_name=None) return pathway_org
@staticmethod def _get_hierarchy(kegg_rest: r.KEGGrest | None) -> list[_RawHierarchyNode]: """ Pulls the Brite hierarchy (to be converted to hierarchy_nodes) from the KEGG REST API. :return: The list of top level nodes that branch out into the rest of the hierarchy until reaching leaf nodes. """ kegg_rest = kegg_rest if kegg_rest is not None else r.KEGGrest() kegg_response = kegg_rest.get(entry_ids=['br:br08901'], entry_field='json') text_body = kegg_response.text_body.strip() brite_hierarchy: dict = json.loads(s=text_body) return brite_hierarchy['children'] def _parse_hierarchy(self, level: int, raw_hierarchy_nodes: list[_RawHierarchyNode], parent_name: str | None) -> set[str]: """ Recursively traverses the Brite hierarchy to create the hierarchy_nodes mapping. :param level: The current level of recursion representing the level of the node in the hierarchy. :param raw_hierarchy_nodes: The list of nodes in the current branch of the hierarchy being traversed. :param parent_name: The node key of the parent node of the current branch of the hierarchy. :return: The keys of the nodes added to the hierarchy_nodes property representing the children of the parent node. """ nodes_added = set[str]() for raw_hierarchy_node in raw_hierarchy_nodes: node_name = raw_hierarchy_node['name'] if self._filter_nodes is None or node_name not in self._filter_nodes: if 'children' in raw_hierarchy_node.keys(): node_children = self._parse_hierarchy( level=level+1, raw_hierarchy_nodes=raw_hierarchy_node['children'], parent_name=node_name) if self._filter_nodes is not None: expected_n_children_added = len( [child for child in raw_hierarchy_node['children'] if child['name'] not in self._filter_nodes]) else: expected_n_children_added = len(raw_hierarchy_node['children']) assert len(node_children) == expected_n_children_added, f'Not all children added for node: {node_name}' node_key = self._add_hierarchy_node( name=node_name, level=level, parent=parent_name, children=node_children, entry_id=None) else: entry_id = node_name.split(' ')[0] entry_id = f'path:map{entry_id}' node_key = self._add_hierarchy_node( name=node_name, level=level, parent=parent_name, children=None, entry_id=entry_id) nodes_added.add(node_key) return nodes_added def _add_hierarchy_node(self, name: str, level: int, parent: str, children: set[str] | None, entry_id: str | None) -> str: """ Adds a Brite hierarchy node representation to the hierarchy_nodes property. :param name: The name of the node obtained directly from the Brite hierarchy. :param level: The level that the node appears in the hierarchy. :param parent: The key of the parent node (None if top level node). :param children: The keys of the node's children (None if leaf node). :param entry_id: The entry ID of the node; string if it represents a KEGG pathway mapping, else None. :return: The key chosen for the node, equal to its entry ID if not None, else the name of the Node. """ key = entry_id if entry_id is not None else name assert key not in self.hierarchy_nodes.keys(), f'Duplicate brite hierarchy node name {key}' children = sorted(children) if children is not None else None self.hierarchy_nodes[key] = HierarchyNode(name=name, level=level, parent=parent, children=children, entry_id=entry_id) return key def __str__(self) -> str: """ Converts the hierarchy nodes to a JSON string. :return: The JSON string version of the hierarchy nodes. """ return json.dumps(self.hierarchy_nodes, indent=2) _schema = { 'type': 'object', 'minProperties': 1, 'additionalProperties': False, 'patternProperties': { '^.+$': { 'type': 'object', 'required': ['name', 'level', 'parent', 'children', 'entry_id'], 'additionalProperties': False, 'properties': { 'name': { 'type': 'string', 'minLength': 1 }, 'level': { 'type': 'integer', 'minimum': 1 }, 'parent': { 'type': ['string', 'null'], 'minLength': 1 }, 'children': { 'minItems': 1, 'type': ['array', 'null'], 'items': { 'type': 'string', 'minLength': 1 } }, 'entry_id': { 'type': ['string', 'null'], 'minLength': 1 } } } } }
[docs] @staticmethod def load_from_json(file_path: str) -> PathwayOrganizer: """ Loads the ``hierarchy_nodes`` mapping that was cached in a JSON file using ``load_from_kegg`` followed by ``save_to_json``. :param file_path: Path to the JSON file. If reading from a ZIP archive, the file path must be in the following format: /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:hierarchy-nodes.json). :returns: The resulting PathwayOrganizer object. :raises ValidationError: Raised if the JSON file does not follow the correct JSON schema. Should follow the correct schema if ``hierarchy_nodes`` was cached using ``load_from_kegg`` followed by ``save_to_json`` and without any additional alteration. """ pathway_org = PathwayOrganizer() hierarchy_nodes: HierarchyNodes = u.load_json_file( file_path=file_path, json_schema=PathwayOrganizer._schema, validation_error_message=f'Failed to load the hierarchy nodes. The pathway organizer JSON file at {file_path} is ' f'corrupted and will need to be re-created.') pathway_org.hierarchy_nodes = hierarchy_nodes return pathway_org
[docs] def save_to_json(self, file_path: str) -> None: """ Saves the ``hierarchy_nodes`` mapping to a JSON file to cache it. :param file_path: The path to the JSON file to save the ``hierarchy_nodes`` mapping. If saving in a ZIP archive, the file path must be in the following format: /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:hierarchy-nodes.json). """ json_string = str(self) u.save_output(output_target=file_path, output_content=json_string)