Source code for cgsmiles.resolve

import re
import copy
import numpy as np
import networkx as nx
from .read_cgsmiles import read_cgsmiles
from .read_fragments import read_fragments
from .graph_utils import (merge_graphs,
                          sort_nodes_by_attr,
                          annotate_fragments,
                          set_atom_names_atomistic)
from .pysmiles_utils import (rebuild_h_atoms,
                             annotate_ez_isomers_cgsmiles)


[docs]
def compatible(left, right, left_data={}, right_data={}, legacy=True):
    """
    Check bonding descriptor compatibility according
    to the CGsmiles syntax conventions. With legacy,
    the BigSmiles convention can be used.

    The dicts of left_data and right_data are only
    used when compatibility-checking uses of the
    squashing ([!]) operator.

    Parameters
    ----------
    left: str
    right: str
    left_data: dict
    right_data: dict
    legacy: bool

    Returns
    -------
    bool
    """

    # Non-legacy only checks the first character for compatibility
    if not legacy:
        left = left[0]
        right = right[0]

    # By flipping the <> for one of the binders, we no longer need special
    #  case checking downstream
    left = left[0].translate(str.maketrans('<>','><')) + left[1:]

    if left != right:
        return False

    if left[0] != '!':
        return True

    # Only the '!' compatibility control left
    left_name = left_data.get('element', left_data.get('atomname'))
    right_name = right_data.get('element', right_data.get('atomname'))
    # Could there be nameless nodes? Anyway, not enough info to reject
    #  so we accept if they're involved
    return (left_name is None or
            right_name is None or
            (left_name == right_name))



[docs]
def match_bonding_descriptors(source, target, bond_attribute="bonding", legacy=True):
    """
    Given a source and a target graph, which have bonding
    descriptors stored as node attributes, find a pair of
    matching descriptors and return the respective nodes.
    The function also returns the bonding descriptors. If
    no bonding descriptor is found an instance of LookupError
    is raised.

    Parameters
    ----------
    source: networkx.Graph
    target: networkx.Graph
    bond_attribute: `collections.abc.Hashable`
        under which attribute are the bonding descriptors
        stored.
    legacy: bool
        which syntax convention to use when matching bonding
        descriptors (legacy=BigSmiles)

    Returns
    -------
    ((collections.abc.Hashable, collections.abc.Hashable), (str, str))
        the nodes as well as bonding descriptors

    Raises
    ------
    LookupError
        if no match is found
    """
    source_nodes = nx.get_node_attributes(source, bond_attribute)
    target_nodes = nx.get_node_attributes(target, bond_attribute)
    for source_node, bond_sources in source_nodes.items():
        source_node_data = source.nodes[source_node]
        for target_node, bond_targets in target_nodes.items():
            target_node_data = target.nodes[target_node]
            for bond_source in bond_sources:
                for bond_target in bond_targets:
                    if compatible(bond_source, bond_target,
                                  source_node_data,
                                  target_node_data,
                                  legacy=legacy):
                        return ((source_node, target_node),
                                (bond_source, bond_target))
    raise LookupError


def _adjust_hcount(molecule):
    """
    Given atoms in a molecule built from cgsmiles fragments, adjust
    the hcount such that we substract the total number of newly formed
    edges multiplied by their bond order. The 'hcount' attribute is
    updated in place.

    Parameters
    ----------
    molecule: networkx.Graph
    """
    for node in molecule.nodes:
        hcount = molecule.nodes[node]["_hcount"]
        new_degree = sum([molecule.edges[edge]["order"] for edge in molecule.edges(node)])
        init_degree = molecule.nodes[node]["_edge_orders"]
        molecule.nodes[node]["hcount"] =  max([0, hcount - (new_degree - init_degree)])


[docs]
class MoleculeResolver:
    """
    Resolve the molecule(s) described by a CGsmiles string and return a networkx.Graph
    of the molecule.

    First, this class has to be initiated using one of three class construction
    methods. When trying to read a CGsmiles string always use the first method.
    The other constructors can be used in case fragments or the lowest
    resolution molecule are defined by graphs that come from elsewhere.

    `self.from_string`:           use when fragments and lowest resolution are
                                  described in one CGsmiles string.
    `self.from_graph`:            use when fragments are described by CGsmiles
                                  strings but the lowest resolution is given
                                  as nx.Graph
    `self.from_fragment_dicts`:   use when fragments are given as nx.Graphs
                                  and the lowest resolution is provided as
                                  CGsmiles string

    Once the `MoleculeResolver` is initiated you can call the `resolve_iter` to
    loop over the different levels of resolution. The resolve iter will always
    return the previous lower resolution graph as well as the current higher
    resolution graph. For example, if the CGsmiles string describes a monomer
    sequence of a regular polymer, the lower resolution graph will be the graph
    of this monomer sequence and the higher resolution graph the full molecule.

    Basic Examples
    --------------
    Blocky-copolymer of PE and PEO with the first resolution being the sequence
    of blocks, followed by the monomer graph, and then the full molecule.

    >>> cgsmiles_str = "{[#B1][#B2][#B1]}.{#B1=[#PEO]|4,#B2=[#PE]|2}.{#PEO=[>]COC[<],#PE=[>]CC[<]}"
    >>> resolver = MoleculeResolver.from_string(cgsmiles_str)
    >>> for low_res, high_res in resolver.resolve_iter():
            print(low_res.nodes(data='fragname'))
            print(high_res.nodes(data='atomname))

    To only access the final resolution level you can simply call `resolve all`:
    >>> monomer_graph, full_molecule = resolver.resolve_all()

    Advanced API Examples
    ---------------------
    Alternatively, one could have gotten the block level graph from somewhere
    else defined as `nx.Graph` in that case:

    >>> # the string only defines the fragments
    >>> cgsmiles_str = "{#B1=[#PEO]|4,#B2=[#PE]|2}.{#PEO=[>]COC[<],#PE=[>]CC[<]}"
    >>> block_graph = nx.Graph()
    >>> block_graph.add_edges_from([(0, 1), (1, 2), (2, 3)])
    >>> nx.set_node_attributes(block_graph, {0: "B1", 1: "B2", 2: "B1"}, 'fragname')
    >>> resolver = MoleculeResolver.from_graph(cgsmiles_str, block_graph)

    Finally, there is the option of having the fragments from elsewhere for
    example a library. Then only the graph defined as CGsmiles string. In this
    case the `from_fragment_dicts` method can be used. Please note that the
    fragment graphs need to have the following attributes as a graph returned
    by the `cgsmiles.read_fragments` function.

    >>> fragment_dicts = []
    >>> for frag_string in ["{#B1=[#PEO]|4,#B2=[#PE]|2}", "{#PEO=[>]COC[<],#PE=[>]CC[<]}"]:
    >>>     frag_dict = read_fragments(frag_string)
    >>>     fragment_dicts.append(frag_dict)
    >>> cgsmiles_str = "{[#B1][#B2][#B1]}"
    >>> resolver = MoleculeResolver.from_fragment_dicts(cgsmiles_str, fragment_dicts)

    Subclassing
    -----------
    More advanced workflows can easily be implemented by subclassing the MoleculeResolver
    and adding new constructors that peform more complex preparation instructions for
    example.
    """
    def __init__(self,
                 molecule_graph,
                 fragment_dicts,
                 last_all_atom=True,
                 legacy=True):

        """
        Parameters
        ----------
        molecule_graph: networkx.Graph
            a lower resolution molecule graph to be resolved to higher
            resolutions molecule graphs. Each node must have the fragname
            with a dict entry in the next fragment_dicts list.
        fragment_dicts: list[dict[str, networkx.Graph]]
            a dict of fragment graphs per resolution. Each graph must have the
            same attributes as returned by the `cgsmiles.read_fragments`
            function.
        last_all_atom: bool
            if the last resolution is at the all atom level. If True the code
            will use pysmiles to parse the fragments and return the all-atom
            molecule. Default: True
        legacy: bool
            which syntax convention to use for matching the bonding descriptors.
            Legacy syntax adheres to the BigSmiles convention. Default syntax
            adheres to CGsmiles convention where bonding descriptors '$' match
            with every '$' and every '<' matches every '>'. With the BigSmiles
            convention a alphanumeric string may be provided that distinguishes
            these connectors. For example, '$A' would not match '$B'. However,
            such use cases should be rare and the CGsmiles convention facilitates
            usage of bonding descriptors in the Sampler where the labels are used
            to assign different probabilities.
        """
        self.meta_graph = nx.Graph()
        self.fragment_dicts = fragment_dicts
        self.molecule = molecule_graph
        self.last_all_atom = last_all_atom
        self.resolution_counter = 0
        self.resolutions = len(self.fragment_dicts)
        new_names = nx.get_node_attributes(self.molecule, "fragname")
        nx.set_node_attributes(self.meta_graph, new_names, "atomname")
        self.legacy = legacy


[docs]
    @staticmethod
    def read_fragment_strings(fragment_strings, last_all_atom=True):
        """
        Read a list of CGsmiles fragment_strings and return a list
        of dicts with the fragment graphs. If `last_all_atom` is
        True then pysmiles is used to read the last fragment string
        provided in the list.

        Parameters
        ----------
        fragment_strings: list[str]
            list of CGsmiles fragment strings
        last_all_atom: bool
            if the last string in the list is an all atom string
            and should be read using pysmiles.

        Returns
        -------
        list[dict[str, networkx.Graph]]
            a list of the fragment dicts composed of the fragment
            name and a nx.Graph describing the fragment
        """
        fragment_dicts = []
        for idx, fragment_str in enumerate(fragment_strings):
            all_atom = (idx == len(fragment_strings) - 1 and last_all_atom)
            f_dict = read_fragments(fragment_str, all_atom=all_atom)
            fragment_dicts.append(f_dict)
        return fragment_dicts



[docs]
    def resolve_disconnected_molecule(self, fragment_dict):
        """
        Given a connected graph of nodes with associated fragment graphs
        generate a disconnected graph of the fragments and annotate
        each fragment graph to the node in the higher resolution
        graph.

        Parameters
        ----------
        fragment_dict: dict[str, networkx.Graph]
            a dict of fragment graphs
        """
        for meta_node in self.meta_graph.nodes:
            fragname = self.meta_graph.nodes[meta_node]['fragname']

            # we have a virtual node and skip it
            if fragname not in fragment_dict:
                neighbors = self.meta_graph.neighbors(meta_node)
                orders = [self.meta_graph.edges[(meta_node, neigh)]["order"] for neigh in neighbors]
                if not all(np.array(orders) == 0):
                    raise SyntaxError(f"Found node #{fragname} but no corresponding fragment.")
                continue

            fragment = fragment_dict[fragname]
            correspondence = merge_graphs(self.molecule, fragment, fragid=meta_node)

            graph_frag = nx.Graph()

            for node in fragment.nodes:
                new_node = correspondence[node]
                attrs = copy.deepcopy(self.molecule.nodes[new_node])
                graph_frag.add_node(correspondence[node], **attrs)
                nx.set_node_attributes(graph_frag, [meta_node], 'fragid')
                graph_frag.nodes[new_node]['mapping'] = [(fragname, node)]
                self.molecule.nodes[new_node]['mapping'] = [(fragname, node)]
                if self.last_all_atom:
                    # we need to keep some info about the original valances
                    edge_orders = [self.molecule.edges[edge]["order"] for edge in self.molecule.edges(new_node)]
                    self.molecule.nodes[new_node]["_edge_orders"] = sum(edge_orders)
                    self.molecule.nodes[new_node]['_hcount'] = self.molecule.nodes[new_node].get('hcount', 0)

            for a, b in fragment.edges:
                new_a = correspondence[a]
                new_b = correspondence[b]
                attrs = copy.deepcopy(fragment.edges[(a, b)])
                graph_frag.add_edge(new_a,
                                    new_b,
                                    **attrs)

            self.meta_graph.nodes[meta_node]['graph'] = graph_frag



[docs]
    def edges_from_bonding_descrpt(self, all_atom=True):
        """
        Makes edges according to the bonding descriptors stored
        in the node attributes of meta_molecule residue graph.

        If a bonding descriptor is consumed it is removed from the list,
        however, the meta_graph edge gets an attribute with the
        bonding descriptors that formed the edge.

        Later unconsumed descriptors are discarded and the valence
        filled in using hydrogen atoms in case of an atomistic molecule.

        Parameters
        ----------
        all_atom: bool
            if the high resolution level graph has all-atom resolution
            default: False
        """
        edges = list(self.meta_graph.edges)
        for prev_node, node in edges:
            for _ in range(0, self.meta_graph.edges[(prev_node, node)]["order"]):
                prev_graph = self.meta_graph.nodes[prev_node]['graph']
                node_graph = self.meta_graph.nodes[node]['graph']
                try:
                    edge, bonding = match_bonding_descriptors(prev_graph,
                                                              node_graph,
                                                              legacy=self.legacy)
                except LookupError:
                    continue
                # remove used bonding descriptors
                prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0])
                node_graph.nodes[edge[1]]['bonding'].remove(bonding[1])

                # bonding descriptors are assumed to have bonding order 1
                # unless they are specifically annotated
                order = int(bonding[0][-1])
                if self.molecule.nodes[edge[0]].get('aromatic', False) and\
                   self.molecule.nodes[edge[1]].get('aromatic', False):
                    order = 1.5
                self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)



[docs]
    def squash_atoms(self):
        """
        Applies the squash operator by removing the duplicate node
        adding, all edges from that node to the remaining one, and
        annotate the other node with the fragid of the removed
        node.
        """
        bondings = nx.get_edge_attributes(self.molecule, 'bonding')
        squashed = {}
        for edge, bonding in bondings.items():
            if not bonding[0].startswith('!'):
                continue
            # let's squash two nodes
            node_to_keep = squashed.get(edge[0], edge[0])
            node_to_remove = squashed.get(edge[1], edge[1])
            squashed[node_to_remove] = node_to_keep
            self.molecule = nx.contracted_nodes(self.molecule,
                                                node_to_keep,
                                                node_to_remove,
                                                self_loops=False)

            # add the fragment id of the sequashed node
            self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']
            self.molecule.nodes[node_to_keep]['mapping'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['mapping']
            # add the isomer class of the squashed node
            if 'ez_isomer_class' in self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]:
                if 'ez_isomer_class' in self.molecule.nodes[node_to_keep]:
                    self.molecule.nodes[node_to_keep]['ez_isomer_class'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['ez_isomer_class']
                else:
                    self.molecule.nodes[node_to_keep]['ez_isomer_class'] = self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['ez_isomer_class']



[docs]
    def resolve(self):
        """
        Resolve a CGsmiles string once and return the next resolution.
        """
        # check if this is an all-atom level resolution
        all_atom = (self.resolution_counter == self.resolutions - 1 and self.last_all_atom)

        # get the next set of fragments
        fragment_dict = self.fragment_dicts[self.resolution_counter]

        # set the previous molecule as meta_graph
        self.meta_graph = self.molecule

        # now we have to switch the node names and the fragment names
        new_fragnames = nx.get_node_attributes(self.meta_graph, "atomname")
        nx.set_node_attributes(self.meta_graph, new_fragnames, "fragname")

        # create an empty molecule graph
        self.molecule = nx.Graph()

        # add disconnected fragments to graph
        self.resolve_disconnected_molecule(fragment_dict)

        # connect valid bonding descriptors
        self.edges_from_bonding_descrpt(all_atom=all_atom)

        # contract atoms with squash descriptors
        self.squash_atoms()
        # rebuild hydrogen in all-atom case
        if all_atom:
            _adjust_hcount(self.molecule)
            rebuild_h_atoms(self.molecule)

        # sort the atoms
        self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid"))

        if all_atom:
            annotate_ez_isomers_cgsmiles(self.molecule)

        # and redo the meta molecule
        self.meta_graph = annotate_fragments(self.meta_graph,
                                             self.molecule)

        if all_atom:
            # in all-atom MD there are common naming conventions
            # that might be expected and hence we set them here
            set_atom_names_atomistic(self.molecule,
                                     self.meta_graph)

        # increment the resolution counter
        self.resolution_counter += 1

        return self.meta_graph, self.molecule



[docs]
    def resolve_iter(self):
        """
        Iterator returning all resolutions in oder.
        """
        for _ in range(self.resolutions):
            meta_graph, molecule = self.resolve()
            yield meta_graph, molecule



[docs]
    def resolve_all(self):
        """
        Resolve all layers and return final moleculs as well as the previous
        resolution graph.
        """
        *_, (meta_graph, graph) = self.resolve_iter()
        return meta_graph, graph



[docs]
    @classmethod
    def from_string(cls, cgsmiles_str, last_all_atom=True, legacy=True):
        """
        Initiate a MoleculeResolver instance from a cgsmiles string.

        Parameters
        ----------
        cgsmiles_str: str
        last_all_atom: bool
            if the last resolution is all-atom and is read using pysmiles
        legacy: bool
            which syntax convention to use for matching the bonding descriptors.
            Legacy syntax adheres to the BigSmiles convention. Default syntax
            adheres to CGsmiles convention. A more detailed explanation can be
            found in the MoleculeResolver.__init__ method.

        Returns
        -------
        :class:`MoleculeResolver`
        """
        # here we figure out how many resolutions we are dealing with
        elements = re.findall(r"\{[^\}]+\}", cgsmiles_str)
        # the first one describes our lowest resolution
        molecule = read_cgsmiles(elements[0])
        # the rest are fragment lists
        fragment_dicts = cls.read_fragment_strings(elements[1:],
                                                   last_all_atom=last_all_atom)
        resolver_obj = cls(molecule_graph=molecule,
                           fragment_dicts=fragment_dicts,
                           last_all_atom=last_all_atom,
                           legacy=legacy)
        return resolver_obj



[docs]
    @classmethod
    def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True, legacy=True):
        """
        Initiate a MoleculeResolver instance from a cgsmiles string
        and a `meta_graph` that describes the lowest resolution.

        Parameters
        ----------
        cgsmiles_str: str
        meta_graph: networkx.Graph
            a graph describing the lowest resolution. All nodes must have the
            fragname attribute set.
        last_all_atom: bool
            if the last resolution is all-atom and is read using pysmiles
        legacy: bool
            which syntax convention to use for matching the bonding descriptors.
            Legacy syntax adheres to the BigSmiles convention. Default syntax
            adheres to CGsmiles convention. A more detailed explanation can be
            found in the MoleculeResolver.__init__ method.

        Returns
        -------
        :class:`MoleculeResolver`
        """
        # here we figure out how many resolutions we are dealing with
        elements = re.findall(r"\{[^\}]+\}", cgsmiles_str)
        # all elements are are fragment lists
        fragment_dicts = cls.read_fragment_strings(elements,
                                                   last_all_atom=last_all_atom)
        if not all('fragname' in meta_graph.nodes[n] for n in meta_graph.nodes):
            msg = "All nodes must have the fragname attribute set."
            raise IOError(msg)

        resolver_obj = cls(molecule_graph=meta_graph,
                           fragment_dicts=fragment_dicts,
                           last_all_atom=last_all_atom,
                           legacy=legacy)

        return resolver_obj



[docs]
    @classmethod
    def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True, legacy=True):
        """
        Initiate a MoleculeResolver instance from a cgsmiles string, describing
        one molecule and fragment_dicts containing fragments for each resolution.

        Parameters
        ----------
        cgsmiles_str: str
        fragment_dicts: list[dict[str, networkx.Graph]]
            a dict of fragment graphs per resolution. Each graph must have the
            same attributes as returned by the `cgsmiles.read_fragments`
            function.
        last_all_atom: bool
            if the last resolution is all-atom and is read using pysmiles
        legacy: bool
            which syntax convention to use for matching the bonding descriptors.
            Legacy syntax adheres to the BigSmiles convention. Default syntax
            adheres to CGsmiles convention. A more detailed explanation can be
            found in the MoleculeResolver.__init__ method.

        Returns
        -------
        :class:`MoleculeResolver`
        """
        # here we figure out how many resolutions we are dealing with
        elements = re.findall(r"\{[^\}]+\}", cgsmiles_str)
        if len(elements) > 1:
            msg = ("Your cgsmiles string can only describe one "
                   "resolution of a molecule when using this function.")
            raise IOError(msg)
        # the first one describes our lowest resolution
        molecule = read_cgsmiles(elements[0])
        resolver_obj = cls(molecule_graph=molecule,
                           fragment_dicts=fragment_dicts,
                           last_all_atom=last_all_atom,
                           legacy=legacy)
        return resolver_obj