Source code for cgsmiles.resolve

import re
import copy
import numpy as np
import networkx as nx
from .read_cgsmiles import read_cgsmiles
from .read_fragments import read_fragments
from .graph_utils import (merge_graphs,
                          sort_nodes_by_attr,
                          annotate_fragments,
                          set_atom_names_atomistic)
from .pysmiles_utils import (rebuild_h_atoms,
                             annotate_ez_isomers_cgsmiles)

[docs] def compatible(left, right, left_data={}, right_data={}, legacy=True): """ Check bonding descriptor compatibility according to the CGsmiles syntax conventions. With legacy, the BigSmiles convention can be used. The dicts of left_data and right_data are only used when compatibility-checking uses of the squashing ([!]) operator. Parameters ---------- left: str right: str left_data: dict right_data: dict legacy: bool Returns ------- bool """ # Non-legacy only checks the first character for compatibility if not legacy: left = left[0] right = right[0] # By flipping the <> for one of the binders, we no longer need special # case checking downstream left = left[0].translate(str.maketrans('<>','><')) + left[1:] if left != right: return False if left[0] != '!': return True # Only the '!' compatibility control left left_name = left_data.get('element', left_data.get('atomname')) right_name = right_data.get('element', right_data.get('atomname')) # Could there be nameless nodes? Anyway, not enough info to reject # so we accept if they're involved return (left_name is None or right_name is None or (left_name == right_name))
[docs] def match_bonding_descriptors(source, target, bond_attribute="bonding", legacy=True): """ Given a source and a target graph, which have bonding descriptors stored as node attributes, find a pair of matching descriptors and return the respective nodes. The function also returns the bonding descriptors. If no bonding descriptor is found an instance of LookupError is raised. Parameters ---------- source: networkx.Graph target: networkx.Graph bond_attribute: `collections.abc.Hashable` under which attribute are the bonding descriptors stored. legacy: bool which syntax convention to use when matching bonding descriptors (legacy=BigSmiles) Returns ------- ((collections.abc.Hashable, collections.abc.Hashable), (str, str)) the nodes as well as bonding descriptors Raises ------ LookupError if no match is found """ source_nodes = nx.get_node_attributes(source, bond_attribute) target_nodes = nx.get_node_attributes(target, bond_attribute) for source_node, bond_sources in source_nodes.items(): source_node_data = source.nodes[source_node] for target_node, bond_targets in target_nodes.items(): target_node_data = target.nodes[target_node] for bond_source in bond_sources: for bond_target in bond_targets: if compatible(bond_source, bond_target, source_node_data, target_node_data, legacy=legacy): return ((source_node, target_node), (bond_source, bond_target)) raise LookupError
def _adjust_hcount(molecule): """ Given atoms in a molecule built from cgsmiles fragments, adjust the hcount such that we substract the total number of newly formed edges multiplied by their bond order. The 'hcount' attribute is updated in place. Parameters ---------- molecule: networkx.Graph """ for node in molecule.nodes: hcount = molecule.nodes[node]["_hcount"] new_degree = sum([molecule.edges[edge]["order"] for edge in molecule.edges(node)]) init_degree = molecule.nodes[node]["_edge_orders"] molecule.nodes[node]["hcount"] = max([0, hcount - (new_degree - init_degree)])
[docs] class MoleculeResolver: """ Resolve the molecule(s) described by a CGsmiles string and return a networkx.Graph of the molecule. First, this class has to be initiated using one of three class construction methods. When trying to read a CGsmiles string always use the first method. The other constructors can be used in case fragments or the lowest resolution molecule are defined by graphs that come from elsewhere. `self.from_string`: use when fragments and lowest resolution are described in one CGsmiles string. `self.from_graph`: use when fragments are described by CGsmiles strings but the lowest resolution is given as nx.Graph `self.from_fragment_dicts`: use when fragments are given as nx.Graphs and the lowest resolution is provided as CGsmiles string Once the `MoleculeResolver` is initiated you can call the `resolve_iter` to loop over the different levels of resolution. The resolve iter will always return the previous lower resolution graph as well as the current higher resolution graph. For example, if the CGsmiles string describes a monomer sequence of a regular polymer, the lower resolution graph will be the graph of this monomer sequence and the higher resolution graph the full molecule. Basic Examples -------------- Blocky-copolymer of PE and PEO with the first resolution being the sequence of blocks, followed by the monomer graph, and then the full molecule. >>> cgsmiles_str = "{[#B1][#B2][#B1]}.{#B1=[#PEO]|4,#B2=[#PE]|2}.{#PEO=[>]COC[<],#PE=[>]CC[<]}" >>> resolver = MoleculeResolver.from_string(cgsmiles_str) >>> for low_res, high_res in resolver.resolve_iter(): print(low_res.nodes(data='fragname')) print(high_res.nodes(data='atomname)) To only access the final resolution level you can simply call `resolve all`: >>> monomer_graph, full_molecule = resolver.resolve_all() Advanced API Examples --------------------- Alternatively, one could have gotten the block level graph from somewhere else defined as `nx.Graph` in that case: >>> # the string only defines the fragments >>> cgsmiles_str = "{#B1=[#PEO]|4,#B2=[#PE]|2}.{#PEO=[>]COC[<],#PE=[>]CC[<]}" >>> block_graph = nx.Graph() >>> block_graph.add_edges_from([(0, 1), (1, 2), (2, 3)]) >>> nx.set_node_attributes(block_graph, {0: "B1", 1: "B2", 2: "B1"}, 'fragname') >>> resolver = MoleculeResolver.from_graph(cgsmiles_str, block_graph) Finally, there is the option of having the fragments from elsewhere for example a library. Then only the graph defined as CGsmiles string. In this case the `from_fragment_dicts` method can be used. Please note that the fragment graphs need to have the following attributes as a graph returned by the `cgsmiles.read_fragments` function. >>> fragment_dicts = [] >>> for frag_string in ["{#B1=[#PEO]|4,#B2=[#PE]|2}", "{#PEO=[>]COC[<],#PE=[>]CC[<]}"]: >>> frag_dict = read_fragments(frag_string) >>> fragment_dicts.append(frag_dict) >>> cgsmiles_str = "{[#B1][#B2][#B1]}" >>> resolver = MoleculeResolver.from_fragment_dicts(cgsmiles_str, fragment_dicts) Subclassing ----------- More advanced workflows can easily be implemented by subclassing the MoleculeResolver and adding new constructors that peform more complex preparation instructions for example. """ def __init__(self, molecule_graph, fragment_dicts, last_all_atom=True, legacy=True): """ Parameters ---------- molecule_graph: networkx.Graph a lower resolution molecule graph to be resolved to higher resolutions molecule graphs. Each node must have the fragname with a dict entry in the next fragment_dicts list. fragment_dicts: list[dict[str, networkx.Graph]] a dict of fragment graphs per resolution. Each graph must have the same attributes as returned by the `cgsmiles.read_fragments` function. last_all_atom: bool if the last resolution is at the all atom level. If True the code will use pysmiles to parse the fragments and return the all-atom molecule. Default: True legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax adheres to CGsmiles convention where bonding descriptors '$' match with every '$' and every '<' matches every '>'. With the BigSmiles convention a alphanumeric string may be provided that distinguishes these connectors. For example, '$A' would not match '$B'. However, such use cases should be rare and the CGsmiles convention facilitates usage of bonding descriptors in the Sampler where the labels are used to assign different probabilities. """ self.meta_graph = nx.Graph() self.fragment_dicts = fragment_dicts self.molecule = molecule_graph self.last_all_atom = last_all_atom self.resolution_counter = 0 self.resolutions = len(self.fragment_dicts) new_names = nx.get_node_attributes(self.molecule, "fragname") nx.set_node_attributes(self.meta_graph, new_names, "atomname") self.legacy = legacy
[docs] @staticmethod def read_fragment_strings(fragment_strings, last_all_atom=True): """ Read a list of CGsmiles fragment_strings and return a list of dicts with the fragment graphs. If `last_all_atom` is True then pysmiles is used to read the last fragment string provided in the list. Parameters ---------- fragment_strings: list[str] list of CGsmiles fragment strings last_all_atom: bool if the last string in the list is an all atom string and should be read using pysmiles. Returns ------- list[dict[str, networkx.Graph]] a list of the fragment dicts composed of the fragment name and a nx.Graph describing the fragment """ fragment_dicts = [] for idx, fragment_str in enumerate(fragment_strings): all_atom = (idx == len(fragment_strings) - 1 and last_all_atom) f_dict = read_fragments(fragment_str, all_atom=all_atom) fragment_dicts.append(f_dict) return fragment_dicts
[docs] def resolve_disconnected_molecule(self, fragment_dict): """ Given a connected graph of nodes with associated fragment graphs generate a disconnected graph of the fragments and annotate each fragment graph to the node in the higher resolution graph. Parameters ---------- fragment_dict: dict[str, networkx.Graph] a dict of fragment graphs """ for meta_node in self.meta_graph.nodes: fragname = self.meta_graph.nodes[meta_node]['fragname'] # we have a virtual node and skip it if fragname not in fragment_dict: neighbors = self.meta_graph.neighbors(meta_node) orders = [self.meta_graph.edges[(meta_node, neigh)]["order"] for neigh in neighbors] if not all(np.array(orders) == 0): raise SyntaxError(f"Found node #{fragname} but no corresponding fragment.") continue fragment = fragment_dict[fragname] correspondence = merge_graphs(self.molecule, fragment, fragid=meta_node) graph_frag = nx.Graph() for node in fragment.nodes: new_node = correspondence[node] attrs = copy.deepcopy(self.molecule.nodes[new_node]) graph_frag.add_node(correspondence[node], **attrs) nx.set_node_attributes(graph_frag, [meta_node], 'fragid') graph_frag.nodes[new_node]['mapping'] = [(fragname, node)] self.molecule.nodes[new_node]['mapping'] = [(fragname, node)] if self.last_all_atom: # we need to keep some info about the original valances edge_orders = [self.molecule.edges[edge]["order"] for edge in self.molecule.edges(new_node)] self.molecule.nodes[new_node]["_edge_orders"] = sum(edge_orders) self.molecule.nodes[new_node]['_hcount'] = self.molecule.nodes[new_node].get('hcount', 0) for a, b in fragment.edges: new_a = correspondence[a] new_b = correspondence[b] attrs = copy.deepcopy(fragment.edges[(a, b)]) graph_frag.add_edge(new_a, new_b, **attrs) self.meta_graph.nodes[meta_node]['graph'] = graph_frag
[docs] def edges_from_bonding_descrpt(self, all_atom=True): """ Makes edges according to the bonding descriptors stored in the node attributes of meta_molecule residue graph. If a bonding descriptor is consumed it is removed from the list, however, the meta_graph edge gets an attribute with the bonding descriptors that formed the edge. Later unconsumed descriptors are discarded and the valence filled in using hydrogen atoms in case of an atomistic molecule. Parameters ---------- all_atom: bool if the high resolution level graph has all-atom resolution default: False """ edges = list(self.meta_graph.edges) for prev_node, node in edges: for _ in range(0, self.meta_graph.edges[(prev_node, node)]["order"]): prev_graph = self.meta_graph.nodes[prev_node]['graph'] node_graph = self.meta_graph.nodes[node]['graph'] try: edge, bonding = match_bonding_descriptors(prev_graph, node_graph, legacy=self.legacy) except LookupError: continue # remove used bonding descriptors prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0]) node_graph.nodes[edge[1]]['bonding'].remove(bonding[1]) # bonding descriptors are assumed to have bonding order 1 # unless they are specifically annotated order = int(bonding[0][-1]) if self.molecule.nodes[edge[0]].get('aromatic', False) and\ self.molecule.nodes[edge[1]].get('aromatic', False): order = 1.5 self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)
[docs] def squash_atoms(self): """ Applies the squash operator by removing the duplicate node adding, all edges from that node to the remaining one, and annotate the other node with the fragid of the removed node. """ bondings = nx.get_edge_attributes(self.molecule, 'bonding') squashed = {} for edge, bonding in bondings.items(): if not bonding[0].startswith('!'): continue # let's squash two nodes node_to_keep = squashed.get(edge[0], edge[0]) node_to_remove = squashed.get(edge[1], edge[1]) squashed[node_to_remove] = node_to_keep self.molecule = nx.contracted_nodes(self.molecule, node_to_keep, node_to_remove, self_loops=False) # add the fragment id of the sequashed node self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid'] self.molecule.nodes[node_to_keep]['mapping'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['mapping'] # add the isomer class of the squashed node if 'ez_isomer_class' in self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]: if 'ez_isomer_class' in self.molecule.nodes[node_to_keep]: self.molecule.nodes[node_to_keep]['ez_isomer_class'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['ez_isomer_class'] else: self.molecule.nodes[node_to_keep]['ez_isomer_class'] = self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['ez_isomer_class']
[docs] def resolve(self): """ Resolve a CGsmiles string once and return the next resolution. """ # check if this is an all-atom level resolution all_atom = (self.resolution_counter == self.resolutions - 1 and self.last_all_atom) # get the next set of fragments fragment_dict = self.fragment_dicts[self.resolution_counter] # set the previous molecule as meta_graph self.meta_graph = self.molecule # now we have to switch the node names and the fragment names new_fragnames = nx.get_node_attributes(self.meta_graph, "atomname") nx.set_node_attributes(self.meta_graph, new_fragnames, "fragname") # create an empty molecule graph self.molecule = nx.Graph() # add disconnected fragments to graph self.resolve_disconnected_molecule(fragment_dict) # connect valid bonding descriptors self.edges_from_bonding_descrpt(all_atom=all_atom) # contract atoms with squash descriptors self.squash_atoms() # rebuild hydrogen in all-atom case if all_atom: _adjust_hcount(self.molecule) rebuild_h_atoms(self.molecule) # sort the atoms self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) if all_atom: annotate_ez_isomers_cgsmiles(self.molecule) # and redo the meta molecule self.meta_graph = annotate_fragments(self.meta_graph, self.molecule) if all_atom: # in all-atom MD there are common naming conventions # that might be expected and hence we set them here set_atom_names_atomistic(self.molecule, self.meta_graph) # increment the resolution counter self.resolution_counter += 1 return self.meta_graph, self.molecule
[docs] def resolve_iter(self): """ Iterator returning all resolutions in oder. """ for _ in range(self.resolutions): meta_graph, molecule = self.resolve() yield meta_graph, molecule
[docs] def resolve_all(self): """ Resolve all layers and return final moleculs as well as the previous resolution graph. """ *_, (meta_graph, graph) = self.resolve_iter() return meta_graph, graph
[docs] @classmethod def from_string(cls, cgsmiles_str, last_all_atom=True, legacy=True): """ Initiate a MoleculeResolver instance from a cgsmiles string. Parameters ---------- cgsmiles_str: str last_all_atom: bool if the last resolution is all-atom and is read using pysmiles legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax adheres to CGsmiles convention. A more detailed explanation can be found in the MoleculeResolver.__init__ method. Returns ------- :class:`MoleculeResolver` """ # here we figure out how many resolutions we are dealing with elements = re.findall(r"\{[^\}]+\}", cgsmiles_str) # the first one describes our lowest resolution molecule = read_cgsmiles(elements[0]) # the rest are fragment lists fragment_dicts = cls.read_fragment_strings(elements[1:], last_all_atom=last_all_atom) resolver_obj = cls(molecule_graph=molecule, fragment_dicts=fragment_dicts, last_all_atom=last_all_atom, legacy=legacy) return resolver_obj
[docs] @classmethod def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True, legacy=True): """ Initiate a MoleculeResolver instance from a cgsmiles string and a `meta_graph` that describes the lowest resolution. Parameters ---------- cgsmiles_str: str meta_graph: networkx.Graph a graph describing the lowest resolution. All nodes must have the fragname attribute set. last_all_atom: bool if the last resolution is all-atom and is read using pysmiles legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax adheres to CGsmiles convention. A more detailed explanation can be found in the MoleculeResolver.__init__ method. Returns ------- :class:`MoleculeResolver` """ # here we figure out how many resolutions we are dealing with elements = re.findall(r"\{[^\}]+\}", cgsmiles_str) # all elements are are fragment lists fragment_dicts = cls.read_fragment_strings(elements, last_all_atom=last_all_atom) if not all('fragname' in meta_graph.nodes[n] for n in meta_graph.nodes): msg = "All nodes must have the fragname attribute set." raise IOError(msg) resolver_obj = cls(molecule_graph=meta_graph, fragment_dicts=fragment_dicts, last_all_atom=last_all_atom, legacy=legacy) return resolver_obj
[docs] @classmethod def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True, legacy=True): """ Initiate a MoleculeResolver instance from a cgsmiles string, describing one molecule and fragment_dicts containing fragments for each resolution. Parameters ---------- cgsmiles_str: str fragment_dicts: list[dict[str, networkx.Graph]] a dict of fragment graphs per resolution. Each graph must have the same attributes as returned by the `cgsmiles.read_fragments` function. last_all_atom: bool if the last resolution is all-atom and is read using pysmiles legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax adheres to CGsmiles convention. A more detailed explanation can be found in the MoleculeResolver.__init__ method. Returns ------- :class:`MoleculeResolver` """ # here we figure out how many resolutions we are dealing with elements = re.findall(r"\{[^\}]+\}", cgsmiles_str) if len(elements) > 1: msg = ("Your cgsmiles string can only describe one " "resolution of a molecule when using this function.") raise IOError(msg) # the first one describes our lowest resolution molecule = read_cgsmiles(elements[0]) resolver_obj = cls(molecule_graph=molecule, fragment_dicts=fragment_dicts, last_all_atom=last_all_atom, legacy=legacy) return resolver_obj