Source code for cgsmiles.sample

import re
import logging
import random
import time
from collections import defaultdict
import numpy as np
import networkx as nx
from .read_fragments import read_fragments
from .graph_utils import (merge_graphs,
                          sort_nodes_by_attr,
                          set_atom_names_atomistic)
from .pysmiles_utils import rebuild_h_atoms, compute_mass
from .cgsmiles_utils import find_open_bonds, find_complementary_bonding_descriptor

logger = logging.getLogger(__name__)

def _select_bonding_operator(bonds, probabilities=None):
    if probabilities:
        probs = np.array([probabilities.get(bond_type, 0) for bond_type in bonds])
        probs = probs / sum(probs)
        bonding = random.choices(bonds, weights=probs)[0]
    else:
        bonding  = random.choice(bonds)
    return bonding

def _set_bond_order_defaults(bonding):
    if isinstance(bonding, dict):
        default_dict = {}
        for bond_operator, prob in bonding.items():
            # we need to patch the bond order space
            if not bond_operator[-1].isdigit():
                bond_operator += '1'
            default_dict[bond_operator] = prob
        return default_dict
    else:
        default_list = []
        for bond_operator in bonding:
            if not bond_operator[-1].isdigit():
                bond_operator += '1'
            default_list.append(bond_operator)
        return default_list

[docs] class MoleculeSampler: """ Given a fragment string in CGsmiles format and probabilities for residues to occur, return a random molecule with target molecular weight. First, this class has to be initiated using the class construction method `from_string`, which makes sure to read and resolve the fragment graphs provided in the CGsmiles string. Once the `MoleculeSampler` is initiated you can call the `sampler` method in order to generate a new random polymer molecule from the fragment string that was provided. In addition to just randomly creating a molecule reactivities and termination probabilities can be provided to steer the outcome. We use the term and variable name `reactivity` whenever referring to a probability that a certain bonding descriptor is used to highlight the fact that the probabilistic behaviour is driven by the bonding descriptors. The sampler features a two level connectivity determination. First using the `polymer_reactivities` an open bonding descriptor from the growing polymer molecule is selected according to the probabilities provided. Subsequently, a fragment is randomly chosen that has a matching complementary bonding descriptor. If in addition the `fragment_reactivities` are provided the algorithm will select a fragment that features a bonding descriptor according to the conditional probabilities provided. The `fragment_reactivities` keyword accepts a two level dict. The first key is the bonding descriptor found on the polymer molecule (i.e. the one that will be used to extend the polymer). The second level contains a dict specifying a probability for the bonding descriptors found in the fragments (i.e. their conditional probability given the polymer descriptors). Basic Examples -------------- Random-copolymer of PMMA and PS with equal probabilities. >>> cgsmiles_str = "{#PMMA=[>]C(C)C[<]C(=O)OC,#PS=[>]CC[<]c1cccc1}" >>> polymer_reactivities = {'>':0.5, '<':0.5} >>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str, polymer_reactivities=polymer_reactivities) >>> polymer_graph = sampler.sample(target_weight=10000) One can also make a blocky copolymer by providing the conditional reactivities. To distinguish the descriptors on PMMA vs PS we can label them with a alphanumeric string. >>> cgsmiles_str = "{#PMMA=[$A]C(C)C[$B]C(=O)OC,#PS=[$C]CC[$D]c1cccc1}" >>> polymer_reactivites = {'$A':0.5, '$B':0, '$C': 0.5, '$D': 0.0} >>> fragment_reactivities = {'$A': {'$A': 0., '$C': 0., '$B': 0.7, '$D': 0.3}, '$B': {'$A': 0.7, '$C': 0.3, '$B': 0.0, '$D': 0.0 }, '$C': {'$A': 0., '$C': 0., '$B': 0.3, '$D': 0.7}, '$D': {'$A': 0.3, '$C': 0.7, '$B': 0.0, '$D': 0.0 }} >>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str, polymer_reactiviyies=polymer_reactivities, fragment_reactivities=fragment_reactivities) >>> polymer_graph = sampler.sample(target_weight=10000) One can also make a blocky copolymer but steer the probability of having head-to-head tail-to-tail vs head-to-tail addition. This is done by labeling the bonding descriptors A-D as before but this time allowing head-head and tail-tail additions (i.e. (A,A) & (A,C)) but with lower reactivities. >>> cgsmiles_str = "{#PMMA=[$A]C(C)C[$B]C(=O)OC,#PS=[$C]CC[$D]c1cccc1}" >>> polymer_reactivities = {'$A':0.25, '$B':0.25, '$C': 0.25, '$D': 0.25} >>> fragment_reactivities = {'$A': {'$A': 0.1, '$C': 0.1, '$B': 0.4, '$D': 0.4}, '$B': {'$A': 0.4, '$C': 0.4, '$B': 0.1, '$D': 0.1}, '$C': {'$A': 0.1, '$C': 0.1, '$B': 0.4, '$D': 0.4}, '$D': {'$A': 0.4, '$C': 0.4, '$B': 0.1, '$D': 0.1 }} >>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str, polymer_reactivities=polymer_reactivities, fragment_reactivities=fragment_reactivities) >>> polymer_graph = sampler.sample(target_weight=10000) Advanced API Examples --------------------- Of course the sampler is not limited to linear co-polymers. Also randomly branched molecules such as bottle-brush copolymers can be generated. To steer the branching ratio and branch extension as well as terminal end groups the `branch_term_probs`, `terminal_fragments`, and `terminal_reactivities` can be provided. For example, To generate a bottle brush polymer that has PMA in the backbone and PEG as side-chain terminated with an OH group the following CGsmiles string in combination with the above mentioned probabilities can be provided. Note that in this case we declare '$A' and '$B' to be terminal bonding descriptors. That means if they are selected all other bonding descriptors are automatically removed from the previous residue. So if PEG connects to OH '>A' is removed from PEG to avoid a trivalent PEG. Likewise, if the terminal bonding selector is not chosen (i.e. if PEG connects to PEG) it is removed from the source residue. Finally, we need to specify that '$A' has 0 chance of connecting to itself. >>> cgsmiles_str="{#PMA=[>]CC[<]C(=O)OC[>A],#PEG=[<A]COC[>A][$A],#OH=[$B]O}" >>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str, terminal_bonds=['$A', '$B], polymer_reactivities={'<': 0.1, '>': 0.1, '>A': 0.8, '<A':0.8, '$A':0.3, '$B':0.0}, fragment_reactivities={'$A': {'$A':0, '$B': 1.0},}, all_atom=True) >>> molecule = sampler.sample(target_weight=5008) This combination results into a dense brush where very PMA molecule has a PEG branch whose length is determined by a 1 in 3 probability to terminate. In contrast one can also generate sparse branched molecules. For instance, Dextran at low molecular weights is mainly a linear alpha 1-6 polyglucose polymer. Occasionally, a 1-4 branch extends from it which has lengths of 2-4. At the Martini coarse-grained level Dextran is described by three particles A, B, and C. The alpha 1-6 bonds are between A and C whereas the 1-4 bonds are between A and B. >>> cgsmiles_str="{#GLC=[$A][#A]1[#B][$B][#C]1[$C]}" >>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str, fragment_masses={'GLC': 165}, polymer_reactivities={'$A': 0.8, '$C': 0.1, '$B': 0.1}, fragment_reactivities={'$A': {'$A': 0.0, '$C': 1.0, '$B': 0.0}, '$B': {'$A': 1.0, '$C': 0.0, '$B': 0.0}, '$C': {'$A':1.0, '$C':0.0, '$B':0.0}}, all_atom=False) >>> molecule = sampler.sample(target_weight=5008) """ def __init__(self, fragment_dict, polymer_reactivities, fragment_reactivities={}, terminal_bonds=[], fragment_masses=None, all_atom=True, seed=None): """ Parameters ---------- fragment_dict: dict[str, networkx.Graph] a dict of fragment graphs at one resolution. Each graph must have the same attributes as returned by the `cgsmiles.read_fragments` function. polymer_reactivities: dict[str, float] a dict that lists all bonding descriptors in the fragments and assign a probability to them (i.e. that they will react). fragment_reactivities probability of given a certain bonding descriptor what should be the next bonding descriptor. For example: {'&A': {'&A': 0.2, '&B': 0.8}, '&B': {'&B': 0.2, '&A': 0.8}} terminal_bonds: list[str] a list of bonding descriptors for termini fragment_masses: dict[str, float] masses of the molecule fragments; if all_atom is True these can be left out and are automatically computed from the element masses all_atom: bool if the fragments are all-atom resolution seed: int set random seed for all processes; default is None """ # first initialize the random number generator if seed is None: seed = time.time_ns() logger.info("Your random seed is %i", seed) random.seed(a=seed) self.fragment_dict = fragment_dict # we need to set some defaults and attributes self.polymer_reactivities = _set_bond_order_defaults(polymer_reactivities) self.fragment_reactivities = {} for key, probs in fragment_reactivities.items(): if not key[-1].isdigit(): key += '1' self.fragment_reactivities[key] = _set_bond_order_defaults(probs) self.all_atom = all_atom # make sure to get bond order defaults self.terminal_bonds = _set_bond_order_defaults(terminal_bonds) # we need to make sure that we have the molecular # masses so we can compute the target weight self.fragments_by_bonding = defaultdict(list) self.terminals_by_bonding = defaultdict(list) if fragment_masses: guess_mass_from_PTE = False self.fragment_masses = fragment_masses elif all_atom: self.fragment_masses = {} guess_mass_from_PTE = True else: msg = ("No fragment masses were provided but the resolution" "is not all_atom. We cannot guess masses for arbitrary" "CG molecules.") raise IOError(msg) # we need to store which bonding descriptors is present in which # fragment so we can later just look them up for fragname, fraggraph in self.fragment_dict.items(): if guess_mass_from_PTE: mass = compute_mass(fraggraph) self.fragment_masses[fragname] = mass bondings = nx.get_node_attributes(fraggraph, "bonding") for node, bondings in bondings.items(): for bonding in bondings: self.fragments_by_bonding[bonding].append((fragname, node))
[docs] def add_fragment(self, molecule, open_bonds, fragments, polymer_reactivities, fragment_reactivities): """ Pick an open bonding descriptor according to `polymer_reactivities` and then pick a fragment that has the complementary bonding descriptor. For the second step conditional probabilities can be given using `fragment_reactivites`. Parameters ---------- molecule: networkx.Graph the molecule to extend open_bonds: dict[list[collections.abc.Hashable]] a dict of bonding active descriptors with list of nodes in molecule as value fragments: dict[list[str]] a dict of fragment names indexed by their bonding descriptors polymer_reactivities: the porbabilities that a bonding connector in the polymer forms a bond with the next fragment fragment_reactivities: conditional probabilties that given a bonding connector in the polymer a specific bonding connector in the fragments will be selected Returns ------- networkx.Graph the grown molecule str the fragment name of the added fragment """ # 1. get the probabilities of any bonding descriptor on the chain to # form the new bond and pick one at random from the available ones bonding = _select_bonding_operator(list(open_bonds.keys()), polymer_reactivities) # 2. get a corresponding node; it may be that one descriptor is found on # several nodes source_node = random.choice(open_bonds[bonding]) # 3. get the complementary matching bonding descriptor compl_bonds = find_complementary_bonding_descriptor(bonding, list(fragments.keys())) compl_bonding = _select_bonding_operator(compl_bonds, fragment_reactivities.get(bonding, None)) # 4. pick a new fragment that has such bonding descriptor fragname, target_node = random.choice(fragments[compl_bonding]) # 5. add the new fragment and do some book-keeping correspondence = merge_graphs(molecule, self.fragment_dict[fragname]) molecule.add_edge(source_node, correspondence[target_node], bonding=(bonding, compl_bonding), order = int(bonding[-1])) molecule.nodes[source_node]['bonding'].remove(bonding) molecule.nodes[correspondence[target_node]]['bonding'].remove(compl_bonding) # here we deal with stochastic termination of branches # we added a terminal fragments so we remove all other # bonding descriptors form the source_node if compl_bonding in self.terminal_bonds: del molecule.nodes[source_node]['bonding'] # we did not add a terminal fragment but now we need to # remove all terminal bonding descriptors from source node else: other_bonds = molecule.nodes[source_node].get('bonding', []) clean_bonds = [] for bond in other_bonds: if bond not in self.terminal_bonds: clean_bonds.append(bond) molecule.nodes[source_node]['bonding'] = clean_bonds return molecule, fragname
[docs] def sample(self, target_weight, start_fragment=None): """ From a list of cgsmiles fragment graphs generate a new random molecule by stitching them together until a target_weight is reached. Parameters ---------- target_weight: int the weight of the polymer to generate the weight is the minimum weight and can differ by one additional fragment plus terminal start_fragment: str the fragment name to start with Returns ------- networkx.Graph the graph of the molecule """ molecule = nx.Graph() if start_fragment: fragment = self.fragment_dict[start_fragment] else: # initialize the molecule; all fragments have the same probability fragname = random.choice(list(self.fragment_dict.keys())) fragment = self.fragment_dict[fragname] merge_graphs(molecule, fragment) open_bonds = find_open_bonds(molecule) current_weight = 0 # next we add monomers one after the other fragid = 1 while current_weight < target_weight: open_bonds = find_open_bonds(molecule) molecule, fragname = self.add_fragment(molecule, open_bonds, self.fragments_by_bonding, self.polymer_reactivities, self.fragment_reactivities) current_weight += self.fragment_masses[fragname] fragid += 1 if self.all_atom: rebuild_h_atoms(molecule) # sort the atoms molecule = sort_nodes_by_attr(molecule, sort_attr=("fragid")) # in all-atom MD there are common naming conventions # that might be expected and hence we set them here if self.all_atom: set_atom_names_atomistic(molecule) return molecule
[docs] @classmethod def from_fragment_string(cls, cgsmiles_str, **kwargs): """ Initiate a MoleculeSampler instance from a cgsmiles string, describing the fragments fragments making up the polymer at one resolution. Parameters ---------- cgsmiles_str: str **kwargs: same as MoleculeSampler.__init__ Returns ------- :class:`MoleculeSampler` """ fragment_strings = re.findall(r"\{[^\}]+\}", cgsmiles_str) if len(fragment_strings) > 1: raise IOError("Sampling can only be done on one resolution.") all_atom = kwargs.get('all_atom', True) fragment_dict = read_fragments(fragment_strings[0], all_atom=all_atom) sampler = cls(fragment_dict, **kwargs) return sampler