import re
import logging
import random
import time
from collections import defaultdict
import numpy as np
import networkx as nx
from .read_fragments import read_fragments
from .graph_utils import (merge_graphs,
sort_nodes_by_attr,
set_atom_names_atomistic)
from .pysmiles_utils import rebuild_h_atoms, compute_mass
from .cgsmiles_utils import find_open_bonds, find_complementary_bonding_descriptor
logger = logging.getLogger(__name__)
def _select_bonding_operator(bonds, probabilities=None):
if probabilities:
probs = np.array([probabilities.get(bond_type, 0) for bond_type in bonds])
probs = probs / sum(probs)
bonding = random.choices(bonds, weights=probs)[0]
else:
bonding = random.choice(bonds)
return bonding
def _set_bond_order_defaults(bonding):
if isinstance(bonding, dict):
default_dict = {}
for bond_operator, prob in bonding.items():
# we need to patch the bond order space
if not bond_operator[-1].isdigit():
bond_operator += '1'
default_dict[bond_operator] = prob
return default_dict
else:
default_list = []
for bond_operator in bonding:
if not bond_operator[-1].isdigit():
bond_operator += '1'
default_list.append(bond_operator)
return default_list
[docs]
class MoleculeSampler:
"""
Given a fragment string in CGsmiles format and probabilities for residues
to occur, return a random molecule with target molecular weight.
First, this class has to be initiated using the class construction
method `from_string`, which makes sure to read and resolve the fragment
graphs provided in the CGsmiles string.
Once the `MoleculeSampler` is initiated you can call the `sampler` method
in order to generate a new random polymer molecule from the fragment string
that was provided.
In addition to just randomly creating a molecule reactivities and termination
probabilities can be provided to steer the outcome. We use the term and variable
name `reactivity` whenever referring to a probability that a certain bonding
descriptor is used to highlight the fact that the probabilistic behaviour is driven
by the bonding descriptors.
The sampler features a two level connectivity determination. First using the
`polymer_reactivities` an open bonding descriptor from the growing polymer
molecule is selected according to the probabilities provided. Subsequently,
a fragment is randomly chosen that has a matching complementary bonding
descriptor.
If in addition the `fragment_reactivities` are provided the algorithm will
select a fragment that features a bonding descriptor according to the
conditional probabilities provided. The `fragment_reactivities` keyword
accepts a two level dict. The first key is the bonding descriptor found on
the polymer molecule (i.e. the one that will be used to extend the polymer).
The second level contains a dict specifying a probability for the bonding
descriptors found in the fragments (i.e. their conditional probability given
the polymer descriptors).
Basic Examples
--------------
Random-copolymer of PMMA and PS with equal probabilities.
>>> cgsmiles_str = "{#PMMA=[>]C(C)C[<]C(=O)OC,#PS=[>]CC[<]c1cccc1}"
>>> polymer_reactivities = {'>':0.5, '<':0.5}
>>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str,
polymer_reactivities=polymer_reactivities)
>>> polymer_graph = sampler.sample(target_weight=10000)
One can also make a blocky copolymer by providing the conditional
reactivities. To distinguish the descriptors on PMMA vs PS we can
label them with a alphanumeric string.
>>> cgsmiles_str = "{#PMMA=[$A]C(C)C[$B]C(=O)OC,#PS=[$C]CC[$D]c1cccc1}"
>>> polymer_reactivites = {'$A':0.5, '$B':0, '$C': 0.5, '$D': 0.0}
>>> fragment_reactivities = {'$A': {'$A': 0., '$C': 0., '$B': 0.7, '$D': 0.3},
'$B': {'$A': 0.7, '$C': 0.3, '$B': 0.0, '$D': 0.0 },
'$C': {'$A': 0., '$C': 0., '$B': 0.3, '$D': 0.7},
'$D': {'$A': 0.3, '$C': 0.7, '$B': 0.0, '$D': 0.0 }}
>>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str,
polymer_reactiviyies=polymer_reactivities,
fragment_reactivities=fragment_reactivities)
>>> polymer_graph = sampler.sample(target_weight=10000)
One can also make a blocky copolymer but steer the probability of having
head-to-head tail-to-tail vs head-to-tail addition. This is done by labeling
the bonding descriptors A-D as before but this time allowing head-head
and tail-tail additions (i.e. (A,A) & (A,C)) but with lower reactivities.
>>> cgsmiles_str = "{#PMMA=[$A]C(C)C[$B]C(=O)OC,#PS=[$C]CC[$D]c1cccc1}"
>>> polymer_reactivities = {'$A':0.25, '$B':0.25, '$C': 0.25, '$D': 0.25}
>>> fragment_reactivities = {'$A': {'$A': 0.1, '$C': 0.1, '$B': 0.4, '$D': 0.4},
'$B': {'$A': 0.4, '$C': 0.4, '$B': 0.1, '$D': 0.1},
'$C': {'$A': 0.1, '$C': 0.1, '$B': 0.4, '$D': 0.4},
'$D': {'$A': 0.4, '$C': 0.4, '$B': 0.1, '$D': 0.1 }}
>>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str,
polymer_reactivities=polymer_reactivities,
fragment_reactivities=fragment_reactivities)
>>> polymer_graph = sampler.sample(target_weight=10000)
Advanced API Examples
---------------------
Of course the sampler is not limited to linear co-polymers. Also randomly
branched molecules such as bottle-brush copolymers can be generated. To
steer the branching ratio and branch extension as well as terminal end
groups the `branch_term_probs`, `terminal_fragments`, and `terminal_reactivities`
can be provided.
For example, To generate a bottle brush polymer that has PMA in the backbone
and PEG as side-chain terminated with an OH group the following CGsmiles string
in combination with the above mentioned probabilities can be provided.
Note that in this case we declare '$A' and '$B' to be terminal bonding
descriptors. That means if they are selected all other bonding descriptors
are automatically removed from the previous residue. So if PEG connects
to OH '>A' is removed from PEG to avoid a trivalent PEG. Likewise, if
the terminal bonding selector is not chosen (i.e. if PEG connects to PEG)
it is removed from the source residue. Finally, we need to specify that
'$A' has 0 chance of connecting to itself.
>>> cgsmiles_str="{#PMA=[>]CC[<]C(=O)OC[>A],#PEG=[<A]COC[>A][$A],#OH=[$B]O}"
>>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str,
terminal_bonds=['$A', '$B],
polymer_reactivities={'<': 0.1, '>': 0.1,
'>A': 0.8, '<A':0.8,
'$A':0.3, '$B':0.0},
fragment_reactivities={'$A': {'$A':0, '$B': 1.0},},
all_atom=True)
>>> molecule = sampler.sample(target_weight=5008)
This combination results into a dense brush where very PMA molecule has a PEG
branch whose length is determined by a 1 in 3 probability to terminate.
In contrast one can also generate sparse branched molecules. For instance,
Dextran at low molecular weights is mainly a linear alpha 1-6 polyglucose
polymer. Occasionally, a 1-4 branch extends from it which has lengths of 2-4.
At the Martini coarse-grained level Dextran is described by three particles
A, B, and C. The alpha 1-6 bonds are between A and C whereas the 1-4 bonds
are between A and B.
>>> cgsmiles_str="{#GLC=[$A][#A]1[#B][$B][#C]1[$C]}"
>>> sampler = MoleculeSampler.from_fragment_string(cgsmiles_str,
fragment_masses={'GLC': 165},
polymer_reactivities={'$A': 0.8, '$C': 0.1, '$B': 0.1},
fragment_reactivities={'$A': {'$A': 0.0, '$C': 1.0, '$B': 0.0},
'$B': {'$A': 1.0, '$C': 0.0, '$B': 0.0},
'$C': {'$A':1.0, '$C':0.0, '$B':0.0}},
all_atom=False)
>>> molecule = sampler.sample(target_weight=5008)
"""
def __init__(self,
fragment_dict,
polymer_reactivities,
fragment_reactivities={},
terminal_bonds=[],
fragment_masses=None,
all_atom=True,
seed=None):
"""
Parameters
----------
fragment_dict: dict[str, networkx.Graph]
a dict of fragment graphs at one resolution. Each graph must have
the same attributes as returned by the `cgsmiles.read_fragments`
function.
polymer_reactivities: dict[str, float]
a dict that lists all bonding descriptors in the fragments
and assign a probability to them (i.e. that they will react).
fragment_reactivities
probability of given a certain bonding descriptor what
should be the next bonding descriptor. For example:
{'&A': {'&A': 0.2, '&B': 0.8}, '&B': {'&B': 0.2, '&A': 0.8}}
terminal_bonds: list[str]
a list of bonding descriptors for termini
fragment_masses: dict[str, float]
masses of the molecule fragments; if all_atom is True
these can be left out and are automatically computed from
the element masses
all_atom: bool
if the fragments are all-atom resolution
seed: int
set random seed for all processes; default is None
"""
# first initialize the random number generator
if seed is None:
seed = time.time_ns()
logger.info("Your random seed is %i", seed)
random.seed(a=seed)
self.fragment_dict = fragment_dict
# we need to set some defaults and attributes
self.polymer_reactivities = _set_bond_order_defaults(polymer_reactivities)
self.fragment_reactivities = {}
for key, probs in fragment_reactivities.items():
if not key[-1].isdigit():
key += '1'
self.fragment_reactivities[key] = _set_bond_order_defaults(probs)
self.all_atom = all_atom
# make sure to get bond order defaults
self.terminal_bonds = _set_bond_order_defaults(terminal_bonds)
# we need to make sure that we have the molecular
# masses so we can compute the target weight
self.fragments_by_bonding = defaultdict(list)
self.terminals_by_bonding = defaultdict(list)
if fragment_masses:
guess_mass_from_PTE = False
self.fragment_masses = fragment_masses
elif all_atom:
self.fragment_masses = {}
guess_mass_from_PTE = True
else:
msg = ("No fragment masses were provided but the resolution"
"is not all_atom. We cannot guess masses for arbitrary"
"CG molecules.")
raise IOError(msg)
# we need to store which bonding descriptors is present in which
# fragment so we can later just look them up
for fragname, fraggraph in self.fragment_dict.items():
if guess_mass_from_PTE:
mass = compute_mass(fraggraph)
self.fragment_masses[fragname] = mass
bondings = nx.get_node_attributes(fraggraph, "bonding")
for node, bondings in bondings.items():
for bonding in bondings:
self.fragments_by_bonding[bonding].append((fragname, node))
[docs]
def add_fragment(self,
molecule,
open_bonds,
fragments,
polymer_reactivities,
fragment_reactivities):
"""
Pick an open bonding descriptor according to `polymer_reactivities`
and then pick a fragment that has the complementary bonding descriptor.
For the second step conditional probabilities can be given using
`fragment_reactivites`.
Parameters
----------
molecule: networkx.Graph
the molecule to extend
open_bonds: dict[list[collections.abc.Hashable]]
a dict of bonding active descriptors with list of nodes
in molecule as value
fragments: dict[list[str]]
a dict of fragment names indexed by their bonding descriptors
polymer_reactivities:
the porbabilities that a bonding connector in the polymer
forms a bond with the next fragment
fragment_reactivities:
conditional probabilties that given a bonding connector in
the polymer a specific bonding connector in the fragments
will be selected
Returns
-------
networkx.Graph
the grown molecule
str
the fragment name of the added fragment
"""
# 1. get the probabilities of any bonding descriptor on the chain to
# form the new bond and pick one at random from the available ones
bonding = _select_bonding_operator(list(open_bonds.keys()), polymer_reactivities)
# 2. get a corresponding node; it may be that one descriptor is found on
# several nodes
source_node = random.choice(open_bonds[bonding])
# 3. get the complementary matching bonding descriptor
compl_bonds = find_complementary_bonding_descriptor(bonding, list(fragments.keys()))
compl_bonding = _select_bonding_operator(compl_bonds, fragment_reactivities.get(bonding, None))
# 4. pick a new fragment that has such bonding descriptor
fragname, target_node = random.choice(fragments[compl_bonding])
# 5. add the new fragment and do some book-keeping
correspondence = merge_graphs(molecule, self.fragment_dict[fragname])
molecule.add_edge(source_node,
correspondence[target_node],
bonding=(bonding, compl_bonding),
order = int(bonding[-1]))
molecule.nodes[source_node]['bonding'].remove(bonding)
molecule.nodes[correspondence[target_node]]['bonding'].remove(compl_bonding)
# here we deal with stochastic termination of branches
# we added a terminal fragments so we remove all other
# bonding descriptors form the source_node
if compl_bonding in self.terminal_bonds:
del molecule.nodes[source_node]['bonding']
# we did not add a terminal fragment but now we need to
# remove all terminal bonding descriptors from source node
else:
other_bonds = molecule.nodes[source_node].get('bonding', [])
clean_bonds = []
for bond in other_bonds:
if bond not in self.terminal_bonds:
clean_bonds.append(bond)
molecule.nodes[source_node]['bonding'] = clean_bonds
return molecule, fragname
[docs]
def sample(self, target_weight, start_fragment=None):
"""
From a list of cgsmiles fragment graphs generate a new random molecule
by stitching them together until a target_weight is reached.
Parameters
----------
target_weight: int
the weight of the polymer to generate
the weight is the minimum weight and can
differ by one additional fragment plus terminal
start_fragment: str
the fragment name to start with
Returns
-------
networkx.Graph
the graph of the molecule
"""
molecule = nx.Graph()
if start_fragment:
fragment = self.fragment_dict[start_fragment]
else:
# initialize the molecule; all fragments have the same probability
fragname = random.choice(list(self.fragment_dict.keys()))
fragment = self.fragment_dict[fragname]
merge_graphs(molecule, fragment)
open_bonds = find_open_bonds(molecule)
current_weight = 0
# next we add monomers one after the other
fragid = 1
while current_weight < target_weight:
open_bonds = find_open_bonds(molecule)
molecule, fragname = self.add_fragment(molecule,
open_bonds,
self.fragments_by_bonding,
self.polymer_reactivities,
self.fragment_reactivities)
current_weight += self.fragment_masses[fragname]
fragid += 1
if self.all_atom:
rebuild_h_atoms(molecule)
# sort the atoms
molecule = sort_nodes_by_attr(molecule, sort_attr=("fragid"))
# in all-atom MD there are common naming conventions
# that might be expected and hence we set them here
if self.all_atom:
set_atom_names_atomistic(molecule)
return molecule
[docs]
@classmethod
def from_fragment_string(cls,
cgsmiles_str,
**kwargs):
"""
Initiate a MoleculeSampler instance from a cgsmiles string, describing
the fragments fragments making up the polymer at one resolution.
Parameters
----------
cgsmiles_str: str
**kwargs:
same as MoleculeSampler.__init__
Returns
-------
:class:`MoleculeSampler`
"""
fragment_strings = re.findall(r"\{[^\}]+\}", cgsmiles_str)
if len(fragment_strings) > 1:
raise IOError("Sampling can only be done on one resolution.")
all_atom = kwargs.get('all_atom', True)
fragment_dict = read_fragments(fragment_strings[0],
all_atom=all_atom)
sampler = cls(fragment_dict,
**kwargs)
return sampler