"""
Functions for reading the fragment list.
"""
import logging
from collections import defaultdict
import networkx as nx
import pysmiles
from .read_cgsmiles import read_cgsmiles
from .dialects import _fragment_node_parser
from .pysmiles_utils import read_fragment_smiles
from .cgsmiles_utils import read_fragment_cgsmiles
# Fetch the logger registered under the name 'pysmiles' and raise its
# threshold to ERROR, so records below that level sent through it are dropped.
logger = logging.getLogger('pysmiles')
logger.setLevel(level=logging.ERROR)
[docs]
class PeekIter(object):
    """
    Custom iter that allows looking ahead, without
    advancing the actual iter.

    One item of look-ahead is buffered. The buffer is tracked with a
    dedicated sentinel object instead of by truthiness, so that falsy
    items (``0``, ``''``, ...) can be peeked without being lost.

    Parameters
    ----------
    collection: iterable
        the items to iterate over
    """
    # marks an empty look-ahead slot
    _EMPTY = object()

    def __init__(self, collection):
        self.collection = iter(collection)
        self._peek = self._EMPTY

    def __next__(self):
        # hand out the buffered item first, if there is one
        if self._peek is not self._EMPTY:
            item = self._peek
            self._peek = self._EMPTY
            return item
        return next(self.collection)

    def peek(self):
        """
        Return the item the following call to ``next`` will
        yield, without consuming it.

        Returns
        -------
        object or None
            the upcoming item; None if the underlying
            iterator is exhausted
        """
        if self._peek is self._EMPTY:
            try:
                self._peek = next(self.collection)
            except StopIteration:
                # nothing left; keep the slot empty so that
                # next() raises StopIteration as usual
                return None
        return self._peek

    def __iter__(self):
        return self
def _find_bonded_ring_node(ring_nodes, node):
    """
    Look up the partner of `node` in a flat list that is read
    as consecutive pairs (positions 0/1, 2/3, ...).

    Parameters
    ----------
    ring_nodes: list
        flat list of nodes
    node:
        the node whose partner is wanted; its first
        occurrence in `ring_nodes` is used

    Returns
    -------
    the entry that shares a pair with `node`
    """
    position = ring_nodes.index(node)
    # even positions pair with the next entry, odd ones with the previous
    step = 1 if position % 2 == 0 else -1
    return ring_nodes[position + step]
[docs]
def collect_ring_number(smile_iter, token, node_count, rings):
    """
    When a ring identifier is found, this function will add
    the current node to the rings dict.

    Starting at `token`, the whole run of consecutive digits
    and '%' characters is consumed. Every digit outside of a
    '%' group is stored as its own ring id. A '%' opens a
    multi-digit ring id, which is made of the '%' plus all
    digits up to the next '%' or the end of the run; it is
    stored under that literal text (e.g. ``'%10'``).

    Parameters
    ----------
    smile_iter: :class:`PeekIter`
        iterator positioned right after `token`
    token: str
        first character of the ring identifier run
    node_count: int
        node the ring identifiers belong to
    rings: dict[list]
        maps ring ids to the nodes carrying them;
        updated in place

    Returns
    -------
    PeekIter
        the advanced smiles_iter
    str or None
        the next token, which has been peeked but not
        consumed; None if the iterator is exhausted
    str
        the consumed ring identifier characters
    dict[list]
        the updated rings dict
    """
    partial_str = ""
    # text of the '%' ring id under construction; None outside of one
    multi_ring_id = None
    while True:
        if token == '%':
            # a new multi-digit id starts, so store the one just finished;
            # a lone '%' without digits is not a ring id
            if multi_ring_id is not None and len(multi_ring_id) > 1:
                rings[multi_ring_id].append(node_count)
            multi_ring_id = '%'
        elif multi_ring_id is not None:
            multi_ring_id += token
        elif token.isdigit():
            rings[token].append(node_count)

        partial_str += token
        token = smile_iter.peek()
        # stop at the end of the input or at the first non-ring character
        if token is None:
            break
        if not token.isdigit() and token != '%':
            break
        token = next(smile_iter)

    # the run is over; store a multi-digit id that is still pending
    if multi_ring_id is not None and len(multi_ring_id) > 1:
        rings[multi_ring_id].append(node_count)
    return smile_iter, token, partial_str, rings
[docs]
def strip_bonding_descriptors(fragment_string):
    """
    Processes a CGsmiles fragment string by
    stripping the bonding descriptors and storing
    them in a dict with reference to the atom they
    refer to. Furthermore, a cleaned SMILES or CGsmiles
    string is returned.

    Parameters
    ----------
    fragment_string: str
        a CGsmiles fragment string

    Returns
    -------
    str:
        the input string without bonding descriptors, node
        annotations and E/Z markers
    dict:
        a dict mapping node indices to the list of bonding
        descriptors attached to them; each descriptor has
        its bond order appended as text
    dict:
        a dict mapping node indices to the E/Z marker
        ('/' or '\\') found next to them
    dict:
        a dict mapping node indices to the annotations
        parsed from the text after ';' inside a bracket

    Raises
    ------
    ValueError
        if a '|' is not followed by a repeat count
    """
    bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
    smile_iter = PeekIter(fragment_string)
    bonding_descrpt = defaultdict(list)
    rings = defaultdict(list)
    ez_isomer_atoms = {}
    attributes = defaultdict(dict)
    record_attributes = False
    smile = ""
    node_count = 0
    prev_node = 0
    current_order = None
    anchor = []
    branch_length = 1
    for token in smile_iter:
        if token == '[':
            peek = next(smile_iter)
            # a bracket opening with one of these is a bonding descriptor
            if peek in ['$', '>', '<', '!']:
                bond_descrp = peek
                peek = next(smile_iter)
                while peek != ']':
                    bond_descrp += peek
                    peek = next(smile_iter)
                # before the first node the bond symbol trails the descriptor
                if smile_iter.peek() in bond_to_order and node_count == 0:
                    order = bond_to_order[next(smile_iter)]
                elif current_order:
                    order = current_order
                    current_order = None
                    # we need to remove the symbol from the clean string
                    smile = smile[:-1]
                else:
                    order = 1
                bonding_descrpt[prev_node].append(bond_descrp + str(order))
            # any other bracket is an atom / node, possibly annotated
            else:
                atom = token
                attribute_str = ""
                while peek != ']':
                    # we have annotations
                    if peek == ';' and not record_attributes:
                        record_attributes = True
                    elif record_attributes:
                        attribute_str += peek
                    else:
                        atom += peek
                    peek = next(smile_iter)
                record_attributes = False
                # here we do some post processing cleanup
                node_attributes = _fragment_node_parser(attribute_str)
                attributes[node_count].update(node_attributes)
                smile = smile + atom + "]"
                prev_node = node_count
                node_count += 1
                current_order = None
        elif token == '(':
            # remember where the branch starts to return to it on ')'
            anchor.append(prev_node)
            smile += token
        elif token == ')':
            prev_node = anchor.pop()
            smile += token
            # we are having a branch expansion
            if smile_iter.peek() == "|":
                branch_length = node_count - prev_node
        elif token in bond_to_order:
            current_order = bond_to_order[token]
            smile += token
        # for chirality assignment we need to collect rings
        elif token == '%' or token.isdigit():
            smile_iter, token, part_str, rings = collect_ring_number(smile_iter,
                                                                     token,
                                                                     prev_node,
                                                                     rings)
            smile += part_str
        # substring test against the literal; bond symbols were handled
        # above, so only ']', 'H', '+' and a blank can arrive here. They
        # are copied without advancing the node count.
        elif token in '] H . - = # $ : + -':
            smile += token
        # deal with ez isomers
        elif token in '/ \\':
            ez_isomer_atoms[node_count] = token
            ez_isomer_atoms[prev_node] = token
        # deal with expansion
        elif token == "|":
            smile += token
            peek = smile_iter.peek()
            count = ""
            # peek() yields None once the string is exhausted, which
            # happens when the repeat count ends the fragment string
            while peek is not None and peek.isdigit():
                count += next(smile_iter)
                peek = smile_iter.peek()
            smile += count
            # the branch was counted once already while reading it
            node_count += branch_length * int(count) - branch_length
        else:
            # two letter element symbols form a single node
            if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']:
                smile += (token + next(smile_iter))
            else:
                smile += token
            current_order = None
            prev_node = node_count
            node_count += 1
    return smile, bonding_descrpt, ez_isomer_atoms, attributes
[docs]
def fragment_iter(fragment_str, all_atom=True):
    """
    Walk over the fragments listed in a CGsmiles fragment
    string and build a graph for each of them.

    The first and last character of `fragment_str` are dropped
    and the remainder is split at every ','. Each entry is cut
    at its first '='; the text between the entry's first
    character and the '=' is the fragment name, the text after
    it is the fragment string. Bonding descriptors are stripped
    from the fragment string and handed to the reader together
    with the cleaned string.

    Parameters
    ----------
    fragment_str: str
        the string describing the fragments
    all_atom: bool
        if True the fragments are read with the all-atom
        (OpenSmiles) reader, otherwise with the CGsmiles
        reader

    Yields
    ------
    str, nx.Graph
        the fragment name and the graph built for it
    """
    for entry in fragment_str[1:-1].split(','):
        split_at = entry.find('=', 0)
        # the leading character of an entry is not part of the name
        fragname, body = entry[1:split_at], entry[split_at+1:]
        cleaned, descriptors, ez_isomers, attributes = strip_bonding_descriptors(body)
        if not all_atom:
            # we deal with a CG resolution graph
            mol_graph = read_fragment_cgsmiles(cleaned,
                                               fragname,
                                               descriptors,
                                               attributes)
        else:
            # read an all_atom fragment using OpenSMILES definition
            mol_graph = read_fragment_smiles(cleaned,
                                             fragname,
                                             descriptors,
                                             ez_isomers,
                                             attributes)
        yield fragname, mol_graph
[docs]
def read_fragments(fragment_str, all_atom=True, fragment_dict=None):
    """
    Gather the fragments of a CGsmiles fragment string into a
    dict that maps fragment names to their graphs. Bonding
    descriptors are annotated as node attributes.

    Parameters
    ----------
    fragment_str: str
        string using CGsmiles fragment syntax
    all_atom: bool
        If the fragment strings are all-atom following
        the OpenSmiles syntax. Default is True but if
        set to False fragments follow the CGsmiles
        syntax.
    fragment_dict: dict
        A dict of existing fragments. It is updated in
        place; names already present keep their graph.

    Returns
    -------
    dict
        a dict of fragments and their name
    """
    known = {} if fragment_dict is None else fragment_dict
    for fragname, mol_graph in fragment_iter(fragment_str, all_atom=all_atom):
        # the first graph stored under a name wins
        known.setdefault(fragname, mol_graph)
    return known
# ToDos
# - remove special case hydrogen line 327ff
# - check rebuild_h and clean up