from collections import defaultdict
import re
import numpy as np
import networkx as nx
from .dialects import parse_graph_base_node
PATTERNS = {"bond_anchor": r"\[\$.*?\]",
"place_holder": r"\[\#.*?\]",
"annotation": r"\|.*?\|",
"fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)',
"seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'}
def _find_next_character(string, chars, start):
for idx, token in enumerate(string[start:]):
if token in chars:
return idx+start
return len(string)
def _expand_branch(mol_graph, current, anchor, recipe):
"""
Small utility function that takes a `mol_graph` and
starts to add nodes according to recipe to an anchor.
Parameters
----------
mol_graph: networkx.Graph
the starting graph of the molecule
current: collections.abc.Hashable
first node to be added to new graph
anchor: collections.abc.Hashable
anchor to which to connect current node
recipe: list[(str, int, dict, int)]
list storing tuples of node names and
the number of times the node has to be added,
a dict of attributes and the bond order
Returns
-------
networkx.Graph
"""
prev_node = anchor
for bdx, (n_mon, attributes, order) in enumerate(recipe):
if bdx == 0:
anchor = current
for _ in range(0, n_mon):
mol_graph.add_node(current, **attributes)
mol_graph.add_edge(prev_node, current, order=order)
prev_node = current
current += 1
prev_node = anchor
return mol_graph, current, prev_node
[docs]
def read_cgsmiles(pattern):
"""
Generate a networkx.Graph from a pattern string according to the
CGsmiles line notation.
The syntax scheme consists of two curly braces enclosing the graph
sequence. It can contain any enumeration of nodes by writing them
as if they were smile atoms but the atomname is given by:
`[# + fragname + ]`
This input fomat can handle branching as well as cycles following
the OpenSmiles syntax. For example, branches are indicated using
enclosing them in `( ... )`. Rings are indicated by placing a
after the closing braces of two nodes to be connected. Note that
in agreement with OpenSmiles at most two digits an be used.
The general pattern of a CGsmiles string looks like this:
`'{' + [#fragname_1][#fragname_2]... + '}'`
In addition to plain enumeration we allow some special operators
that simplify the description of large but regular graphs such as
needed to describe polymer molecules.
The expansion operator `|` and an integer number that specifies
how many times the given residue should be added within a sequence.
For example, a pentamer of Polyethylene oxide can be written as:
`{[#PEO][#PEO][#PEO][#PEO][#PEO]}`
or using the expansion operator as:
`{[#PEO]|5}`
The expansion operator also applies to branches. Here the following
convention applies. The complete branch including it's first
anchoring node is repeated. For example, to generate a PMA-g-PEG
polymer containing 9 residues the following syntax is permitted:
`{[#PMA]([#PEO][#PEO])|3}`
This is equivalent to the following CGsmiles string:
`{[#PMA]([#PEO][#PEO])[#PMA]([#PEO][#PEO])[#PMA]([#PEO][#PEO])}`
Parameters
----------
pattern: str
a string describing a graph according to CGsmiles syntax
Returns
-------
networkx.Graph
"""
mol_graph = nx.Graph()
current = 0
# stores one or more branch anchors; each next
# anchor belongs to a nested branch
branch_anchor = []
# used for storing composition protocol for
# for branches; each entry is a list of
# branches from extending from the anchor
# point
recipes = defaultdict(list)
# the previous node
prev_node = None
# do we have an open branch
branching = False
# do we have an open cycle
cycle = {}
cycle_edges = []
# each element in the for loop matches a pattern
# '[' + '#' + some alphanumeric name + ']'
symbol_to_order = {".": 0, "=": 2, "-": 1, "#": 3, "$": 4}
default_bond_order = 1
bond_order = None
prev_bond_order = None
for match in re.finditer(PATTERNS['place_holder'], pattern):
start, stop = match.span()
# we start a new branch when the residue is preceded by '('
# as in ... ([#PEO] ...
if pattern[start-1] == '(' or pattern[start-2] == '(':
branching = True
branch_anchor.append(prev_node)
# the recipe for making the branch includes the anchor;
# which is hence the first residue in the list
# at this point the bond order is still 1 unless we have an expansion
recipes[branch_anchor[-1]] = [(1, attributes, 1)]
if pattern[start-2] == '(' and pattern[start-1] in symbol_to_order:
prev_bond_order = symbol_to_order[pattern[start-1]]
# here we check if the atom is followed by a cycle marker
# in this case we have an open cycle and close it
ring_marker = ""
multi_ring = False
ring_bond_order = default_bond_order
for rdx, token in enumerate(pattern[stop:]):
if multi_ring and not token.isdigit():
ring_marker = int(ring_marker[1:])
if ring_marker in cycle:
cycle_edges.append((current,
cycle[ring_marker][0],
cycle[ring_marker][1]))
del cycle[ring_marker]
else:
cycle[ring_marker] = [current, ring_bond_order]
multi_ring = False
ring_marker = ""
ring_bond_order = default_bond_order
# we open a new multi ring
if token == "%":
multi_ring = True
ring_marker = '%'
# we open a ring or close
elif token.isdigit():
ring_marker += token
if not multi_ring:
ring_marker = int(ring_marker)
# we have a single digit marker and it is in
# cycle so we close it
if ring_marker in cycle:
cycle_edges.append((current,
cycle[ring_marker][0],
cycle[ring_marker][1]))
del cycle[ring_marker]
# the marker is not in cycle so we update cycles
else:
cycle[ring_marker] = [current, ring_bond_order]
ring_marker = ""
ring_bond_order = default_bond_order
# we found bond_order
elif token in symbol_to_order:
ring_bond_order = symbol_to_order[token]
else:
break
# check if there is a bond-order following the node
if stop < len(pattern) and pattern[stop+rdx-1] in '- + . = # $':
bond_order = symbol_to_order[pattern[stop+rdx-1]]
else:
bond_order = default_bond_order
# here we check if the atom is followed by a expansion character '|'
# as in ... [#PEO]|
if stop < len(pattern) and pattern[stop] == '|':
# eon => end of next
# we find the next character that starts a new residue, ends
# a branch or ends the complete pattern
eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop)
# between the expansion character and the eon character
# is any number that corresponds to the number of times
# (i.e. monomers) that this atom should be added
n_mon = int(pattern[stop+1:eon])
else:
n_mon = 1
# the fragname starts at the second character and ends
# one before the last according to the above pattern
fragname = match.group(0)[2:-1]
# read the annotations
attributes = parse_graph_base_node(fragname)
# if this residue is part of a branch we store it in
# the recipe dict together with the anchor residue
# and expansion number
if branching:
recipes[branch_anchor[-1]].append((n_mon, attributes, prev_bond_order))
# new we add new residue as often as required
connection = []
for _ in range(0, n_mon):
mol_graph.add_node(current, **attributes)
if prev_node is not None:
mol_graph.add_edge(prev_node, current, order=prev_bond_order)
prev_bond_order = bond_order
# here we have a double edge
for cycle_edge in cycle_edges:
if cycle_edge in mol_graph.edges:
msg=("You define two edges between the same node."
"Use bond order symbols instead.")
raise SyntaxError(msg)
mol_graph.add_edge(cycle_edge[0],
cycle_edge[1],
order=cycle_edge[2])
prev_node = current
current += 1
cycle_edges = []
# here we check if the residue considered before is the
# last residue of a branch (i.e. '...[#residue])'
# that is the case if the branch closure comes before
# any new atom begins
branch_stop = _find_next_character(pattern, ['['], stop) >\
_find_next_character(pattern, [')'], stop)
# if the branch ends we reset the anchor
# and set branching False unless we are in
# a nested branch
if stop <= len(pattern) and branch_stop:
branching = False
prev_node = branch_anchor.pop()
if branch_anchor:
branching = True
#========================================
# expansion for branches
#========================================
# We need to know how often the branch has
# to be added so we first identify the branch
# terminal character ')' called eon_a.
eon_a = _find_next_character(pattern, [')'], stop)
# Then we check if the expansion character
# is next.
if (eon_a+1 < len(pattern) and pattern[eon_a+1] == "|") or\
(eon_a+2 < len(pattern) and pattern[eon_a+2] == "|"):
if pattern[eon_a+2] == "|":
anchor_order = symbol_to_order[pattern[eon_a+1]]
recipe = recipes[prev_node][0]
recipes[prev_node][0] = (recipe[0], recipe[1], anchor_order)
eon_a += 1
# If there is one we find the beginning
# of the next branch, residue or end of the string
# As before all characters inbetween are a number that
# is how often the branch is expanded.
next_characters = ['[', ')', '(', '}'] + list(symbol_to_order.keys())
eon_b = _find_next_character(pattern, next_characters, eon_a+1)
# the outermost loop goes over how often a the branch has to be
# added to the existing sequence
for idx in range(0,int(pattern[eon_a+2:eon_b])-1):
prev_anchor = None
skip = 0
# in principle each branch can contain any number of nested branches
# each branch is itself a recipe that has an anchor atom
for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]:
# starting from the first nested branch we have to do some
# math to find the anchor atom relative to the first branch
# we also skip the first residue in recipe, which is the
# anchor residue. Only the outermost branch in an expansion
# is expanded including the anchor. This allows easy description
# of graft polymers.
if prev_anchor:
offset = ref_anchor - prev_anchor
prev_node = prev_node + offset
skip = 1
# this function simply adds the residues of the paticular
# branch
mol_graph, current, prev_node = _expand_branch(mol_graph,
current=current,
anchor=prev_node,
recipe=recipe[skip:])
# if this is the first branch we want to set the anchor
# as the base anchor to which we jump back after all nested
# branches have been added
if prev_anchor is None:
base_anchor = prev_node
# store the previous anchor so we can do the math for nested
# branches
prev_anchor = ref_anchor
# all branches added; then go back to the base anchor
prev_node = base_anchor
#================================================
# bond orders for after branches #
#================================================
if pattern[eon_b] in symbol_to_order:
prev_bond_order = symbol_to_order[pattern[eon_b]]
elif eon_a+1 < len(pattern) and pattern[eon_a+1] in symbol_to_order:
prev_bond_order = symbol_to_order[pattern[eon_a+1]]
# if all branches are done we need to reset the lists
# when all nested branches are completed
if len(branch_anchor) == 0:
recipes = defaultdict(list)
# raise some errors for strange stuff
if cycle:
msg = "You have a dangling ring index."
raise SyntaxError(msg)
return mol_graph