Source code for cgsmiles.read_cgsmiles

from collections import defaultdict
import re
import numpy as np
import networkx as nx
from .dialects import parse_graph_base_node

PATTERNS = {"bond_anchor": r"\[\$.*?\]",
            "place_holder": r"\[\#.*?\]",
            "annotation": r"\|.*?\|",
            "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)',
            "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'}

def _find_next_character(string, chars, start):
    for idx, token in enumerate(string[start:]):
        if token in chars:
            return idx+start
    return len(string)

def _expand_branch(mol_graph, current, anchor, recipe):
    """
    Small utility function that takes a `mol_graph` and
    starts to add nodes according to recipe to an anchor.

    Parameters
    ----------
    mol_graph: networkx.Graph
        the starting graph of the molecule

    current: collections.abc.Hashable
        first node to be added to new graph

    anchor: collections.abc.Hashable
        anchor to which to connect current node

    recipe: list[(str, int, dict, int)]
        list storing tuples of node names and
        the number of times the node has to be added,
        a dict of attributes and the bond order

    Returns
    -------
    networkx.Graph
    """
    prev_node = anchor
    for bdx, (n_mon, attributes, order) in enumerate(recipe):
        if bdx == 0:
            anchor = current
        for _ in range(0, n_mon):
            mol_graph.add_node(current, **attributes)
            mol_graph.add_edge(prev_node, current, order=order)

            prev_node = current
            current += 1

    prev_node = anchor
    return mol_graph, current, prev_node


[docs]
def read_cgsmiles(pattern):
    """
    Generate a networkx.Graph from a pattern string according to the
    CGsmiles line notation.

    The syntax scheme consists of two curly braces enclosing the graph
    sequence. It can contain any enumeration of nodes by writing them
    as if they were smile atoms but the atomname is given by:

    `[# + fragname + ]`

    This input fomat can handle branching as well as cycles following
    the OpenSmiles syntax. For example, branches are indicated using
    enclosing them in `( ... )`. Rings are indicated by placing a
    after the closing braces of two nodes to be connected. Note that
    in agreement with OpenSmiles at most two digits an be used.

    The general pattern of a CGsmiles string looks like this:

    `'{' + [#fragname_1][#fragname_2]... + '}'`

    In addition to plain enumeration we allow some special operators
    that simplify the description of large but regular graphs such as
    needed to describe polymer molecules.

    The expansion operator `|` and an integer number that specifies
    how many times the given residue should be added within a sequence.
    For example, a pentamer of Polyethylene oxide can be written as:

    `{[#PEO][#PEO][#PEO][#PEO][#PEO]}`

    or using the expansion operator as:

    `{[#PEO]|5}`

    The expansion operator also applies to branches. Here the following
    convention applies. The complete branch including it's first
    anchoring node is repeated. For example, to generate a PMA-g-PEG
    polymer containing 9 residues the following syntax is permitted:

    `{[#PMA]([#PEO][#PEO])|3}`

    This is equivalent to the following CGsmiles string:

    `{[#PMA]([#PEO][#PEO])[#PMA]([#PEO][#PEO])[#PMA]([#PEO][#PEO])}`


    Parameters
    ----------
    pattern: str
        a string describing a graph according to CGsmiles syntax

    Returns
    -------
    networkx.Graph
    """

    mol_graph = nx.Graph()
    current = 0
    # stores one or more branch anchors; each next
    # anchor belongs to a nested branch
    branch_anchor = []
    # used for storing composition protocol for
    # for branches; each entry is a list of
    # branches from extending from the anchor
    # point
    recipes = defaultdict(list)
    # the previous node
    prev_node = None
    # do we have an open branch
    branching = False
    # do we have an open cycle
    cycle = {}
    cycle_edges = []
    # each element in the for loop matches a pattern
    # '[' + '#' + some alphanumeric name + ']'
    symbol_to_order = {".": 0, "=": 2, "-": 1, "#": 3, "$": 4}
    default_bond_order = 1
    bond_order = None
    prev_bond_order = None
    for match in re.finditer(PATTERNS['place_holder'], pattern):
        start, stop = match.span()
        # we start a new branch when the residue is preceded by '('
        # as in ... ([#PEO] ...
        if pattern[start-1] == '(' or pattern[start-2] == '(':
            branching = True
            branch_anchor.append(prev_node)
            # the recipe for making the branch includes the anchor;
            # which is hence the first residue in the list
            # at this point the bond order is still 1 unless we have an expansion
            recipes[branch_anchor[-1]] = [(1, attributes, 1)]
        if pattern[start-2] == '(' and pattern[start-1] in symbol_to_order:
            prev_bond_order = symbol_to_order[pattern[start-1]]
        # here we check if the atom is followed by a cycle marker
        # in this case we have an open cycle and close it
        ring_marker = ""
        multi_ring = False
        ring_bond_order = default_bond_order
        for rdx, token in enumerate(pattern[stop:]):
            if multi_ring and not token.isdigit():
                ring_marker = int(ring_marker[1:])
                if ring_marker in cycle:
                    cycle_edges.append((current,
                                        cycle[ring_marker][0],
                                        cycle[ring_marker][1]))
                    del cycle[ring_marker]
                else:
                    cycle[ring_marker] = [current, ring_bond_order]
                multi_ring = False
                ring_marker = ""
                ring_bond_order = default_bond_order

            # we open a new multi ring
            if token == "%":
                multi_ring = True
                ring_marker = '%'
            # we open a ring or close
            elif token.isdigit():
                ring_marker += token
                if not multi_ring:
                    ring_marker = int(ring_marker)
                    # we have a single digit marker and it is in
                    # cycle so we close it
                    if ring_marker in cycle:
                        cycle_edges.append((current,
                                            cycle[ring_marker][0],
                                            cycle[ring_marker][1]))
                        del cycle[ring_marker]
                    # the marker is not in cycle so we update cycles
                    else:
                        cycle[ring_marker] = [current, ring_bond_order]
                    ring_marker = ""
                    ring_bond_order = default_bond_order
            # we found bond_order
            elif token in symbol_to_order:
                ring_bond_order = symbol_to_order[token]
            else:
                break

        # check if there is a bond-order following the node
        if stop < len(pattern) and pattern[stop+rdx-1] in '- + . = # $':
            bond_order = symbol_to_order[pattern[stop+rdx-1]]
        else:
            bond_order = default_bond_order

        # here we check if the atom is followed by a expansion character '|'
        # as in ... [#PEO]|
        if stop < len(pattern) and pattern[stop] == '|':
            # eon => end of next
            # we find the next character that starts a new residue, ends
            # a branch or ends the complete pattern
            eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop)
            # between the expansion character and the eon character
            # is any number that corresponds to the number of times
            # (i.e. monomers) that this atom should be added
            n_mon = int(pattern[stop+1:eon])
        else:
            n_mon = 1

        # the fragname starts at the second character and ends
        # one before the last according to the above pattern
        fragname = match.group(0)[2:-1]
        # read the annotations
        attributes = parse_graph_base_node(fragname)

        # if this residue is part of a branch we store it in
        # the recipe dict together with the anchor residue
        # and expansion number
        if branching:
            recipes[branch_anchor[-1]].append((n_mon, attributes, prev_bond_order))
        # new we add new residue as often as required
        connection = []
        for _ in range(0, n_mon):
            mol_graph.add_node(current, **attributes)

            if prev_node is not None:
                mol_graph.add_edge(prev_node, current, order=prev_bond_order)

            prev_bond_order = bond_order

            # here we have a double edge
            for cycle_edge in cycle_edges:
                if cycle_edge in mol_graph.edges:
                    msg=("You define two edges between the same node."
                         "Use bond order symbols instead.")
                    raise SyntaxError(msg)
                mol_graph.add_edge(cycle_edge[0],
                                   cycle_edge[1],
                                   order=cycle_edge[2])

            prev_node = current
            current += 1

        cycle_edges = []
        # here we check if the residue considered before is the
        # last residue of a branch (i.e. '...[#residue])'
        # that is the case if the branch closure comes before
        # any new atom begins
        branch_stop = _find_next_character(pattern, ['['], stop) >\
                      _find_next_character(pattern, [')'], stop)

        # if the branch ends we reset the anchor
        # and set branching False unless we are in
        # a nested branch
        if stop <= len(pattern) and branch_stop:
            branching = False
            prev_node = branch_anchor.pop()
            if branch_anchor:
                branching = True
            #========================================
            #       expansion for branches
            #========================================
            # We need to know how often the branch has
            # to be added so we first identify the branch
            # terminal character ')' called eon_a.
            eon_a = _find_next_character(pattern, [')'], stop)
            # Then we check if the expansion character
            # is next.
            if (eon_a+1 < len(pattern) and pattern[eon_a+1] == "|") or\
               (eon_a+2 < len(pattern) and pattern[eon_a+2] == "|"):
                if pattern[eon_a+2] == "|":
                    anchor_order = symbol_to_order[pattern[eon_a+1]]
                    recipe = recipes[prev_node][0]
                    recipes[prev_node][0] = (recipe[0], recipe[1], anchor_order)
                    eon_a += 1
                # If there is one we find the beginning
                # of the next branch, residue or end of the string
                # As before all characters inbetween are a number that
                # is how often the branch is expanded.
                next_characters = ['[', ')', '(', '}'] + list(symbol_to_order.keys())
                eon_b = _find_next_character(pattern, next_characters, eon_a+1)
                # the outermost loop goes over how often a the branch has to be
                # added to the existing sequence
                for idx in range(0,int(pattern[eon_a+2:eon_b])-1):
                    prev_anchor = None
                    skip = 0
                    # in principle each branch can contain any number of nested branches
                    # each branch is itself a recipe that has an anchor atom
                    for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]:
                        # starting from the first nested branch we have to do some
                        # math to find the anchor atom relative to the first branch
                        # we also skip the first residue in recipe, which is the
                        # anchor residue. Only the outermost branch in an expansion
                        # is expanded including the anchor. This allows easy description
                        # of graft polymers.
                        if prev_anchor:
                            offset = ref_anchor - prev_anchor
                            prev_node = prev_node + offset
                            skip = 1
                        # this function simply adds the residues of the paticular
                        # branch
                        mol_graph, current, prev_node = _expand_branch(mol_graph,
                                                                       current=current,
                                                                       anchor=prev_node,
                                                                       recipe=recipe[skip:])
                        # if this is the first branch we want to set the anchor
                        # as the base anchor to which we jump back after all nested
                        # branches have been added
                        if prev_anchor is None:
                            base_anchor = prev_node
                        # store the previous anchor so we can do the math for nested
                        # branches
                        prev_anchor = ref_anchor
                # all branches added; then go back to the base anchor
                prev_node = base_anchor
            #================================================
            #     bond orders for after branches            #
            #================================================
                if pattern[eon_b] in symbol_to_order:
                    prev_bond_order = symbol_to_order[pattern[eon_b]]
            elif eon_a+1 < len(pattern) and pattern[eon_a+1] in symbol_to_order:
                prev_bond_order = symbol_to_order[pattern[eon_a+1]]
            # if all branches are done we need to reset the lists
            # when all nested branches are completed
            if len(branch_anchor) == 0:
                recipes = defaultdict(list)

    # raise some errors for strange stuff
    if cycle:
        msg = "You have a dangling ring index."
        raise SyntaxError(msg)

    return mol_graph