Source code for cgsmiles.read_fragments

"""
Functions for reading the fragment list.
"""
import logging
from collections import defaultdict
import networkx as nx
import pysmiles
from .read_cgsmiles import read_cgsmiles
from .dialects import _fragment_node_parser
from .pysmiles_utils import read_fragment_smiles
from .cgsmiles_utils import read_fragment_cgsmiles

# Raise the threshold of the logger named 'pysmiles' so that only records
# of level ERROR and above pass it (presumably to silence the warnings
# the pysmiles library emits while fragments are parsed -- confirm).
logger = logging.getLogger('pysmiles')
logger.setLevel(logging.ERROR)


[docs]
class PeekIter(object):
    """
    Custom iter that allows looking ahead, without
    advancing the actual iter.

    At most one item is buffered. The buffer is tracked with a private
    sentinel rather than by truthiness, so falsy items (``0``, ``""``,
    ``None`` ...) are neither lost nor re-read after a call to
    :meth:`peek`.

    Parameters
    ----------
    collection: iterable
        the iterable to wrap
    """
    # marks "nothing buffered"; a unique object cannot collide with any
    # item produced by the wrapped iterable
    _EMPTY = object()

    def __init__(self, collection):
        self.collection = iter(collection)
        self._peek = self._EMPTY

    def __next__(self):
        # hand out the buffered item first, if there is one
        if self._peek is not self._EMPTY:
            item = self._peek
            self._peek = self._EMPTY
            return item
        return next(self.collection)

    def peek(self):
        """
        Return the next item without consuming it.

        Returns
        -------
        object
            the item the next call to ``next`` will return, or
            ``None`` if the underlying iterator is exhausted
        """
        if self._peek is self._EMPTY:
            try:
                self._peek = next(self.collection)
            except StopIteration:
                # keep the buffer empty so a later next() raises
                # StopIteration as usual
                return None
        return self._peek

    def __iter__(self):
        return self


def _find_bonded_ring_node(ring_nodes, node):
    """
    Return the entry of `ring_nodes` that is paired with `node`.

    Entries are treated as consecutive pairs: the entry at an even
    position is paired with the one after it, the entry at an odd
    position with the one before it.
    """
    position = ring_nodes.index(node)
    partner = position + 1 if position % 2 == 0 else position - 1
    return ring_nodes[partner]


[docs]
def collect_ring_number(smile_iter, token, node_count, rings):
    """
    When a ring identifier is found, this function will add
    the current node to the rings dict.

    Starting from `token`, all directly following ring-bond characters
    (digits and ``%``) are consumed from `smile_iter`. A single digit
    is one ring id. A ``%`` introduces a two-digit ring id as in
    OpenSMILES (``%12``); it is stored under its number without
    leading zeros, so ``%01`` and ``1`` share the key ``'1'``. An
    incomplete ``%n`` id is copied to the returned string but not
    recorded.

    Parameters
    ----------
    smile_iter: :class:`PeekIter`
        iterator positioned right after `token`
    token: str
        the first ring character (a digit or ``%``)
    node_count: int
        index of the node the ring bonds belong to
    rings: dict[list]
        maps ring id (str) to the list of node indices carrying it;
        must create missing entries on access (e.g. a
        ``defaultdict(list)``)

    Returns
    -------
    PeekIter
        the advanced smiles_iter
    str
        the first token after the ring characters; it has only been
        peeked and is still pending in the iterator. ``None`` at the
        end of the string
    str
        all characters that were consumed, including `token`
    dict[list]
        the updated rings dict
    """
    partial_str = ""
    # digits gathered after a '%'; None while not inside a '%nn' id
    pending = None
    while True:
        if token == '%':
            pending = ""
        elif token.isdigit():
            if pending is None:
                rings[token].append(node_count)
            else:
                pending += token
                if len(pending) == 2:
                    # normalise without int() so '%01' maps to '1'
                    ring_id = pending.lstrip('0') or '0'
                    rings[ring_id].append(node_count)
                    pending = None
        partial_str += token

        # only look ahead: a token that is not part of a ring id must
        # stay in the iterator for the caller
        token = smile_iter.peek()
        if token is None or not (token.isdigit() or token == '%'):
            break
        next(smile_iter)

    return smile_iter, token, partial_str, rings



[docs]
def strip_bonding_descriptors(fragment_string):
    """
    Processes a CGsmiles fragment string by
    stripping the bonding descriptors and storing
    them in a dict with reference to the atom they
    refer to. Furthermore, a cleaned SMILES or CGsmiles
    string is returned.

    The string is scanned character by character while the index of
    the node currently being read (`node_count`) and of the node new
    items attach to (`prev_node`) are tracked. Besides the bonding
    descriptors, bracket annotations (everything after a ``;`` inside
    ``[...]``) and the ``/`` and ``\\`` markers are removed from the
    returned string and reported separately.

    Parameters
    ----------
    fragment_string: str
        a CGsmiles fragment string

    Returns
    -------
    str:
        the SMILES or CGsmiles string without bonding descriptors,
        annotations and ``/`` or ``\\`` markers
    dict:
        a dict mapping node indices to a list of bonding descriptors;
        each entry is the descriptor text without brackets followed
        by the bond order (e.g. ``'$A1'``)
    dict:
        a dict mapping node indices to the ``/`` or ``\\`` token
        found next to them
    dict:
        a dict mapping node indices to the attributes returned by
        ``_fragment_node_parser`` for the annotation of that node
    """
    bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
    smile_iter = PeekIter(fragment_string)
    bonding_descrpt = defaultdict(list)
    # filled by collect_ring_number below but not part of the return value
    rings = defaultdict(list)
    ez_isomer_atoms = {}
    attributes = defaultdict(dict)
    record_attributes = False
    smile = ""
    node_count = 0
    prev_node = 0
    # order of the most recent bond symbol that has not been used yet
    current_order = None
    # stack of nodes at which a branch was opened
    anchor = []
    branch_length = 1
    for token in smile_iter:
        if token == '[':
            peek = next(smile_iter)
            # a bracket starting with one of these is a bonding descriptor
            if peek in ['$', '>', '<', '!']:
                bond_descrp = peek
                peek = next(smile_iter)
                while peek != ']':
                    bond_descrp += peek
                    peek = next(smile_iter)
                # before the first node the bond symbol follows the
                # descriptor; it is consumed here and never reaches
                # the clean string
                if smile_iter.peek() in bond_to_order and node_count == 0:
                    order = bond_to_order[next(smile_iter)]
                # NOTE(review): truthiness test, so an order of 0 ('.')
                # falls through to the default order of 1 -- confirm
                # that this is intended
                elif current_order:
                    order = current_order
                    current_order = None
                    # we need to remove the symbol from the clean string
                    smile = smile[:-1]
                else:
                    order = 1
                bonding_descrpt[prev_node].append(bond_descrp + str(order))
            else:
                # a regular bracket atom, optionally with annotations
                atom = token
                attribute_str = ""
                while peek != ']':
                    # we have annotations
                    if peek == ';' and not record_attributes:
                        record_attributes = True
                    elif record_attributes:
                        attribute_str += peek
                    else:
                        atom += peek
                    peek = next(smile_iter)

                record_attributes=False
                # here we do some post processing cleanup
                node_attributes = _fragment_node_parser(attribute_str)
                attributes[node_count].update(node_attributes)

                # only the part in front of the ';' goes into the clean string
                smile = smile + atom + "]"
                prev_node = node_count
                node_count += 1
                current_order = None
        elif token == '(':
            # remember where the branch starts
            anchor.append(prev_node)
            smile += token
        elif token == ')':
            # continue from the node at which the branch was opened
            prev_node = anchor.pop()
            smile += token
            # we are having a branch expansion
            if smile_iter.peek() == "|":
                branch_length = node_count - prev_node
        elif token in bond_to_order:
            current_order = bond_to_order[token]
            smile += token
        # for chirality assignment we need to collect rings
        elif token == '%' or token.isdigit():
            smile_iter, token, part_str, rings = collect_ring_number(smile_iter,
                                                                     token,
                                                                     prev_node,
                                                                     rings)
            smile += part_str
        # substring test on a single character; the bond symbols were
        # already handled above, so this copies ']', 'H', '+' and ' '
        # without counting them as a node
        elif token in '] H . - = # $ : + -':
            smile += token
        # deal with ez isomers
        elif token in '/ \\':
            # the marker is stored for the previous and the upcoming
            # node and left out of the clean string
            ez_isomer_atoms[node_count] = token
            ez_isomer_atoms[prev_node] = token
        # deal with expansion
        elif token == "|":
            smile += token
            peek = smile_iter.peek()
            count = ""
            # NOTE(review): peek is None when '|' is the last character,
            # so .isdigit() would raise AttributeError -- confirm that
            # such input cannot reach this function
            while peek.isdigit():
                count += next(smile_iter)
                peek = smile_iter.peek()
            smile += count
            # add the nodes of the (count - 1) additional repetitions
            node_count += branch_length * int(count) - branch_length
        else:
            # any other character starts a new node; the listed
            # two-letter symbols are read as one node
            if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']:
                smile += (token + next(smile_iter))
            else:
                smile += token
            current_order = None
            prev_node = node_count
            node_count += 1

    return smile, bonding_descrpt, ez_isomer_atoms, attributes



[docs]
def fragment_iter(fragment_str, all_atom=True):
    """
    Iterates over the fragments defined in a CGsmiles fragment string.

    The first and last character of `fragment_str` are dropped
    (presumably the enclosing braces) and the remainder is split at
    every comma. For each entry the first character is skipped, the
    text up to the first ``=`` is used as fragment name and the text
    after it as fragment string. The bonding descriptors are stripped
    from the fragment string and the cleaned string is converted into
    a graph by ``read_fragment_smiles`` or ``read_fragment_cgsmiles``.

    Parameters
    ----------
    fragment_str: str
        the string describing the fragments

    all_atom: bool
        are the fragments all atom according to
        OpenSmiles syntax or CGsmiles

    Yields
    ------
    str, nx.Graph
        the fragment name and the graph built for it
    """
    definitions = fragment_str[1:-1]
    for entry in definitions.split(','):
        split_at = entry.find('=')
        fragname = entry[1:split_at]
        stripped = strip_bonding_descriptors(entry[split_at + 1:])
        smiles_str, bonding_descrpt, ez_isomers, attributes = stripped

        if not all_atom:
            # coarse-grained fragment written in CGsmiles syntax
            graph = read_fragment_cgsmiles(smiles_str,
                                           fragname,
                                           bonding_descrpt,
                                           attributes)
        else:
            # all-atom fragment written in OpenSMILES syntax
            graph = read_fragment_smiles(smiles_str,
                                         fragname,
                                         bonding_descrpt,
                                         ez_isomers,
                                         attributes)
        yield fragname, graph



[docs]
def read_fragments(fragment_str, all_atom=True, fragment_dict=None):
    """
    Collects the fragments defined in a CGsmiles fragment string
    as networkx.Graph and returns a dict of them. Bonding descriptors
    are annotated as node attributes.

    Parameters
    ----------
    fragment_str: str
        string using CGsmiles fragment syntax

    all_atom: bool
        If the fragment strings are all-atom following
        the OpenSmiles syntax. Default is True but if
        set to False fragments follow the CGsmiles
        syntax.

    fragment_dict: dict
        A dict of existing fragments. It is updated in
        place; fragments whose name is already present
        are kept unchanged.

    Returns
    -------
    dict
        a dict of fragments and their name
    """
    known = {} if fragment_dict is None else fragment_dict
    for fragname, mol_graph in fragment_iter(fragment_str, all_atom=all_atom):
        # the first definition of a name wins
        known.setdefault(fragname, mol_graph)
    return known


# ToDos
# - remove special case hydrogen line 327ff
# - check rebuild_h and clean up