void SmilesLoader::_parseMolecule()

in core/indigo-core/molecule/src/smiles_loader.cpp [1363:1811]


void SmilesLoader::_parseMolecule()
{
    _cycles.clear();
    _atom_stack.clear();
    _pending_bonds_pool.clear();

    bool first_atom = true;
    bool inside_polymer = false;

    while (!_scanner.isEOF())
    {
        int next = _scanner.lookNext();

        if ((isspace(next)) || next == '|')
            break;

        _BondDesc* bond = 0;
        bool added_bond = false;

        if (!first_atom)
        {
            while (isdigit(next) || next == '%')
            {
                int number;

                _scanner.skip(1);
                if (next == '%')
                    number = _scanner.readIntFix(2);
                else
                    number = next - '0';

                while (_cycles.size() <= number)
                    _cycles.push().clear();

                // closing some previously numbered atom, like the last '1' in c1ccccc1
                if (_cycles[number].beg >= 0)
                {
                    _bonds.push(_BondDesc{});
                    bond = &_bonds.top();
                    bond->beg = _atom_stack.top();
                    bond->end = _cycles[number].beg;
                    _cycles[number].clear();
                    added_bond = true;

                    if (_qmol != 0)
                        bond->index = _qmol->addBond(bond->beg, bond->end, new QueryMolecule::Bond());

                    _atoms[bond->beg].neighbors.add(bond->end);
                    _atoms[bond->end].closure(number, bond->beg);

                    break;
                }
                // closing some previous pending bond, like the last '1' in C-1=CC=CC=C1'
                else if (_cycles[number].pending_bond >= 0)
                {
                    bond = &_bonds[_cycles[number].pending_bond];
                    bond->end = _atom_stack.top();
                    added_bond = true;
                    _atoms[bond->end].neighbors.add(bond->beg);
                    _atoms[bond->beg].closure(number, bond->end);

                    if (_qmol != 0)
                    {
                        QS_DEF(Array<char>, bond_str);
                        std::unique_ptr<QueryMolecule::Bond> qbond = std::make_unique<QueryMolecule::Bond>();

                        bond_str.readString(_pending_bonds_pool.at(_cycles[number].pending_bond_str), false);
                        _readBond(bond_str, *bond, qbond, smarts_mode);
                        bond->index = _qmol->addBond(bond->beg, bond->end, qbond.release());
                    }

                    _cycles[number].clear();

                    break;
                }
                // opening new cycle, like the first '1' in c1ccccc1
                else
                {
                    _cycles[number].beg = _atom_stack.top();
                    _cycles[number].pending_bond = -1;
                    _atoms[_cycles[number].beg].pending(number);
                }
                next = _scanner.lookNext();
            }
            if (added_bond)
                continue;
        }

        if (next == '.')
        {
            _scanner.skip(1);

            if (smarts_mode && _balance == 0)
            {
                _inside_smarts_component = false;
                _atom_stack.clear(); // needed to detect errors like "C.(C)(C)"
            }
            else
            {
                if (_atom_stack.size() < 1)
                    ; // we allow misplaced dots because we are so kind
                else
                    _atom_stack.pop();
            }
            first_atom = true;
            continue;
        }

        if (next == '(')
        {
            _scanner.skip(1);

            if (smarts_mode && first_atom)
            {
                if (_balance > 0)
                    throw Error("hierarchical component-level grouping is not allowed");
                _current_compno++;
                _inside_smarts_component = true;
            }
            else
            {
                if (_atom_stack.size() < 1)
                    throw Error("probably misplaced '('");
                _atom_stack.push(_atom_stack.top());
            }

            _balance++;
            continue;
        }

        if (next == ')')
        {
            _scanner.skip(1);

            if (_balance <= 0)
                throw Error("unexpected ')'");

            _balance--;

            _atom_stack.pop();

            continue;
        }

        if (!first_atom)
        {
            _bonds.push(_BondDesc{});
            bond = &_bonds.top();
            bond->beg = _atom_stack.top();
            added_bond = true;
        }

        std::unique_ptr<QueryMolecule::Bond> qbond;

        if (added_bond)
        {
            QS_DEF(Array<char>, bond_str);

            bond_str.clear();
            while (strchr("-=#:@!;,&~?/\\", next) != NULL)
            {
                bond_str.push(_scanner.readChar());
                next = _scanner.lookNext();
            }

            if (_qmol != 0)
                qbond = std::make_unique<QueryMolecule::Bond>();

            // empty bond designator?
            if (bond_str.size() < 1)
            {
                // In SMARTS mode a missing bond symbol is interpreted as "single or aromatic".
                // (http://www.daylight.com/dayhtml/doc/theory/theory.smarts.html)
                // But if both atoms cannot be aromatic, then empty bond can be just
                // replaced to single.
                // Such case is processed after
            }
            else
                _readBond(bond_str, *bond, qbond, smarts_mode);

            // The bond "directions" are already saved in _BondDesc::dir,
            // so we can safely discard them. We are doing that to succeed
            // the later check 'pending bond vs. closing bond'.
            {
                int i;

                for (i = 0; i < bond_str.size(); i++)
                    if (bond_str[i] == '/' || bond_str[i] == '\\')
                    {
                        // Aromatic bonds can be a part of cis-trans configuration
                        // For example in Cn1c2ccccc2c(-c2ccccc2)n/c(=N\O)c1=O or O\N=c1/c(=O)c2ccccc2c(=O)/c/1=N\O
                        if (_atoms[bond->beg].aromatic && bond_str.size() == 1)
                        {
                            // Erase bond type info
                            bond_str[i] = '?';
                            bond->type = -1; // single of aromatic
                        }
                        else
                            bond_str[i] = '-';
                    }
            }

            if (bond_str.size() > 0)
            {
                if (isdigit(next) || next == '%')
                {
                    int number;
                    _scanner.skip(1);

                    if (next == '%')
                        number = _scanner.readIntFix(2);
                    else
                        number = next - '0';

                    // closing some previous numbered atom, like the last '1' in C1C=CC=CC=1
                    if (number >= 0 && number < _cycles.size() && _cycles[number].beg >= 0)
                    {
                        bond->end = _cycles[number].beg;

                        if (_qmol != 0)
                            bond->index = _qmol->addBond(bond->beg, bond->end, qbond.release());

                        _atoms[bond->end].closure(number, bond->beg);
                        _atoms[bond->beg].neighbors.add(bond->end);

                        _cycles[number].clear();
                        continue;
                    }
                    // closing some previous pending cycle bond, like the last '1' in C=1C=CC=CC=1
                    else if (number >= 0 && number < _cycles.size() && _cycles[number].pending_bond >= 0)
                    {
                        int pending_bond_idx = _cycles[number].pending_bond;
                        _BondDesc& pending_bond = _bonds[pending_bond_idx];

                        // transfer direction from closing bond to pending bond
                        if (bond->dir > 0)
                        {
                            if (bond->dir == pending_bond.dir)
                            {
                                if (!ignore_closing_bond_direction_mismatch)
                                    throw Error("cycle %d: closing bond direction does not match pending bond direction", number);
                            }
                            else
                                pending_bond.dir = 3 - bond->dir;
                        }

                        // apart from the direction, check that the closing bond matches the pending bond
                        const char* str = _pending_bonds_pool.at(_cycles[number].pending_bond_str);

                        if (bond_str.size() > 0)
                        {
                            if (!ignore_closing_bond_direction_mismatch)
                            {
                                if ((int)strlen(str) != bond_str.size() || memcmp(str, bond_str.ptr(), strlen(str)) != 0)
                                    throw Error("cycle %d: closing bond description %.*s does not match pending bond description %s", number, bond_str.size(),
                                                bond_str.ptr(), str);
                            }
                        }
                        else
                        {
                            bond_str.readString(str, false);
                            _readBond(bond_str, *bond, qbond, smarts_mode);
                        }

                        if (_qmol != 0)
                            pending_bond.index = _qmol->addBond(pending_bond.beg, bond->beg, qbond.release());

                        pending_bond.end = bond->beg;
                        _atoms[pending_bond.end].neighbors.add(pending_bond.beg);
                        _atoms[pending_bond.beg].closure(number, pending_bond.end);

                        // forget the closing bond but move its index here
                        // Bond order should correspons to atoms order
                        // Bond order for the following two molecules should be the same
                        // because later we add cis constraint:
                        //    CCSc1nnc2c(OC=Nc3ccccc-23)n1 |c:9|
                        //    CCSc1nnc-2c(OC=Nc3ccccc-23)n1 |c:9|
                        // Without this shift the 9th bond in the second structure is not double
                        _bonds.top() = pending_bond;
                        _bonds.remove(pending_bond_idx);
                        for (int i = 0; i < _cycles.size(); i++)
                            if (_cycles[i].pending_bond >= pending_bond_idx)
                                --_cycles[i].pending_bond;

                        _cycles[number].clear();
                        continue;
                    }
                    // opening some pending cycle bond, like the first '1' in C=1C=CC=CC=1
                    else
                    {
                        while (_cycles.size() <= number)
                            _cycles.push().clear();
                        _cycles[number].pending_bond = _bonds.size() - 1;
                        _cycles[number].pending_bond_str = _pending_bonds_pool.add(bond_str);
                        _cycles[number].beg = -1; // have it already in the bond
                        _atoms[bond->beg].pending(number);

                        continue;
                    }
                }
            }
        }

        _AtomDesc& atom = _atoms.push(_neipool);

        std::unique_ptr<QueryMolecule::Atom> qatom;

        if (_qmol != 0)
            qatom = std::make_unique<QueryMolecule::Atom>();

        if (added_bond)
            bond->end = _atoms.size() - 1;

        QS_DEF(Array<char>, atom_str);

        atom_str.clear();

        bool brackets = false;

        if (next == '[')
        {
            _scanner.skip(1);
            int cnt = 1;

            while (1)
            {
                if (_scanner.isEOF())
                    throw Error("'[' without a ']'");
                char c = _scanner.readChar();
                if (c == '[')
                    cnt++;
                else if (c == ']')
                {
                    cnt--;
                    if (cnt == 0)
                        break;
                }
                atom_str.push(c);
            }
            brackets = true;
        }
        else if (next == -1)
            throw Error("unexpected end of input");
        else
        {
            _scanner.skip(1);
            atom_str.push(static_cast<char>(next));
            if (next == 'B' && _scanner.lookNext() == 'r')
                atom_str.push(_scanner.readChar());
            else if (next == 'C' && _scanner.lookNext() == 'l')
                atom_str.push(_scanner.readChar());
        }

        _readAtom(atom_str, brackets, atom, qatom, smarts_mode, inside_rsmiles);
        atom.brackets = brackets;

        if (_qmol != 0)
        {
            _qmol->addAtom(qatom.release());

            if (added_bond)
                bond->index = _qmol->addBond(bond->beg, bond->end, qbond.release());
        }

        if (added_bond)
        {
            _atoms[bond->beg].neighbors.add(bond->end);
            _atoms[bond->end].neighbors.add(bond->beg);
            _atoms[bond->end].parent = bond->beg;
            // when going from a polymer atom, make the new atom belong
            // to the same polymer
            if (_atoms[bond->beg].polymer_index >= 0)
            {
                // ... unless it goes from the polymer end
                // and is not in braces
                if (!_atoms[bond->beg].ends_polymer || (_atom_stack.size() >= 2 && _atom_stack.top() == _atom_stack[_atom_stack.size() - 2]))
                    _atoms[bond->end].polymer_index = _atoms[bond->beg].polymer_index;
            }
        }

        if (_inside_smarts_component)
        {
            _qmol->components.expandFill(_atoms.size(), 0);
            _qmol->components[_atoms.size() - 1] = _current_compno;
        }

        if (!first_atom)
            _atom_stack.pop();
        _atom_stack.push(_atoms.size() - 1);
        first_atom = false;

        while (_scanner.lookNext() == '{')
            _handleCurlyBrace(atom, inside_polymer);
        if (inside_polymer)
            atom.polymer_index = _polymer_repetitions.size() - 1;
    }

    if (smarts_mode)
    {
        // SMARTS mode: process only empty bonds, and replace them with single or aromatic
        for (int i = 0; i < _bonds.size(); i++)
        {
            int index = _bonds[i].index;

            QueryMolecule::Bond& qbond = _qmol->getBond(index);

            if (qbond.type != QueryMolecule::OP_NONE || _bonds[i].type == _ANY_BOND)
                continue;

            // A missing bond symbol is interpreted as "single or aromatic".
            // (http://www.daylight.com/dayhtml/doc/theory/theory.smarts.html)
            // But if an atom cannot be aromatic, then bond can be as single
            const Edge& edge = _qmol->getEdge(index);
            QueryMolecule::Atom& q_beg = _qmol->getAtom(edge.beg);
            QueryMolecule::Atom& q_end = _qmol->getAtom(edge.end);

            std::unique_ptr<QueryMolecule::Bond> new_qbond = std::make_unique<QueryMolecule::Bond>(QueryMolecule::BOND_ORDER, BOND_SINGLE);

            bool beg_can_be_aromatic = q_beg.possibleValue(QueryMolecule::ATOM_AROMATICITY, ATOM_AROMATIC);
            bool end_can_be_aromatic = q_end.possibleValue(QueryMolecule::ATOM_AROMATICITY, ATOM_AROMATIC);

            int beg_label = _qmol->getAtomNumber(edge.beg);
            if (beg_label != -1)
                beg_can_be_aromatic &= Element::canBeAromatic(beg_label);

            int end_label = _qmol->getAtomNumber(edge.end);
            if (end_label != -1)
                end_can_be_aromatic &= Element::canBeAromatic(end_label);

            if (beg_can_be_aromatic && end_can_be_aromatic)
            {
                new_qbond.reset(QueryMolecule::Bond::oder(new_qbond.release(), new QueryMolecule::Bond(QueryMolecule::BOND_ORDER, BOND_AROMATIC)));
            }

            _qmol->resetBond(index, new_qbond.release());
        }
    }

    int i;

    for (i = 0; i < _cycles.size(); i++)
    {
        if (_cycles[i].beg >= 0)
            throw Error("cycle %d not closed", i);
    }

    if (inside_polymer)
        throw Error("polymer not closed");
}