in core/indigo-core/molecule/src/smiles_loader.cpp [1363:1811]
void SmilesLoader::_parseMolecule()
{
_cycles.clear();
_atom_stack.clear();
_pending_bonds_pool.clear();
bool first_atom = true;
bool inside_polymer = false;
while (!_scanner.isEOF())
{
int next = _scanner.lookNext();
if ((isspace(next)) || next == '|')
break;
_BondDesc* bond = 0;
bool added_bond = false;
if (!first_atom)
{
while (isdigit(next) || next == '%')
{
int number;
_scanner.skip(1);
if (next == '%')
number = _scanner.readIntFix(2);
else
number = next - '0';
while (_cycles.size() <= number)
_cycles.push().clear();
// closing some previously numbered atom, like the last '1' in c1ccccc1
if (_cycles[number].beg >= 0)
{
_bonds.push(_BondDesc{});
bond = &_bonds.top();
bond->beg = _atom_stack.top();
bond->end = _cycles[number].beg;
_cycles[number].clear();
added_bond = true;
if (_qmol != 0)
bond->index = _qmol->addBond(bond->beg, bond->end, new QueryMolecule::Bond());
_atoms[bond->beg].neighbors.add(bond->end);
_atoms[bond->end].closure(number, bond->beg);
break;
}
// closing some previous pending bond, like the last '1' in C-1=CC=CC=C1'
else if (_cycles[number].pending_bond >= 0)
{
bond = &_bonds[_cycles[number].pending_bond];
bond->end = _atom_stack.top();
added_bond = true;
_atoms[bond->end].neighbors.add(bond->beg);
_atoms[bond->beg].closure(number, bond->end);
if (_qmol != 0)
{
QS_DEF(Array<char>, bond_str);
std::unique_ptr<QueryMolecule::Bond> qbond = std::make_unique<QueryMolecule::Bond>();
bond_str.readString(_pending_bonds_pool.at(_cycles[number].pending_bond_str), false);
_readBond(bond_str, *bond, qbond, smarts_mode);
bond->index = _qmol->addBond(bond->beg, bond->end, qbond.release());
}
_cycles[number].clear();
break;
}
// opening new cycle, like the first '1' in c1ccccc1
else
{
_cycles[number].beg = _atom_stack.top();
_cycles[number].pending_bond = -1;
_atoms[_cycles[number].beg].pending(number);
}
next = _scanner.lookNext();
}
if (added_bond)
continue;
}
if (next == '.')
{
_scanner.skip(1);
if (smarts_mode && _balance == 0)
{
_inside_smarts_component = false;
_atom_stack.clear(); // needed to detect errors like "C.(C)(C)"
}
else
{
if (_atom_stack.size() < 1)
; // we allow misplaced dots because we are so kind
else
_atom_stack.pop();
}
first_atom = true;
continue;
}
if (next == '(')
{
_scanner.skip(1);
if (smarts_mode && first_atom)
{
if (_balance > 0)
throw Error("hierarchical component-level grouping is not allowed");
_current_compno++;
_inside_smarts_component = true;
}
else
{
if (_atom_stack.size() < 1)
throw Error("probably misplaced '('");
_atom_stack.push(_atom_stack.top());
}
_balance++;
continue;
}
if (next == ')')
{
_scanner.skip(1);
if (_balance <= 0)
throw Error("unexpected ')'");
_balance--;
_atom_stack.pop();
continue;
}
if (!first_atom)
{
_bonds.push(_BondDesc{});
bond = &_bonds.top();
bond->beg = _atom_stack.top();
added_bond = true;
}
std::unique_ptr<QueryMolecule::Bond> qbond;
if (added_bond)
{
QS_DEF(Array<char>, bond_str);
bond_str.clear();
while (strchr("-=#:@!;,&~?/\\", next) != NULL)
{
bond_str.push(_scanner.readChar());
next = _scanner.lookNext();
}
if (_qmol != 0)
qbond = std::make_unique<QueryMolecule::Bond>();
// empty bond designator?
if (bond_str.size() < 1)
{
// In SMARTS mode a missing bond symbol is interpreted as "single or aromatic".
// (http://www.daylight.com/dayhtml/doc/theory/theory.smarts.html)
// But if both atoms cannot be aromatic, then empty bond can be just
// replaced to single.
// Such case is processed after
}
else
_readBond(bond_str, *bond, qbond, smarts_mode);
// The bond "directions" are already saved in _BondDesc::dir,
// so we can safely discard them. We are doing that to succeed
// the later check 'pending bond vs. closing bond'.
{
int i;
for (i = 0; i < bond_str.size(); i++)
if (bond_str[i] == '/' || bond_str[i] == '\\')
{
// Aromatic bonds can be a part of cis-trans configuration
// For example in Cn1c2ccccc2c(-c2ccccc2)n/c(=N\O)c1=O or O\N=c1/c(=O)c2ccccc2c(=O)/c/1=N\O
if (_atoms[bond->beg].aromatic && bond_str.size() == 1)
{
// Erase bond type info
bond_str[i] = '?';
bond->type = -1; // single of aromatic
}
else
bond_str[i] = '-';
}
}
if (bond_str.size() > 0)
{
if (isdigit(next) || next == '%')
{
int number;
_scanner.skip(1);
if (next == '%')
number = _scanner.readIntFix(2);
else
number = next - '0';
// closing some previous numbered atom, like the last '1' in C1C=CC=CC=1
if (number >= 0 && number < _cycles.size() && _cycles[number].beg >= 0)
{
bond->end = _cycles[number].beg;
if (_qmol != 0)
bond->index = _qmol->addBond(bond->beg, bond->end, qbond.release());
_atoms[bond->end].closure(number, bond->beg);
_atoms[bond->beg].neighbors.add(bond->end);
_cycles[number].clear();
continue;
}
// closing some previous pending cycle bond, like the last '1' in C=1C=CC=CC=1
else if (number >= 0 && number < _cycles.size() && _cycles[number].pending_bond >= 0)
{
int pending_bond_idx = _cycles[number].pending_bond;
_BondDesc& pending_bond = _bonds[pending_bond_idx];
// transfer direction from closing bond to pending bond
if (bond->dir > 0)
{
if (bond->dir == pending_bond.dir)
{
if (!ignore_closing_bond_direction_mismatch)
throw Error("cycle %d: closing bond direction does not match pending bond direction", number);
}
else
pending_bond.dir = 3 - bond->dir;
}
// apart from the direction, check that the closing bond matches the pending bond
const char* str = _pending_bonds_pool.at(_cycles[number].pending_bond_str);
if (bond_str.size() > 0)
{
if (!ignore_closing_bond_direction_mismatch)
{
if ((int)strlen(str) != bond_str.size() || memcmp(str, bond_str.ptr(), strlen(str)) != 0)
throw Error("cycle %d: closing bond description %.*s does not match pending bond description %s", number, bond_str.size(),
bond_str.ptr(), str);
}
}
else
{
bond_str.readString(str, false);
_readBond(bond_str, *bond, qbond, smarts_mode);
}
if (_qmol != 0)
pending_bond.index = _qmol->addBond(pending_bond.beg, bond->beg, qbond.release());
pending_bond.end = bond->beg;
_atoms[pending_bond.end].neighbors.add(pending_bond.beg);
_atoms[pending_bond.beg].closure(number, pending_bond.end);
// forget the closing bond but move its index here
// Bond order should correspons to atoms order
// Bond order for the following two molecules should be the same
// because later we add cis constraint:
// CCSc1nnc2c(OC=Nc3ccccc-23)n1 |c:9|
// CCSc1nnc-2c(OC=Nc3ccccc-23)n1 |c:9|
// Without this shift the 9th bond in the second structure is not double
_bonds.top() = pending_bond;
_bonds.remove(pending_bond_idx);
for (int i = 0; i < _cycles.size(); i++)
if (_cycles[i].pending_bond >= pending_bond_idx)
--_cycles[i].pending_bond;
_cycles[number].clear();
continue;
}
// opening some pending cycle bond, like the first '1' in C=1C=CC=CC=1
else
{
while (_cycles.size() <= number)
_cycles.push().clear();
_cycles[number].pending_bond = _bonds.size() - 1;
_cycles[number].pending_bond_str = _pending_bonds_pool.add(bond_str);
_cycles[number].beg = -1; // have it already in the bond
_atoms[bond->beg].pending(number);
continue;
}
}
}
}
_AtomDesc& atom = _atoms.push(_neipool);
std::unique_ptr<QueryMolecule::Atom> qatom;
if (_qmol != 0)
qatom = std::make_unique<QueryMolecule::Atom>();
if (added_bond)
bond->end = _atoms.size() - 1;
QS_DEF(Array<char>, atom_str);
atom_str.clear();
bool brackets = false;
if (next == '[')
{
_scanner.skip(1);
int cnt = 1;
while (1)
{
if (_scanner.isEOF())
throw Error("'[' without a ']'");
char c = _scanner.readChar();
if (c == '[')
cnt++;
else if (c == ']')
{
cnt--;
if (cnt == 0)
break;
}
atom_str.push(c);
}
brackets = true;
}
else if (next == -1)
throw Error("unexpected end of input");
else
{
_scanner.skip(1);
atom_str.push(static_cast<char>(next));
if (next == 'B' && _scanner.lookNext() == 'r')
atom_str.push(_scanner.readChar());
else if (next == 'C' && _scanner.lookNext() == 'l')
atom_str.push(_scanner.readChar());
}
_readAtom(atom_str, brackets, atom, qatom, smarts_mode, inside_rsmiles);
atom.brackets = brackets;
if (_qmol != 0)
{
_qmol->addAtom(qatom.release());
if (added_bond)
bond->index = _qmol->addBond(bond->beg, bond->end, qbond.release());
}
if (added_bond)
{
_atoms[bond->beg].neighbors.add(bond->end);
_atoms[bond->end].neighbors.add(bond->beg);
_atoms[bond->end].parent = bond->beg;
// when going from a polymer atom, make the new atom belong
// to the same polymer
if (_atoms[bond->beg].polymer_index >= 0)
{
// ... unless it goes from the polymer end
// and is not in braces
if (!_atoms[bond->beg].ends_polymer || (_atom_stack.size() >= 2 && _atom_stack.top() == _atom_stack[_atom_stack.size() - 2]))
_atoms[bond->end].polymer_index = _atoms[bond->beg].polymer_index;
}
}
if (_inside_smarts_component)
{
_qmol->components.expandFill(_atoms.size(), 0);
_qmol->components[_atoms.size() - 1] = _current_compno;
}
if (!first_atom)
_atom_stack.pop();
_atom_stack.push(_atoms.size() - 1);
first_atom = false;
while (_scanner.lookNext() == '{')
_handleCurlyBrace(atom, inside_polymer);
if (inside_polymer)
atom.polymer_index = _polymer_repetitions.size() - 1;
}
if (smarts_mode)
{
// SMARTS mode: process only empty bonds, and replace them with single or aromatic
for (int i = 0; i < _bonds.size(); i++)
{
int index = _bonds[i].index;
QueryMolecule::Bond& qbond = _qmol->getBond(index);
if (qbond.type != QueryMolecule::OP_NONE || _bonds[i].type == _ANY_BOND)
continue;
// A missing bond symbol is interpreted as "single or aromatic".
// (http://www.daylight.com/dayhtml/doc/theory/theory.smarts.html)
// But if an atom cannot be aromatic, then bond can be as single
const Edge& edge = _qmol->getEdge(index);
QueryMolecule::Atom& q_beg = _qmol->getAtom(edge.beg);
QueryMolecule::Atom& q_end = _qmol->getAtom(edge.end);
std::unique_ptr<QueryMolecule::Bond> new_qbond = std::make_unique<QueryMolecule::Bond>(QueryMolecule::BOND_ORDER, BOND_SINGLE);
bool beg_can_be_aromatic = q_beg.possibleValue(QueryMolecule::ATOM_AROMATICITY, ATOM_AROMATIC);
bool end_can_be_aromatic = q_end.possibleValue(QueryMolecule::ATOM_AROMATICITY, ATOM_AROMATIC);
int beg_label = _qmol->getAtomNumber(edge.beg);
if (beg_label != -1)
beg_can_be_aromatic &= Element::canBeAromatic(beg_label);
int end_label = _qmol->getAtomNumber(edge.end);
if (end_label != -1)
end_can_be_aromatic &= Element::canBeAromatic(end_label);
if (beg_can_be_aromatic && end_can_be_aromatic)
{
new_qbond.reset(QueryMolecule::Bond::oder(new_qbond.release(), new QueryMolecule::Bond(QueryMolecule::BOND_ORDER, BOND_AROMATIC)));
}
_qmol->resetBond(index, new_qbond.release());
}
}
int i;
for (i = 0; i < _cycles.size(); i++)
{
if (_cycles[i].beg >= 0)
throw Error("cycle %d not closed", i);
}
if (inside_polymer)
throw Error("polymer not closed");
}