in core/indigo-core/molecule/src/smiles_loader.cpp [288:1338]
// Reads the feature sections that follow a SMILES string (ChemAxon Extended
// SMILES style, as referenced by the comments below) and applies each of them
// to the molecule being loaded. Sections are consumed one after another until
// a closing '|' is read. Every section starts with a short tag:
//   w / wU / wD   : wiggly, up and down bonds ("any" stereocenters, ignored cis-trans bonds)
//   a, o<n>, &<n> : absolute / OR-group / AND-group stereocenters
//   ^<n>          : radicals
//   $...$         : per-atom labels (R-sites, attachment points, pseudo-atoms, aliases, special query atoms)
//   c / t         : cis / trans double bonds
//   (...)         : atom coordinates
//   ha / hb       : highlighted atoms / bonds
//   rb            : ring bond count constraints (queries only)
//   r             : stereocenters are relative instead of absolute
//   Sg / SgD      : S-groups
//   RG            : R-group fragments and R-group logic
//   u, s          : unsaturation and substitution count constraints (queries only)
// A character that starts none of these sections is read and ignored.
// Throws Error when a section is malformed.
void SmilesLoader::_readOtherStuff()
{
    MoleculeCisTrans& cis_trans = _bmol->cis_trans;
    // Atoms that only stand for attachment points; they are removed after the whole block has been parsed.
    QS_DEF(Array<int>, to_remove);
    // Atoms listed in an 'a:' section; a later 'r' flag must not turn them into AND stereocenters.
    std::unordered_set<int> _overtly_defined_abs;
    to_remove.clear();
    while (1)
    {
        char c = _scanner.readChar();
        if (c == '|')
            break;
        if (c == 'w') // 'ANY' stereocenters
        {
            // wmode stays 0 for a plain 'w' (wiggly bond), otherwise it is 'U' (up) or 'D' (down)
            char wmode = 0;
            if (_scanner.lookNext() == 'U')
                wmode = 'U';
            if (_scanner.lookNext() == 'D')
                wmode = 'D';
            if (wmode)
                _scanner.skip(1);
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'w%c'", wmode);
            // Comma-separated list of "atom" or "atom.bond" entries
            while (isdigit(_scanner.lookNext()))
            {
                int atom_idx = _scanner.readUnsigned();
                // handle wiggly bonds
                if (!wmode)
                {
                    // This either bond can mark stereocenter or cis-trans double bond
                    // For example CC=CN |w:1.0|
                    const Vertex& v = _bmol->getVertex(atom_idx);
                    bool found = false;
                    for (int nei : v.neighbors())
                    {
                        int edge_idx = v.neiEdge(nei);
                        if (_bmol->getBondOrder(edge_idx) == BOND_DOUBLE && _bmol->getBondTopology(edge_idx) != TOPOLOGY_RING)
                        {
                            cis_trans.ignore(edge_idx);
                            found = true;
                        }
                    }
                    // No non-ring double bond at this atom: treat the wiggly bond as an 'any' stereocenter
                    if (!found)
                    {
                        if (_bmol->isPossibleStereocenter(atom_idx))
                        {
                            // Check if the stereocenter has already been marked as any
                            // For example [H]C1(O)c2ccnn2[C@@H](O)c2ccnn12 |r,w:1.0,1.1|
                            if (!_bmol->stereocenters.exists(atom_idx))
                                _bmol->addStereocenters(atom_idx, MoleculeStereocenters::ATOM_ANY, 0, false);
                        }
                    }
                }
                if (_scanner.lookNext() == '.')
                {
                    _scanner.skip(1);
                    auto bond_idx = _scanner.readUnsigned();
                    // Validate the bond index before it is used for any lookup;
                    // out-of-range entries are skipped.
                    const bool bond_in_range = bond_idx < _bmol->edgeCount();
                    if (bond_in_range && !_has_directions_on_rings)
                        _has_directions_on_rings = _bmol->getBondTopology(bond_idx) == TOPOLOGY_RING;
                    if (bond_in_range && atom_idx < _bmol->vertexCount())
                    {
                        auto& v = _bmol->getEdge(bond_idx);
                        // The direction is stored relative to the bond's begin atom, so make atom_idx the begin.
                        // NOTE(review): relies on 'v' reflecting the swap performed by swapEdgeEnds() - confirm.
                        if (v.end == atom_idx)
                            _bmol->swapEdgeEnds(bond_idx);
                        if (v.beg == atom_idx)
                            _bmol->setBondDirection(bond_idx, wmode == 'U' ? BOND_UP : (wmode == 'D' ? BOND_DOWN : BOND_EITHER));
                    }
                }
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == 'a') // 'ABS' stereocenters
        {
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'a'");
            while (isdigit(_scanner.lookNext()))
            {
                int idx = _scanner.readUnsigned();
                if (_bmol->stereocenters.exists(idx))
                {
                    _bmol->stereocenters.setType(idx, MoleculeStereocenters::ATOM_ABS, 0);
                }
                else
                {
                    // Not a known stereocenter yet: register it and flag it as non-tetrahedral
                    _bmol->addStereocenters(idx, MoleculeStereocenters::ATOM_ABS, 0, false);
                    _bmol->stereocenters.setTetrahydral(idx, false);
                }
                _overtly_defined_abs.insert(idx);
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == 'o') // 'OR' stereocenters
        {
            // The group number precedes the colon: o<group>:<atoms>
            int groupno = _scanner.readUnsigned();
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'o'");
            while (isdigit(_scanner.lookNext()))
            {
                int idx = _scanner.readUnsigned();
                if (_bmol->stereocenters.exists(idx))
                    _bmol->stereocenters.setType(idx, MoleculeStereocenters::ATOM_OR, groupno);
                else
                {
                    _bmol->addStereocenters(idx, MoleculeStereocenters::ATOM_OR, groupno, false);
                    _bmol->stereocenters.setTetrahydral(idx, false);
                }
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == '&') // 'AND' stereocenters
        {
            // The group number precedes the colon: &<group>:<atoms>
            int groupno = _scanner.readUnsigned();
            if (_scanner.readChar() != ':')
                throw Error("colon expected after '&'");
            while (isdigit(_scanner.lookNext()))
            {
                int idx = _scanner.readUnsigned();
                if (_bmol->stereocenters.exists(idx))
                    _bmol->stereocenters.setType(idx, MoleculeStereocenters::ATOM_AND, groupno);
                else
                {
                    _bmol->addStereocenters(idx, MoleculeStereocenters::ATOM_AND, groupno, false);
                    _bmol->stereocenters.setTetrahydral(idx, false);
                }
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == '^') // radicals
        {
            // Single-digit radical code: 1 -> doublet, 3 -> singlet, 4 -> triplet; anything else is rejected
            int rad = _scanner.readIntFix(1);
            int radical;
            if (rad == 1)
                radical = RADICAL_DOUBLET;
            else if (rad == 3)
                radical = RADICAL_SINGLET;
            else if (rad == 4)
                radical = RADICAL_TRIPLET;
            else
                throw Error("unsupported radical number: %d", rad);
            if (_scanner.readChar() != ':')
                throw Error("colon expected after radical number");
            while (isdigit(_scanner.lookNext()))
            {
                int idx = _scanner.readUnsigned();
                if (_mol != 0)
                    _mol->setAtomRadical(idx, radical);
                else
                    // Query molecule: AND a radical constraint onto the existing atom constraint
                    _qmol->resetAtom(idx, QueryMolecule::Atom::und(_qmol->releaseAtom(idx), new QueryMolecule::Atom(QueryMolecule::ATOM_RADICAL, radical)));
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == '$') // pseudoatoms
        {
            QS_DEF(Array<char>, label);
            // One ';'-separated label (possibly empty) is expected per atom; the list is closed by '$'
            for (int i = _bmol->vertexBegin(); i != _bmol->vertexEnd(); i = _bmol->vertexNext(i))
            {
                label.clear();
                while (1)
                {
                    if (_scanner.isEOF())
                        throw Error("end of input while reading $...$ block");
                    c = _scanner.readChar();
                    if (c == ';' || c == '$')
                        break;
                    label.push(c);
                }
                // The closing '$' must come with the last atom and ';' with every other one
                if (c == '$' && i != _bmol->vertexEnd() - 1)
                    throw Error("only %d atoms found in pseudo-atoms $...$ block", i + 1);
                if (c == ';' && i == _bmol->vertexEnd() - 1)
                    throw Error("extra ';' in pseudo-atoms $...$ block");
                if (label.size() > 0)
                {
                    // From here on label.size() includes the terminating zero
                    label.push(0);
                    int rnum;
                    if (label.size() > 3 && strncmp(label.ptr(), "_R", 2) == 0 && sscanf(label.ptr() + 2, "%d", &rnum) == 1)
                    {
                        // ChemAxon's Extended SMILES notation for R-sites
                        if (_qmol != 0)
                            _qmol->resetAtom(i, new QueryMolecule::Atom(QueryMolecule::ATOM_RSITE, 0));
                        _bmol->allowRGroupOnRSite(i, rnum);
                        // check multiple R-sites notation
                        BufferScanner strscan(label.ptr());
                        QS_DEF(Array<char>, word);
                        while (!strscan.isEOF())
                        {
                            strscan.skip(1);
                            strscan.readWord(word, ",;");
                            if (word.size() >= 3 && strncmp(word.ptr(), "_R", 2) == 0 && sscanf(word.ptr() + 2, "%d", &rnum) == 1)
                                _bmol->allowRGroupOnRSite(i, rnum);
                        }
                    }
                    else if (label.size() > 4 && strncmp(label.ptr(), "_AP", 3) == 0 && sscanf(label.ptr() + 3, "%d", &rnum) == 1)
                    {
                        // That is ChemAxon's Extended SMILES notation for attachment
                        // points. We mark the atom for removal and place attachment point
                        // markers on its neighbors.
                        int k;
                        const Vertex& v = _bmol->getVertex(i);
                        for (k = v.neiBegin(); k != v.neiEnd(); k = v.neiNext(k))
                            _bmol->addAttachmentPoint(rnum, v.neiVertex(k));
                        to_remove.push(i);
                    }
                    else
                    {
                        // That is ChemAxon's Extended SMILES notation for pseudoatoms and
                        // special atoms A,Q,X,M and AH,QH,XH,MH
                        if (label.size() > 3 &&
                            (strncmp(label.ptr() + label.size() - 3, "_p", 2) == 0 || strncmp(label.ptr() + label.size() - 3, "_e", 2) == 0))
                        {
                            // Strip the trailing "_p" / "_e" suffix and re-terminate the label
                            label.pop();
                            label.pop();
                            label.pop();
                            label.push(0);
                        }
                        if (_mol != 0)
                        {
                            // Atoms with a regular element number keep it and get the label as an alias;
                            // everything else becomes a pseudo-atom
                            const auto atomNumber = _mol->getAtomNumber(i);
                            if (ELEM_MIN < atomNumber && atomNumber < ELEM_MAX)
                            {
                                _mol->setAlias(i, label.ptr());
                            }
                            else
                            {
                                _mol->setPseudoAtom(i, label.ptr());
                            }
                        }
                        else
                        {
                            // Query molecule. The size checks below count the terminating zero,
                            // so size 2 is a one-character label and size 3 a two-character label.
                            if (label.size() == 2 && label[0] == 'Q')
                            {
                                // Q: neither hydrogen nor carbon. The previous atom constraint is
                                // released into 'atom' and discarded when 'atom' goes out of scope.
                                std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                _qmol->resetAtom(
                                    i, QueryMolecule::Atom::und(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_H)),
                                                                QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_C))));
                            }
                            else if (label.size() == 3 && label[0] == 'Q' && label[1] == 'H')
                            {
                                // QH: not carbon
                                std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                _qmol->resetAtom(i, QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_C)));
                            }
                            else if (label.size() == 3 && label[0] == 'A' && label[1] == 'H')
                            {
                                // AH: replaced by an atom node of type OP_NONE
                                std::unique_ptr<QueryMolecule::Atom> x_atom = std::make_unique<QueryMolecule::Atom>();
                                x_atom->type = QueryMolecule::OP_NONE;
                                _qmol->resetAtom(i, x_atom.release());
                            }
                            else if (label.size() == 2 && label[0] == 'X')
                            {
                                // X: F, Cl, Br, I or At
                                std::unique_ptr<QueryMolecule::Atom> x_atom = std::make_unique<QueryMolecule::Atom>();
                                x_atom->type = QueryMolecule::OP_OR;
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_F));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Cl));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Br));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_I));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_At));
                                std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                _qmol->resetAtom(i, x_atom.release());
                            }
                            else if (label.size() == 3 && label[0] == 'X' && label[1] == 'H')
                            {
                                // XH: same list as X plus hydrogen
                                std::unique_ptr<QueryMolecule::Atom> x_atom = std::make_unique<QueryMolecule::Atom>();
                                x_atom->type = QueryMolecule::OP_OR;
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_F));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Cl));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Br));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_I));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_At));
                                x_atom->children.add(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_H));
                                std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                _qmol->resetAtom(i, x_atom.release());
                            }
                            else if (label.size() == 2 && label[0] == 'M')
                            {
                                // M: none of the elements listed below (the list includes hydrogen)
                                std::unique_ptr<QueryMolecule::Atom> x_atom = std::make_unique<QueryMolecule::Atom>();
                                x_atom->type = QueryMolecule::OP_AND;
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_C)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_N)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_O)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_F)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_P)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_S)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Cl)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Se)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Br)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_I)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_At)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_He)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Ne)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Ar)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Kr)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Xe)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Rn)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_H)));
                                std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                _qmol->resetAtom(i, x_atom.release());
                            }
                            else if (label.size() == 3 && label[0] == 'M' && label[1] == 'H')
                            {
                                // MH: same exclusion list as M, but hydrogen is not excluded
                                std::unique_ptr<QueryMolecule::Atom> x_atom = std::make_unique<QueryMolecule::Atom>();
                                x_atom->type = QueryMolecule::OP_AND;
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_C)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_N)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_O)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_F)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_P)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_S)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Cl)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Se)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Br)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_I)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_At)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_He)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Ne)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Ar)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Kr)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Xe)));
                                x_atom->children.add(QueryMolecule::Atom::nicht(new QueryMolecule::Atom(QueryMolecule::ATOM_NUMBER, ELEM_Rn)));
                                std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                _qmol->resetAtom(i, x_atom.release());
                            }
                            else
                            {
                                // Any other label: alias for atoms with a regular element number,
                                // otherwise a pseudo-atom constraint ANDed onto the existing atom
                                const auto atomNumber = _qmol->getAtomNumber(i);
                                if (ELEM_MIN < atomNumber && atomNumber < ELEM_MAX)
                                {
                                    _qmol->setAlias(i, label.ptr());
                                }
                                else
                                {
                                    std::unique_ptr<QueryMolecule::Atom> atom(_qmol->releaseAtom(i));
                                    atom->removeConstraints(QueryMolecule::ATOM_NUMBER);
                                    _qmol->resetAtom(
                                        i, QueryMolecule::Atom::und(atom.release(), new QueryMolecule::Atom(QueryMolecule::ATOM_PSEUDO, label.ptr())));
                                }
                            }
                        }
                    }
                }
            }
        }
        else if (c == 'c' || c == 't') // CIS and TRANS bonds
        {
            if (_scanner.readChar() != ':')
                throw Error("colon expected after '%c' identifier", c);
            while (isdigit(_scanner.lookNext()))
            {
                // idx is a position in _bonds; the molecule bond index is _bonds[idx].index
                int idx = _scanner.readUnsigned();
                bool skip = false;
                // With ignore_cistrans_errors set, bonds that are not geometric stereo bonds are silently skipped
                if (ignore_cistrans_errors && !MoleculeCisTrans::isGeomStereoBond(*_bmol, _bonds[idx].index, nullptr, false))
                    skip = true;
                if (!skip)
                {
                    _bmol->restoreSubstituents(_bonds[idx].index);
                    const int* subst = _bmol->cis_trans.getSubstituents(_bonds[idx].index);
                    int parity = ((c == 'c') ? MoleculeCisTrans::CIS : MoleculeCisTrans::TRANS);
                    /* CXSmiles doc says:
                       the double bond has the representation a1-a2=a3-a4, where
                       a1 is the smallest atom index of the generated smiles connected to a2
                       a2 is the double bond smaller atom index in the generated smiles
                       a3 is the double bond larger atom index in the generated smiles
                       a4 is the smallest atom index of the generated smiles connected to a3
                     * We need to know if the calculated substituents' indices are not "smallest"
                     * (i.e. they have other substituent with smaller index on the same side).
                     * In that case, we invert the parity.
                     */
                    if (subst[1] != -1 && subst[1] < subst[0])
                        parity = 3 - parity;
                    if (subst[3] != -1 && subst[3] < subst[2])
                        parity = 3 - parity;
                    _bmol->cis_trans.setParity(_bonds[idx].index, parity);
                }
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == '(') // atom coordinates
        {
            // One record per atom, separated by ';' and closed by ')'.
            // Accepted record forms: "x,y", "x,y," (empty Z) and "x,y,z".
            for (int i = _bmol->vertexBegin(); i != _bmol->vertexEnd(); i = _bmol->vertexNext(i))
            {
                float x, y, z = 0;
                x = _scanner.readFloat();
                if (_scanner.readChar() != ',')
                    throw Error("expected comma after X coordinate");
                y = _scanner.readFloat();
                if (_scanner.lookNext() != ';' && _scanner.lookNext() != ')')
                {
                    if (_scanner.readChar() != ',')
                        throw Error("expected comma after Y coordinate");
                    // An empty Z field leaves z at 0
                    if (_scanner.lookNext() != ';' && _scanner.lookNext() != ')')
                        z = _scanner.readFloat();
                }
                // Consume the single ';' that separates this record from the next one.
                // A ')' is left in place for the check after the loop.
                if (_scanner.lookNext() == ';')
                    _scanner.skip(1);
                _bmol->setAtomXyz(i, x, y, z);
            }
            if (_scanner.readChar() != ')')
                throw Error("expected ')' after coordinates");
            _has_atom_coordinates = true;
        }
        else if (c == 'h') // highlighting (Indigo's own extension)
        {
            c = _scanner.readChar();
            // 'a' is true for atom highlighting ("ha") and false for bond highlighting ("hb")
            int a = false;
            if (c == 'a')
                a = true;
            else if (c != 'b')
                throw Error("expected 'a' or 'b' after 'h', got '%c'", c);
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'h%c'", a ? 'a' : 'b');
            while (isdigit(_scanner.lookNext()))
            {
                int idx = _scanner.readUnsigned();
                if (a)
                    _bmol->highlightAtom(idx);
                else
                    _bmol->highlightBond(idx);
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == 'r')
        {
            if (_scanner.lookNext() == 'b')
            {
                // "rb:<atom>:<count or '*'>,..." - ring bond count constraints
                if (_qmol == 0)
                    throw Error("'rb' is allowed only within queries");
                _scanner.skip(1);
                if (_scanner.readChar() != ':')
                    throw Error("colon expected after 'rb' identifier");
                while (isdigit(_scanner.lookNext()))
                {
                    // remove 'x' or 'r' configured ATOM_RING_BONDS
                    int atom_idx = _scanner.readUnsigned();
                    QueryMolecule::Atom& atom = _qmol->getAtom(atom_idx);
                    if (atom.hasConstraint(QueryMolecule::ATOM_RING_BONDS))
                        atom.removeConstraints(QueryMolecule::ATOM_RING_BONDS);
                    if (_scanner.readChar() != ':')
                        throw Error("colon expected after 'rb:n'");
                    if (_scanner.lookNext() == '*')
                    {
                        // '*': use the number of ring bonds the atom has in the loaded structure
                        _scanner.skip(1);
                        int rbonds = 0;
                        const Vertex& vertex = _qmol->getVertex(atom_idx);
                        for (int k = vertex.neiBegin(); k != vertex.neiEnd(); k = vertex.neiNext(k))
                            if (_qmol->getEdgeTopology(vertex.neiEdge(k)) == TOPOLOGY_RING)
                                rbonds++;
                        _qmol->resetAtom(atom_idx, QueryMolecule::Atom::und(_qmol->releaseAtom(atom_idx),
                                                                            new QueryMolecule::Atom(QueryMolecule::ATOM_RING_BONDS_AS_DRAWN, rbonds)));
                    }
                    else
                    {
                        int rbcount = _scanner.readUnsigned();
                        if (rbcount)
                        {
                            // NOTE(review): the two values look like a [min, max] range, with counts
                            // of 4 and above left open-ended (max 100) - confirm against QueryMolecule::Atom.
                            _qmol->resetAtom(atom_idx, QueryMolecule::Atom::und(
                                                           _qmol->releaseAtom(atom_idx),
                                                           new QueryMolecule::Atom(QueryMolecule::ATOM_RING_BONDS, rbcount, (rbcount < 4 ? rbcount : 100))));
                        }
                        else
                            _qmol->resetAtom(
                                atom_idx, QueryMolecule::Atom::und(_qmol->releaseAtom(atom_idx), new QueryMolecule::Atom(QueryMolecule::ATOM_RING_BONDS, 0)));
                    }
                    if (_scanner.lookNext() == ',')
                        _scanner.skip(1);
                }
            }
            else
            {
                // All stereocenters are relative instead of abs
                // ABS centers become AND group 1, except those listed in an 'a:' section
                // read earlier in this block, or when ignore_no_chiral_flag is set.
                MoleculeStereocenters& s = _bmol->stereocenters;
                for (int i = s.begin(); i != s.end(); i = s.next(i))
                {
                    int atom = s.getAtomIndex(i);
                    if (s.getType(atom) == MoleculeStereocenters::ATOM_ABS && !ignore_no_chiral_flag &&
                        _overtly_defined_abs.find(atom) == _overtly_defined_abs.end())
                        s.setType(atom, MoleculeStereocenters::ATOM_AND, 1);
                }
            }
        }
        else if ((c == 'S') && (_scanner.lookNext() == 'g'))
        {
            // SGroup block found
            // Trailing fields are optional: each 'continue' below ends this S-group
            // and resumes with the next section of the outer loop.
            _scanner.skip(1);
            int sg_type = -1;
            // Data S-group - 'SgD:atomic_indexes:field_name:data_value:query_op:unit:tag:(coords)'
            // Optional coordinates in parenthesis if necessary, separated by colon characters.
            // The field values with special characters are escaped.
            // If atomic coordinates are exported (with option c ) (-1) is used in the coordinate field for Data S-group attached to the atoms.
            if (_scanner.lookNext() == 'D')
            {
                _scanner.skip(1);
                sg_type = SGroup::SG_TYPE_DAT;
            }
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'Sg'");
            // If not a data S-group - get group type after colon
            //
            // 'Sg:type:atomic_indexes:subscript:superscript:head_bond_indexes:tail_bond_indexes:bracket
            //
            // atomic_indexes - Atom indexes separated with commas
            // subscript - Subscript of the S-group. If the subscript equals the keyword of the S-group this field can be empty. Escaped field.
            // superscript - Superscript of the S-group. Only connectivity and flip information is allowed. This field can be empty. Escaped field.
            // *_bond_indexes - The indexes of bonds that share a common bracket in case of ladder-type polymers.
            // head_bond_indexes - Head crossing bond indexes. This field can be empty.
            // tail_bond_indexes - Tail crossing bond indexes. This field can be empty.
            // bracket - bracket orientation, bracket type followed by the coordinates (4 pair, separated with commas). Bracket orientation
            // can be s or d (single or double), bracket type can be b,c,r,s for braces, chevrons, round and square, respectively.
            // The brackets are written between parentheses and separated with semicolons.
            if (sg_type == -1)
            {
                // Only the types "n" (SRU) and "gen" (generic) are recognized here
                int sg = _scanner.lookNext();
                constexpr size_t sg_type_max_len = 3;
                char pchar_sg_type[sg_type_max_len];
                std::string sg_type_str;
                if (sg == 'n')
                {
                    sg_type = SGroup::SG_TYPE_SRU;
                    _scanner.skip(1);
                    if (_scanner.readChar() != ':')
                        throw Error("colon expected after 'Sg:n'");
                }
                else if (sg == 'g')
                {
                    _scanner.readCharsFix(sizeof(pchar_sg_type), pchar_sg_type);
                    sg_type_str = std::string(pchar_sg_type, sizeof(pchar_sg_type));
                    if (sg_type_str == "gen")
                    {
                        if (_scanner.readChar() != ':')
                            throw Error("colon expected after 'Sg:%s'", sg_type_str.c_str());
                        sg_type = SGroup::SG_TYPE_GEN;
                    }
                    else
                        throw Error("unexpected 'Sg' %s", sg_type_str.c_str());
                }
                else
                {
                    throw Error("Unsupported Sg type");
                }
            }
            int idx = _bmol->sgroups.addSGroup(sg_type);
            auto& sgroup = _bmol->sgroups.getSGroup(idx);
            // add brackets
            // Two zero-initialized bracket entries; bracket coordinates found in the input are not stored (see below)
            Vec2f* p = sgroup.brackets.push();
            p[0].set(0, 0);
            p[1].set(0, 0);
            p = sgroup.brackets.push();
            p[0].set(0, 0);
            p[1].set(0, 0);
            while (isdigit(_scanner.lookNext()))
            {
                auto atom_idx = _scanner.readUnsigned();
                sgroup.atoms.push(atom_idx);
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
            if (_scanner.lookNext() != ':')
                continue;
            _scanner.skip(1); // skip ':'
            const char* word_delimiter = ":,|";
            if (sg_type == SGroup::SG_TYPE_DAT)
            {
                DataSGroup& dsg = static_cast<DataSGroup&>(sgroup);
                // field_name
                _scanner.readWord(dsg.name, word_delimiter);
                if (_scanner.lookNext() != ':') // No more fields
                    continue;
                _scanner.skip(1); // Skip :
                // data_value
                _scanner.readWord(dsg.data, word_delimiter);
                if (_scanner.lookNext() != ':') // No more fields
                    continue;
                _scanner.skip(1); // Skip :
                // query_op
                _scanner.readWord(dsg.queryoper, word_delimiter);
                if (_scanner.lookNext() != ':') // No more fields
                    continue;
                _scanner.skip(1); // Skip :
                // unit
                _scanner.readWord(dsg.description, word_delimiter);
                if (_scanner.lookNext() != ':') // No more fields
                    continue;
                _scanner.skip(1); // Skip :
                // tag
                int next = _scanner.lookNext();
                if (next > 0 && next != ':' && next != ',')
                {
                    dsg.tag = static_cast<char>(next);
                    _scanner.skip(1); // Skip tag
                }
                if (_scanner.lookNext() != ':') // No more fields
                    continue;
                _scanner.skip(1); // Skip :
                // (coords)
                if (_scanner.lookNext() != '(') // No more fields
                    continue;
                // Remember the position so that the probe for "(-1)" can be undone
                long long pos = _scanner.tell();
                constexpr char minus1[] = "(-1)";
                constexpr size_t minus1_len = sizeof(minus1) - 1;
                if (_scanner.length() - pos >= minus1_len)
                {
                    // check for (-1)
                    char buf[minus1_len];
                    _scanner.read(minus1_len, buf);
                    if (strncmp(buf, minus1, sizeof(buf)) == 0)
                        continue;
                    _scanner.seek(pos, SEEK_SET);
                }
                _scanner.skip(1); // Skip (
                dsg.display_pos.x = _scanner.readFloat();
                c = _scanner.readChar();
                if (c != ',')
                    throw Error("Data S-group coord error");
                dsg.display_pos.y = _scanner.readFloat();
                c = _scanner.readChar();
                if (c != ')')
                    throw Error("Data S-group coord error");
            }
            else
            {
                QS_DEF(Array<char>, subscript);
                QS_DEF(Array<char>, conn_arr);
                std::string connectivity, flip;
                subscript.clear();
                conn_arr.clear();
                _scanner.readWord(subscript, word_delimiter);
                if (_scanner.lookNext() == ':')
                {
                    _scanner.skip(1);
                    _scanner.readWord(conn_arr, word_delimiter);
                    if (conn_arr.find('#') >= 0)
                    {
                        // Possible encoded symbols. Try to decode
                        BufferScanner word_scan{conn_arr};
                        while (!word_scan.isEOF())
                            connectivity += readSgChar(word_scan);
                    }
                    else
                    {
                        connectivity = conn_arr.ptr();
                    }
                    // If ',' in field - it is both connectivity and flip
                    // (the flip part is split off here but not used further in this function)
                    std::size_t pos = connectivity.find(',');
                    if (pos != std::string::npos)
                    {
                        flip = connectivity.substr(pos + 1);
                        connectivity = connectivity.substr(0, pos);
                    }
                }
                // Set fields for SRU S-Group
                if (sg_type == SGroup::SG_TYPE_SRU)
                {
                    RepeatingUnit& ru = static_cast<RepeatingUnit&>(sgroup);
                    if (subscript.size())
                        ru.subscript.readString(subscript.ptr(), true);
                    // Any other connectivity string leaves ru.connectivity unchanged
                    if (connectivity == "ht")
                        ru.connectivity = RepeatingUnit::HEAD_TO_TAIL;
                    else if (connectivity == "hh")
                        ru.connectivity = RepeatingUnit::HEAD_TO_HEAD;
                    else if (connectivity == "eu")
                        ru.connectivity = RepeatingUnit::EITHER;
                }
                if (_scanner.lookNext() != ':')
                    continue;
                _scanner.skip(1); // skip :
                // head_bond_indexes - Head crossing bond indexes. This field can be empty.
                while (isdigit(_scanner.lookNext()))
                {
                    /*auto atom_idx =*/std::ignore = _scanner.readUnsigned();
                    // no support for now
                    if (_scanner.lookNext() == ',')
                        _scanner.skip(1);
                }
                if (_scanner.lookNext() != ':')
                    continue;
                _scanner.skip(1); // skip :
                // tail_bond_indexes - Tail crossing bond indexes. This field can be empty.
                while (isdigit(_scanner.lookNext()))
                {
                    /*auto atom_idx =*/std::ignore = _scanner.readUnsigned();
                    // no support for now
                    if (_scanner.lookNext() == ',')
                        _scanner.skip(1);
                }
                if (_scanner.lookNext() != ':')
                    continue;
                _scanner.skip(1); // skip :
                // bracket - bracket orientation, bracket type followed by the coordinates (4 pair, separated with commas).
                // The bracket description is validated and consumed, but its values are discarded.
                if (_scanner.lookNext() != '(')
                    continue;
                _scanner.skip(1); // skip (
                /*char br_orient = */ std::ignore = _scanner.readChar();
                c = _scanner.readChar();
                if (c != ',')
                    throw Error("S-group bracket orientation format error");
                /*char br_type =*/std::ignore = _scanner.readChar();
                c = _scanner.readChar();
                int count = 0;
                constexpr int bracket_coord_count = 8;
                while (c == ',' && count < bracket_coord_count)
                {
                    std::ignore = _scanner.readFloat();
                    c = _scanner.readChar();
                    ++count;
                }
                if (count < bracket_coord_count)
                    throw Error("S-group bracket orientation format error");
                // One trailing ',' is tolerated before the closing ')'
                if (c == ',')
                    c = _scanner.readChar();
                if (c != ')')
                    throw Error("S-group bracket orientation format error");
            }
        }
        else if ((c == 'R') && (_scanner.lookNext() == 'G'))
        {
            // RGroup block found
            _scanner.skip(1);
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'RG'");
            MoleculeRGroups* rgroups = &_bmol->rgroups;
            QS_DEF(Array<char>, label);
            // Entries are read until the next character is '|'; that '|' is not consumed here
            while (1)
            {
                if ((_scanner.lookNext() == '_') || (_scanner.lookNext() == 'L'))
                    label.clear();
                else if (_scanner.lookNext() == '|')
                    break;
                // Read the entry name up to '=': "_R<n>" for fragments or "LOG" for logic
                while (1)
                {
                    if (_scanner.isEOF())
                        throw Error("end of input while reading RG block");
                    c = _scanner.readChar();
                    if (c == '=')
                        break;
                    label.push(c);
                }
                if (label.size() > 0)
                {
                    label.push(0);
                    int rnum;
                    if (label.size() > 3 && strncmp(label.ptr(), "_R", 2) == 0 && sscanf(label.ptr() + 2, "%d", &rnum) == 1)
                    {
                        // RGroup description found
                        QS_DEF(Array<char>, rgdesc);
                        RGroup& rgroup = rgroups->getRGroup(rnum);
                        while (1)
                        {
                            if (_scanner.isEOF())
                                throw Error("end of input while reading RG block");
                            if (_scanner.lookNext() == '{')
                            {
                                // "{...}" holds one fragment written as SMILES
                                _scanner.skip(1);
                                _scanner.readWord(rgdesc, "}");
                                _scanner.skip(1);
                            }
                            else if (_scanner.lookNext() == ',')
                            {
                                _scanner.skip(1);
                                continue;
                            }
                            else if ((_scanner.lookNext() == '_') || (_scanner.lookNext() == 'L') || (_scanner.lookNext() == '|'))
                            {
                                // Start of the next entry or end of the block
                                break;
                            }
                            else
                            {
                                _scanner.skip(1);
                                continue;
                            }
                            if (rgdesc.size() > 0)
                            {
                                // Drop the last element of rgdesc before parsing
                                // (presumably the terminating zero appended by readWord - TODO confirm)
                                rgdesc.pop();
                                std::unique_ptr<BaseMolecule> fragment(_bmol->neu());
                                BufferScanner rg_scanner(rgdesc);
                                SmilesLoader rg_loader(rg_scanner);
                                if (_bmol->isQueryMolecule())
                                {
                                    rg_loader.loadQueryMolecule(fragment.get()->asQueryMolecule());
                                }
                                else
                                {
                                    rg_loader.loadMolecule(fragment.get()->asMolecule());
                                }
                                rgroup.fragments.add(fragment.release());
                            }
                        }
                    }
                    else if (label.size() > 3 && strncmp(label.ptr(), "LOG", 3) == 0)
                    {
                        // RGroup logic block found
                        // Each entry is read as "_R<n>:[_R<m>];[H];<occurrence ranges>", entries end with '.' or '}'
                        while (1)
                        {
                            label.clear();
                            if ((_scanner.lookNext() == '{') || (_scanner.lookNext() == '_'))
                            {
                                if (_scanner.lookNext() == '{')
                                    _scanner.skip(1);
                                while (1)
                                {
                                    if (_scanner.isEOF())
                                        throw Error("end of input while reading LOG block");
                                    c = _scanner.readChar();
                                    if (c == ':')
                                        break;
                                    label.push(c);
                                }
                            }
                            else if (_scanner.lookNext() == '}')
                            {
                                _scanner.skip(1);
                                break;
                            }
                            else
                                break;
                            if (label.size() > 0)
                            {
                                label.push(0);
                                if (label.size() > 3 && strncmp(label.ptr(), "_R", 2) == 0 && sscanf(label.ptr() + 2, "%d", &rnum) == 1)
                                {
                                    RGroup& rgroup = rgroups->getRGroup(rnum);
                                    int if_then = 0;
                                    int rest_h = 0;
                                    QS_DEF(Array<char>, occurrence_str);
                                    if (_scanner.lookNext() == '_')
                                    {
                                        // Optional "_R<m>" in front of the first ';' is stored as if_then
                                        label.clear();
                                        while (1)
                                        {
                                            if (_scanner.isEOF())
                                                throw Error("end of input while reading LOG block");
                                            c = static_cast<char>(_scanner.lookNext());
                                            if (c == ';')
                                                break;
                                            label.push(c);
                                            _scanner.skip(1);
                                        }
                                        label.push(0);
                                        if (label.size() > 3 && strncmp(label.ptr(), "_R", 2) == 0 && sscanf(label.ptr() + 2, "%d", &rnum) == 1)
                                        {
                                            if_then = rnum;
                                        }
                                    }
                                    rgroup.if_then = if_then;
                                    if (_scanner.lookNext() == ';')
                                    {
                                        _scanner.skip(1);
                                        // 'H' sets the rest_h flag
                                        if (_scanner.lookNext() == 'H')
                                        {
                                            rest_h = 1;
                                            _scanner.skip(1);
                                        }
                                    }
                                    rgroup.rest_h = rest_h;
                                    if (_scanner.lookNext() == ';')
                                    {
                                        _scanner.skip(1);
                                        // Empty occurrence field: leave the logic loop
                                        if (_scanner.lookNext() == '.')
                                        {
                                            _scanner.skip(1);
                                            break;
                                        }
                                    }
                                    _scanner.readWord(occurrence_str, ".}");
                                    _readRGroupOccurrenceRanges(occurrence_str.ptr(), rgroup.occurrence);
                                    _scanner.skip(1);
                                }
                            }
                        }
                    }
                }
            }
        }
        else if (c == 'u')
        {
            // "u:<atoms>" - adds an unsaturation constraint to each listed query atom
            if (_qmol == 0)
                throw Error("'u' is allowed only within queries");
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 'u' identifier");
            while (isdigit(_scanner.lookNext()))
            {
                int atom_idx = _scanner.readUnsigned();
                _qmol->resetAtom(atom_idx,
                                 QueryMolecule::Atom::und(_qmol->releaseAtom(atom_idx), new QueryMolecule::Atom(QueryMolecule::ATOM_UNSATURATION, 0)));
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
        else if (c == 's')
        {
            // "s:<atom>:<count or '*'>,..." - substitution count constraints
            if (_qmol == 0)
                throw Error("'s' is allowed only within queries");
            if (_scanner.readChar() != ':')
                throw Error("colon expected after 's' identifier");
            while (isdigit(_scanner.lookNext()))
            {
                int atom_idx = _scanner.readUnsigned();
                if (_scanner.readChar() != ':')
                    throw Error("colon expected after 's:n'");
                // subs: -2 for '*' (as drawn), -1 for an explicit 0, otherwise the count that was read
                int subs = -2;
                if (_scanner.lookNext() == '*')
                {
                    _scanner.skip(1);
                }
                else
                {
                    subs = _scanner.readUnsigned();
                    if (!subs)
                        subs = -1;
                }
                QueryMolecule::Atom& atom = _qmol->getAtom(atom_idx);
                // remove what was set with 'D'
                if (atom.hasConstraint(QueryMolecule::ATOM_SUBSTITUENTS))
                    atom.removeConstraints(QueryMolecule::ATOM_SUBSTITUENTS);
                switch (subs)
                {
                case -1:
                    _qmol->resetAtom(atom_idx,
                                     QueryMolecule::Atom::und(_qmol->releaseAtom(atom_idx), new QueryMolecule::Atom(QueryMolecule::ATOM_SUBSTITUENTS, 0)));
                    break;
                case -2:
                    // As drawn: use the atom's current degree
                    _qmol->resetAtom(atom_idx,
                                     QueryMolecule::Atom::und(_qmol->releaseAtom(atom_idx), new QueryMolecule::Atom(QueryMolecule::ATOM_SUBSTITUENTS_AS_DRAWN,
                                                                                                                     _qmol->getVertex(atom_idx).degree())));
                    break;
                default:
                    // NOTE(review): the two values look like a [min, max] range, with counts
                    // of 6 and above left open-ended (max 100) - confirm against QueryMolecule::Atom.
                    _qmol->resetAtom(atom_idx, QueryMolecule::Atom::und(_qmol->releaseAtom(atom_idx), new QueryMolecule::Atom(QueryMolecule::ATOM_SUBSTITUENTS,
                                                                                                                                subs, (subs < 6 ? subs : 100))));
                    break;
                }
                if (_scanner.lookNext() == ',')
                    _scanner.skip(1);
            }
        }
    }
    // Attachment-point atoms are removed only now, so atom indices stay stable while sections are parsed
    if (to_remove.size() > 0)
        _bmol->removeAtoms(to_remove);
}