in core/indigo-core/molecule/src/smiles_saver.cpp [82:704]
void SmilesSaver::_saveMolecule()
{
_validate(*_bmol);
int i, j, k;
_ignored_vertices.clear_resize(_bmol->vertexEnd());
_ignored_vertices.zerofill();
if (ignore_hydrogens)
{
if (_qmol != 0)
throw Error("ignore_hydrogens does not make sense for query molecules");
for (i = _bmol->vertexBegin(); i < _bmol->vertexEnd(); i = _bmol->vertexNext(i))
if (_mol->asMolecule().convertableToImplicitHydrogen(i))
_ignored_vertices[i] = 1;
}
_checkRGroupsAndAttachmentPoints();
_checkSRU();
_touched_cistransbonds = 0;
_complicated_cistrans.clear_resize(_bmol->edgeEnd());
_complicated_cistrans.zerofill();
_ban_slashes.clear_resize(_bmol->edgeEnd());
_ban_slashes.zerofill();
_cis_trans_parity.clear_resize(_bmol->edgeEnd());
_cis_trans_parity.zerofill();
_markCisTrans();
_atoms.clear();
while (_atoms.size() < _bmol->vertexEnd())
_atoms.push(_neipool);
_written_atoms.clear();
_written_bonds.clear();
_written_components = 0;
_aromatic_bonds.clear();
_hcount.clear_resize(_bmol->vertexEnd());
_hcount_ignored.clear_resize(_bmol->vertexEnd());
for (i = _bmol->vertexBegin(); i < _bmol->vertexEnd(); i = _bmol->vertexNext(i))
{
_hcount[i] = 0;
if (_mol != 0)
{
if (!_mol->isPseudoAtom(i) && !_mol->isTemplateAtom(i) && !_mol->isRSite(i))
_hcount[i] = _mol->getImplicitH_NoThrow(i, -1);
}
else
{
int atom_number = _bmol->getAtomNumber(i);
int atom_charge = _bmol->getAtomCharge(i);
_hcount[i] = MoleculeSavers::getHCount(*_bmol, i, atom_number, atom_charge);
}
_hcount_ignored[i] = 0;
const Vertex& vertex = _bmol->getVertex(i);
if (ignore_hydrogens)
{
if (_hcount[i] >= 0)
{
for (j = vertex.neiBegin(); j != vertex.neiEnd(); j = vertex.neiNext(j))
{
int idx = vertex.neiVertex(j);
if (_bmol->getAtomNumber(idx) == ELEM_H && _bmol->getAtomIsotope(idx) == 0)
if (_ignored_vertices[idx])
_hcount_ignored[i]++;
}
_hcount[i] += _hcount_ignored[i];
}
}
if (_bmol->getAtomAromaticity(i) == ATOM_AROMATIC)
{
_atoms[i].aromatic = true;
// From the SMILES specification:
// Please note that only atoms on the following list
// can be considered aromatic: C, N, O, P, S, As, Se, and * (wildcard).
static int allowed_lowercase[] = {ELEM_C, ELEM_N, ELEM_O, ELEM_P, ELEM_S, ELEM_Se, ELEM_As};
if (_bmol->atomNumberBelongs(i, allowed_lowercase, NELEM(allowed_lowercase)))
_atoms[i].lowercase = true;
}
}
DfsWalk walk(*_bmol);
walk.ignored_vertices = _ignored_vertices.ptr();
walk.vertex_ranks = vertex_ranks;
if (_bmol->sgroups.isPolimer())
walk.vertex_classes = _polymer_indices.ptr();
if (separate_rsites)
{
for (i = _bmol->vertexBegin(); i < _bmol->vertexEnd(); i = _bmol->vertexNext(i))
{
if (_bmol->isRSite(i))
// We break the DFS walk when going through R-sites. For details, see
// http://blueobelisk.shapado.com/questions/how-r-group-atoms-should-be-represented-in-smiles
walk.mustBeRootVertex(i);
}
}
walk.walk();
const Array<DfsWalk::SeqElem>& v_seq = walk.getSequence();
Array<int> v_to_comp_group;
v_to_comp_group.resize(v_seq.size());
v_to_comp_group.fffill();
if (_qmol != nullptr && smarts_mode)
{
if (v_seq.size() < 1)
return; // No atoms to save
std::set<int> components;
int cur_component = -1;
for (i = 0; i < v_seq.size(); ++i && _qmol->components.size() > 0)
{
// In v_seq each fragment started with vertex which parent == -1
// In SMARTS some fragments could be grouped (component-level grouping)
// In QueryMolecule group number stored in "".components" member. GroupId == 0 means no group defined.
// Each fragment - connected graph, so all vertexes should belong to one group.
// All group fragments should go one by one - in SMARTS its inside "()".
if (v_seq[i].parent_vertex < 0) // New Fragment
{
int new_component = 0;
if (_qmol->components.size() > v_seq[i].idx)
{
new_component = _qmol->components[v_seq[i].idx];
// if component defined for new fragment(id>0) and its different from previous and seen before
if (new_component > 0 && new_component != cur_component && components.count(new_component))
{
// According to the DfsWalk code, the groups components should be neighbors.
// If will be found case when it wrong - add code to rearrange fragments
throw Error("SMARTS fragments need to reaarange.");
}
components.emplace(new_component);
}
cur_component = new_component;
}
else
{
if (_qmol->components.size() > v_seq[i].idx && cur_component != _qmol->components[v_seq[i].idx])
{
// Fragment contains atoms from different components - something went wrong
throw Error("Fragment contains atoms from different components.");
}
}
v_to_comp_group[i] = cur_component;
}
}
// fill up neighbor lists for the stereocenters calculation
for (i = 0; i < v_seq.size(); i++)
{
int v_idx = v_seq[i].idx;
int e_idx = v_seq[i].parent_edge;
int v_prev_idx = v_seq[i].parent_vertex;
_Atom& atom = _atoms[v_idx];
if (e_idx >= 0)
{
if (walk.isClosure(e_idx))
{
for (k = atom.neighbors.begin(); k != atom.neighbors.end(); k = atom.neighbors.next(k))
{
if (atom.neighbors[k] == -1)
{
atom.neighbors[k] = v_prev_idx;
break;
}
}
if (k == atom.neighbors.end())
throw Error("internal: can not put closing bond to its place");
}
else
{
atom.neighbors.add(v_prev_idx);
atom.parent = v_prev_idx;
}
_atoms[v_prev_idx].neighbors.add(v_idx);
}
if (e_idx < 0 || !walk.isClosure(e_idx))
{
int openings = walk.numOpenings(v_idx);
for (j = 0; j < openings; j++)
atom.neighbors.add(-1);
}
// also, detect polymer borders
if (_polymer_indices[v_idx] >= 0 && (v_prev_idx == -1 || _polymer_indices[v_prev_idx] != _polymer_indices[v_idx]))
atom.starts_polymer = true;
if (v_prev_idx >= 0 && _polymer_indices[v_prev_idx] >= 0 && _polymer_indices[v_prev_idx] != _polymer_indices[v_idx])
_atoms[v_prev_idx].ends_polymer = true;
}
_written_atoms_inv.clear_resize(_bmol->vertexEnd());
_written_atoms_inv.fffill();
for (i = 0; i < v_seq.size(); i++)
{
int v_idx = v_seq[i].idx;
int e_idx = v_seq[i].parent_edge;
if (e_idx == -1 || !walk.isClosure(e_idx))
{
_written_atoms.push(v_idx);
_written_atoms_inv[v_idx] = _written_atoms.size() - 1;
}
if (e_idx != -1)
_written_bonds.push(e_idx);
}
// detect chiral configurations
MoleculeStereocenters& stereocenters = _bmol->stereocenters;
for (i = stereocenters.begin(); i != stereocenters.end(); i = stereocenters.next(i))
{
int atom_idx, type, group, pyramid[4];
stereocenters.get(i, atom_idx, type, group, pyramid);
if (type < MoleculeStereocenters::ATOM_AND || !stereocenters.isTetrahydral(atom_idx))
continue;
int implicit_h_idx = -1;
if (pyramid[3] == -1)
implicit_h_idx = 3;
else
for (j = 0; j < 4; j++)
if (_ignored_vertices[pyramid[j]])
{
implicit_h_idx = j;
break;
}
int pyramid_mapping[4];
int counter = 0;
_Atom& atom = _atoms[atom_idx];
if (atom.parent != -1)
for (k = 0; k < 4; k++)
if (pyramid[k] == atom.parent)
{
pyramid_mapping[counter++] = k;
break;
}
if (implicit_h_idx != -1)
pyramid_mapping[counter++] = implicit_h_idx;
for (j = atom.neighbors.begin(); j != atom.neighbors.end(); j = atom.neighbors.next(j))
{
if (atom.neighbors[j] == atom.parent)
continue;
for (k = 0; k < 4; k++)
if (atom.neighbors[j] == pyramid[k])
{
if (counter >= 4)
throw Error("internal: pyramid overflow");
pyramid_mapping[counter++] = k;
break;
}
}
if (counter == 4)
{
// move the 'from' atom to the end
counter = pyramid_mapping[0];
pyramid_mapping[0] = pyramid_mapping[1];
pyramid_mapping[1] = pyramid_mapping[2];
pyramid_mapping[2] = pyramid_mapping[3];
pyramid_mapping[3] = counter;
}
else if (counter != 3)
throw Error("cannot calculate chirality");
if (MoleculeStereocenters::isPyramidMappingRigid(pyramid_mapping))
_atoms[atom_idx].chirality = 1;
else
_atoms[atom_idx].chirality = 2;
}
MoleculeAlleneStereo& allene_stereo = _bmol->allene_stereo;
for (i = allene_stereo.begin(); i != allene_stereo.end(); i = allene_stereo.next(i))
{
int atom_idx, left, right, parity, subst[4], subst_map[4] = {-1, -1, -1, -1};
allene_stereo.get(i, atom_idx, left, right, subst, parity);
for (j = 0; j < 4; j++)
if (subst[j] >= 0)
subst_map[j] = _written_atoms_inv[subst[j]];
// Daylight doc says: Hydrogens attached to substituted allene-like atoms
// are taken to be immediately following that atom
if (subst_map[0] < 0)
subst_map[0] = _written_atoms_inv[left];
else if (subst_map[1] < 0)
subst_map[1] = _written_atoms_inv[left];
if (subst_map[2] < 0)
subst_map[2] = _written_atoms_inv[right];
else if (subst_map[3] < 0)
subst_map[3] = _written_atoms_inv[right];
if (subst_map[0] < 0 || subst_map[1] < 0 || subst_map[2] < 0 || subst_map[3] < 0)
throw Error("internal: allene subsituent not mapped");
if (subst_map[1] < subst_map[0])
{
std::swap(subst_map[0], subst_map[1]);
parity = 3 - parity;
}
if (subst_map[3] < subst_map[2])
{
std::swap(subst_map[2], subst_map[3]);
parity = 3 - parity;
}
_atoms[atom_idx].chirality = 3 - parity;
}
if (canonize_chiralities)
{
QS_DEF(Array<int>, marked);
QS_DEF(Array<int>, ids);
stereocenters = _bmol->stereocenters;
marked.clear_resize(_bmol->vertexEnd());
marked.zerofill();
for (i = 0; i < _written_atoms.size(); i++)
{
if (marked[i])
continue;
int idx = _written_atoms[i];
if (_atoms[idx].chirality == 0 || !stereocenters.isTetrahydral(idx))
continue;
int type = stereocenters.getType(idx);
if (type != MoleculeStereocenters::ATOM_AND && type != MoleculeStereocenters::ATOM_OR)
continue;
ids.clear();
ids.push(idx);
int group = stereocenters.getGroup(idx);
for (j = i + 1; j < _written_atoms.size(); j++)
{
if (marked[j])
continue;
int idx2 = _written_atoms[j];
if (_atoms[idx2].chirality == 0)
continue;
int type2 = stereocenters.getType(idx2);
int group2 = stereocenters.getGroup(idx2);
if (type2 == type && group2 == group)
{
ids.push(idx2);
marked[j] = 1;
}
}
if (_atoms[ids[0]].chirality == 1)
for (j = 0; j < ids.size(); j++)
_atoms[ids[j]].chirality = 3 - _atoms[ids[j]].chirality;
}
}
// write the SMILES itself
// cycle_numbers[i] == -1 means that the number is available
// cycle_numbers[i] == n means that the number is used by vertex n
QS_DEF(Array<int>, cycle_numbers);
int rsites_closures_starting_num = 91;
int rbonds = _countRBonds() + _n_attachment_points;
if (rbonds > 9)
rsites_closures_starting_num = 99 - rbonds;
cycle_numbers.clear();
cycle_numbers.push(0); // never used
bool first_component = true;
for (i = 0; i < v_seq.size(); i++)
{
int v_idx = v_seq[i].idx;
int e_idx = v_seq[i].parent_edge;
int v_prev_idx = v_seq[i].parent_vertex;
bool write_atom = true;
if (v_prev_idx >= 0)
{
int branches = walk.numBranches(v_prev_idx);
if (branches > 1)
if (_atoms[v_prev_idx].branch_cnt > 0 && _atoms[v_prev_idx].paren_written)
_output.writeChar(')');
/*
* Fix IND-673 unused if-statement
*/
// if (v_prev_idx >= 0)
// {
if (branches > 1)
if (_atoms[v_prev_idx].branch_cnt < branches - 1)
{
if (walk.isClosure(e_idx))
_atoms[v_prev_idx].paren_written = false;
else
{
_output.writeChar('(');
_atoms[v_prev_idx].paren_written = true;
}
}
_atoms[v_prev_idx].branch_cnt++;
if (_atoms[v_prev_idx].branch_cnt > branches)
throw Error("unexpected branch");
/*
* Fix IND-673 unused if-statement
*/
// }
const Edge& edge = _bmol->getEdge(e_idx);
bool bond_written = true;
int dir = 0;
int bond_order = _bmol->getBondOrder(e_idx);
// SMARTS and KET loaders store directions in
if (!smarts_mode || _qmol == nullptr) // || (original_format != ket && original_format != smarts))
if (bond_order == BOND_SINGLE)
dir = _calcBondDirection(e_idx, v_prev_idx);
if ((dir == 1 && v_idx == edge.end) || (dir == 2 && v_idx == edge.beg))
_output.writeChar('/');
else if ((dir == 2 && v_idx == edge.end) || (dir == 1 && v_idx == edge.beg))
_output.writeChar('\\');
else if (smarts_mode && _qmol != 0)
QueryMolecule::writeSmartsBond(_output, &_qmol->getBond(e_idx), false);
else if (bond_order == BOND_DOUBLE)
_output.writeChar('=');
else if (bond_order == BOND_TRIPLE)
_output.writeChar('#');
else if (bond_order == BOND_AROMATIC && _shouldWriteAromaticBond(e_idx))
_output.writeChar(':');
else if (bond_order == BOND_SINGLE && _atoms[edge.beg].aromatic && _atoms[edge.end].aromatic)
_output.writeChar('-');
else
bond_written = false;
if (walk.isClosure(e_idx))
{
for (j = 1; j < cycle_numbers.size(); j++)
if (cycle_numbers[j] == v_idx)
break;
if (j == cycle_numbers.size())
throw Error("cycle number not found");
_writeCycleNumber(j);
cycle_numbers[j] = -1;
write_atom = false;
}
}
else
{
if (!first_component)
{
// group == 0 means no group set.
int prev_group = v_to_comp_group[i - 1];
int new_group = v_to_comp_group[i];
bool different_groups = new_group != prev_group;
if (smarts_mode && prev_group && different_groups) // if component group ended
_output.writeChar(')');
_output.writeChar('.');
if (smarts_mode && new_group && different_groups) // if new group started
_output.writeChar('(');
}
else
{
if (smarts_mode && v_to_comp_group[i] > 0) // component level grouping set for this fragment
_output.writeChar('(');
first_component = false;
}
_written_components++;
}
if (write_atom)
{
if (!smarts_mode)
_writeAtom(v_idx, _atoms[v_idx].aromatic, _atoms[v_idx].lowercase, _atoms[v_idx].chirality);
else if (_qmol != 0)
{
int aam = _bmol->reaction_atom_mapping[v_idx];
QueryMolecule::writeSmartsAtom(_output, &_qmol->getAtom(v_idx), aam, _atoms[v_idx].chirality, 0, false, false, _qmol->original_format);
}
else
throw Error("SMARTS format available for query only!");
QS_DEF(Array<int>, closing);
walk.getNeighborsClosing(v_idx, closing);
for (int ap = 1; ap <= _bmol->attachmentPointCount(); ap++)
{
int idx = 0, atom_idx;
for (idx = 0; (atom_idx = _bmol->getAttachmentPoint(ap, idx)) != -1; idx++)
if (atom_idx == v_idx)
{
// Here is the problem: if we have an attachment point, the resulting
// SMILES is supposed to contain an extra atom not present in the given
// molecule. For example, chlorine with an attachment point will
// become Cl%91.[*:1]%91 |;_AP1| (two atoms).
// We can not modify the given molecule, but we want the closure to
// be here. To achieve that, we add a link to an imagimary atom with
// incredibly big number.
closing.push(10000 + ap);
}
}
for (j = 0; j < closing.size(); j++)
{
bool ap = (closing[j] >= 10000);
bool rsite = !ap && (separate_rsites && _bmol->isRSite(closing[j]));
if (ap || rsite)
{
cycle_numbers.expandFill(rsites_closures_starting_num + 1, -1);
for (k = rsites_closures_starting_num; k < cycle_numbers.size(); k++)
if (cycle_numbers[k] == -1)
break;
}
else
{
for (k = 1; k < cycle_numbers.size(); k++)
if (cycle_numbers[k] == -1)
break;
}
if (ap)
{
_attachment_indices.push(closing[j] - 10000);
_attachment_cycle_numbers.push(k);
}
if (k == cycle_numbers.size())
cycle_numbers.push(v_idx);
else
cycle_numbers[k] = v_idx;
_writeCycleNumber(k);
}
if (_atoms[v_idx].starts_polymer)
_output.writeString("{-}");
if (_atoms[v_idx].ends_polymer)
_output.writeString("{+n}");
}
}
if (smarts_mode && v_to_comp_group[i - 1] > 0) // if group set for last fragment - add finish )
_output.writeChar(')');
if (write_extra_info && chemaxon && !smarts_mode) // no extended block in SMARTS
{
// Before we write the |...| block (ChemAxon's Extended SMILES),
// we must clean up the mess we did with the attachment points
// (see big comment above). That is, we append separated atoms,
// not present in the original molecule, to the end, and connect
// them with the "cycle" closure to the real atoms that are the
// attachment points.
for (i = 0; i < _attachment_indices.size(); i++)
{
_output.printf(".[*:%d]", _attachment_indices[i]);
_writeCycleNumber(_attachment_cycle_numbers[i]);
}
_comma = false;
_writeRingCisTrans();
_writeStereogroups();
_writeRadicals();
_writePseudoAtoms();
_writeHighlighting();
_writeRGroups();
_writeSGroups();
_writeRingBonds();
_writeUnsaturated();
_writeSubstitutionCounts();
if (_bmol->hasAtropoStereoBonds())
_writeWedges();
if (_comma)
_output.writeChar('|');
}
}