in core/indigo-core/molecule/src/molecule_ionize.cpp [65:321]
int MoleculePkaModel::buildPkaModel(int max_level, float threshold, const char* filename)
{
// QS_DEF(Array<int>, order);
// QS_DEF(Molecule, can_mol);
QS_DEF(Array<char>, fp);
RedBlackStringObjMap<Array<float>> acid_pkas;
RedBlackStringObjMap<Array<float>> basic_pkas;
RedBlackStringObjMap<Array<int>> acid_pka_cids;
RedBlackStringObjMap<Array<int>> basic_pka_cids;
AromaticityOptions opts;
const char* a_pka_sites_id = "ACID PKA SITES";
const char* a_pka_values_id = "ACID PKA VALUES";
const char* b_pka_sites_id = "BASIC PKA SITES";
const char* b_pka_values_id = "BASIC PKA VALUES";
const char* compound_cid = "PUBCHEM_COMPOUND_CID";
Molecule mol;
int level = 0;
_model.adv_a_pkas.clear();
_model.adv_b_pkas.clear();
for (;;)
{
FileScanner fscanner(filename);
SdfLoader loader(fscanner);
int a_count = 0;
int b_count = 0;
acid_pkas.clear();
basic_pkas.clear();
acid_pka_cids.clear();
basic_pka_cids.clear();
int mol_count = 0;
while (!loader.isEOF())
{
mol_count++;
loader.readNext();
BufferScanner bscanner(loader.data);
MolfileLoader mol_loader(bscanner);
mol_loader.stereochemistry_options.ignore_errors = true;
mol_loader.loadMolecule(mol);
int cid = 0;
if (loader.properties.contains(compound_cid))
{
BufferScanner scan_cid(loader.properties.at(compound_cid));
cid = scan_cid.readInt();
}
// mol.aromatize(opts);
// _checkCanonicalOrder(mol, can_mol, order);
_removeExtraHydrogens(mol);
if (loader.properties.contains(a_pka_sites_id))
{
const char* aids = loader.properties.at(a_pka_sites_id);
const char* apkas = loader.properties.at(a_pka_values_id);
BufferScanner scan_aids(aids);
Array<int> a_sites;
while (!scan_aids.isEOF())
{
int idx = scan_aids.readInt1() - 1;
a_sites.push(idx);
}
BufferScanner scan_apka(apkas);
Array<float> a_pka;
while (!scan_apka.isEOF())
{
float val = scan_apka.readFloat();
a_pka.push(val);
}
if (a_sites.size() > 1)
{
// printf("Warning: multiple acid sites detected. Skip this compound (%d)\n", cid);
}
else
{
for (int i = 0; i < a_sites.size(); i++)
{
fp.clear();
// int idx = order.find(a_sites[i]);
int idx = a_sites[i];
float val = a_pka[i];
getAtomLocalFingerprint(mol, idx, fp, level);
if (fp.size() == 0)
continue;
if (!acid_pkas.find(fp.ptr()))
{
acid_pkas.insert(fp.ptr());
acid_pka_cids.insert(fp.ptr());
a_count++;
}
acid_pkas.at(fp.ptr()).push(val);
acid_pka_cids.at(fp.ptr()).push(cid);
}
}
}
if (loader.properties.contains(b_pka_sites_id))
{
const char* bids = loader.properties.at(b_pka_sites_id);
const char* bpkas = loader.properties.at(b_pka_values_id);
BufferScanner scan_bids(bids);
Array<int> b_sites;
while (!scan_bids.isEOF())
{
int idx = scan_bids.readInt1() - 1;
b_sites.push(idx);
}
BufferScanner scan_bpka(bpkas);
Array<float> b_pka;
while (!scan_bpka.isEOF())
{
float val = scan_bpka.readFloat();
b_pka.push(val);
}
if (b_sites.size() > 1)
{
// printf("Warning: multiple protonation sites detected. Skip this compound (%d)\n", cid);
}
else
{
for (int i = 0; i < b_sites.size(); i++)
{
fp.clear();
// int idx = order.find(b_sites[i]);
int idx = b_sites[i];
float val = b_pka[i];
getAtomLocalFingerprint(mol, idx, fp, level);
if (fp.size() == 0)
continue;
if (!basic_pkas.find(fp.ptr()))
{
basic_pkas.insert(fp.ptr());
basic_pka_cids.insert(fp.ptr());
b_count++;
}
basic_pkas.at(fp.ptr()).push(val);
basic_pka_cids.at(fp.ptr()).push(cid);
}
}
}
}
bool model_ready = true;
float max_deviation = 0.f;
for (int i = 0; i < acid_pkas.size(); i++)
{
const char* fpkey = acid_pkas.key(i);
Array<float>& pkas = acid_pkas.value(i);
float pka_sum = 0.f;
float pka_dev = 0.f;
float pka_min = 100.f;
float pka_max = -100.f;
for (int k = 0; k < pkas.size(); k++)
{
float val = pkas.at(k);
pka_sum = pka_sum + val;
if (pka_min > val)
pka_min = val;
if (pka_max < val)
pka_max = val;
}
if ((pka_max - pka_sum / pkas.size()) > (pka_sum / pkas.size() - pka_min))
pka_dev = pka_max - pka_sum / pkas.size();
else
pka_dev = pka_sum / pkas.size() - pka_min;
if (pka_dev > max_deviation)
max_deviation = pka_dev;
if (_model.adv_a_pkas.find(fpkey))
_model.adv_a_pkas.remove(fpkey);
_model.adv_a_pkas.insert(fpkey);
_model.adv_a_pkas.at(fpkey).push(pka_sum / pkas.size());
_model.adv_a_pkas.at(fpkey).push(pka_dev);
if (pka_dev > threshold)
model_ready = false;
}
for (int i = 0; i < basic_pkas.size(); i++)
{
const char* fpkey = basic_pkas.key(i);
Array<float>& pkas = basic_pkas.value(i);
float pka_sum = 0.f;
float pka_dev = 0.f;
float pka_min = 100.f;
float pka_max = -100.f;
for (int k = 0; k < pkas.size(); k++)
{
float val = pkas.at(k);
pka_sum = pka_sum + val;
if (pka_min > val)
pka_min = val;
if (pka_max < val)
pka_max = val;
}
if ((pka_max - pka_sum / pkas.size()) > (pka_sum / pkas.size() - pka_min))
pka_dev = pka_max - pka_sum / pkas.size();
else
pka_dev = pka_sum / pkas.size() - pka_min;
if (pka_dev > max_deviation)
max_deviation = pka_dev;
if (_model.adv_b_pkas.find(fpkey))
_model.adv_b_pkas.remove(fpkey);
_model.adv_b_pkas.insert(fpkey);
_model.adv_b_pkas.at(fpkey).push(pka_sum / pkas.size());
_model.adv_b_pkas.at(fpkey).push(pka_dev);
if (pka_dev > threshold)
model_ready = false;
}
if (model_ready)
{
break;
}
else
{
if (level >= max_level)
{
level = -1;
break;
}
else
{
level++;
_model.max_deviations.push(max_deviation);
}
}
}
_model.level = level;
_model.advanced_model_ready = true;
return level;
}