int MoleculePkaModel::buildPkaModel()

in core/indigo-core/molecule/src/molecule_ionize.cpp [65:321]


int MoleculePkaModel::buildPkaModel(int max_level, float threshold, const char* filename)
{
    //   QS_DEF(Array<int>, order);
    //   QS_DEF(Molecule, can_mol);
    QS_DEF(Array<char>, fp);
    RedBlackStringObjMap<Array<float>> acid_pkas;
    RedBlackStringObjMap<Array<float>> basic_pkas;
    RedBlackStringObjMap<Array<int>> acid_pka_cids;
    RedBlackStringObjMap<Array<int>> basic_pka_cids;
    AromaticityOptions opts;

    const char* a_pka_sites_id = "ACID PKA SITES";
    const char* a_pka_values_id = "ACID PKA VALUES";
    const char* b_pka_sites_id = "BASIC PKA SITES";
    const char* b_pka_values_id = "BASIC PKA VALUES";
    const char* compound_cid = "PUBCHEM_COMPOUND_CID";

    Molecule mol;
    int level = 0;

    _model.adv_a_pkas.clear();
    _model.adv_b_pkas.clear();

    for (;;)
    {
        FileScanner fscanner(filename);
        SdfLoader loader(fscanner);

        int a_count = 0;
        int b_count = 0;
        acid_pkas.clear();
        basic_pkas.clear();
        acid_pka_cids.clear();
        basic_pka_cids.clear();

        int mol_count = 0;
        while (!loader.isEOF())
        {
            mol_count++;
            loader.readNext();
            BufferScanner bscanner(loader.data);
            MolfileLoader mol_loader(bscanner);
            mol_loader.stereochemistry_options.ignore_errors = true;
            mol_loader.loadMolecule(mol);

            int cid = 0;
            if (loader.properties.contains(compound_cid))
            {
                BufferScanner scan_cid(loader.properties.at(compound_cid));
                cid = scan_cid.readInt();
            }

            //         mol.aromatize(opts);
            //         _checkCanonicalOrder(mol, can_mol, order);
            _removeExtraHydrogens(mol);

            if (loader.properties.contains(a_pka_sites_id))
            {
                const char* aids = loader.properties.at(a_pka_sites_id);
                const char* apkas = loader.properties.at(a_pka_values_id);

                BufferScanner scan_aids(aids);
                Array<int> a_sites;
                while (!scan_aids.isEOF())
                {
                    int idx = scan_aids.readInt1() - 1;
                    a_sites.push(idx);
                }
                BufferScanner scan_apka(apkas);
                Array<float> a_pka;
                while (!scan_apka.isEOF())
                {
                    float val = scan_apka.readFloat();
                    a_pka.push(val);
                }

                if (a_sites.size() > 1)
                {
                    //               printf("Warning: multiple acid sites detected. Skip this compound (%d)\n", cid);
                }
                else
                {
                    for (int i = 0; i < a_sites.size(); i++)
                    {
                        fp.clear();
                        //                  int idx = order.find(a_sites[i]);
                        int idx = a_sites[i];
                        float val = a_pka[i];

                        getAtomLocalFingerprint(mol, idx, fp, level);

                        if (fp.size() == 0)
                            continue;

                        if (!acid_pkas.find(fp.ptr()))
                        {
                            acid_pkas.insert(fp.ptr());
                            acid_pka_cids.insert(fp.ptr());
                            a_count++;
                        }
                        acid_pkas.at(fp.ptr()).push(val);
                        acid_pka_cids.at(fp.ptr()).push(cid);
                    }
                }
            }

            if (loader.properties.contains(b_pka_sites_id))
            {

                const char* bids = loader.properties.at(b_pka_sites_id);
                const char* bpkas = loader.properties.at(b_pka_values_id);

                BufferScanner scan_bids(bids);
                Array<int> b_sites;
                while (!scan_bids.isEOF())
                {
                    int idx = scan_bids.readInt1() - 1;
                    b_sites.push(idx);
                }
                BufferScanner scan_bpka(bpkas);
                Array<float> b_pka;
                while (!scan_bpka.isEOF())
                {
                    float val = scan_bpka.readFloat();
                    b_pka.push(val);
                }

                if (b_sites.size() > 1)
                {
                    //               printf("Warning: multiple protonation sites detected. Skip this compound (%d)\n", cid);
                }
                else
                {
                    for (int i = 0; i < b_sites.size(); i++)
                    {
                        fp.clear();
                        //                  int idx = order.find(b_sites[i]);
                        int idx = b_sites[i];
                        float val = b_pka[i];

                        getAtomLocalFingerprint(mol, idx, fp, level);

                        if (fp.size() == 0)
                            continue;

                        if (!basic_pkas.find(fp.ptr()))
                        {
                            basic_pkas.insert(fp.ptr());
                            basic_pka_cids.insert(fp.ptr());
                            b_count++;
                        }
                        basic_pkas.at(fp.ptr()).push(val);
                        basic_pka_cids.at(fp.ptr()).push(cid);
                    }
                }
            }
        }

        bool model_ready = true;
        float max_deviation = 0.f;

        for (int i = 0; i < acid_pkas.size(); i++)
        {
            const char* fpkey = acid_pkas.key(i);
            Array<float>& pkas = acid_pkas.value(i);
            float pka_sum = 0.f;
            float pka_dev = 0.f;
            float pka_min = 100.f;
            float pka_max = -100.f;
            for (int k = 0; k < pkas.size(); k++)
            {
                float val = pkas.at(k);
                pka_sum = pka_sum + val;
                if (pka_min > val)
                    pka_min = val;
                if (pka_max < val)
                    pka_max = val;
            }

            if ((pka_max - pka_sum / pkas.size()) > (pka_sum / pkas.size() - pka_min))
                pka_dev = pka_max - pka_sum / pkas.size();
            else
                pka_dev = pka_sum / pkas.size() - pka_min;

            if (pka_dev > max_deviation)
                max_deviation = pka_dev;

            if (_model.adv_a_pkas.find(fpkey))
                _model.adv_a_pkas.remove(fpkey);
            _model.adv_a_pkas.insert(fpkey);
            _model.adv_a_pkas.at(fpkey).push(pka_sum / pkas.size());
            _model.adv_a_pkas.at(fpkey).push(pka_dev);

            if (pka_dev > threshold)
                model_ready = false;
        }

        for (int i = 0; i < basic_pkas.size(); i++)
        {
            const char* fpkey = basic_pkas.key(i);
            Array<float>& pkas = basic_pkas.value(i);
            float pka_sum = 0.f;
            float pka_dev = 0.f;
            float pka_min = 100.f;
            float pka_max = -100.f;
            for (int k = 0; k < pkas.size(); k++)
            {
                float val = pkas.at(k);
                pka_sum = pka_sum + val;
                if (pka_min > val)
                    pka_min = val;
                if (pka_max < val)
                    pka_max = val;
            }

            if ((pka_max - pka_sum / pkas.size()) > (pka_sum / pkas.size() - pka_min))
                pka_dev = pka_max - pka_sum / pkas.size();
            else
                pka_dev = pka_sum / pkas.size() - pka_min;

            if (pka_dev > max_deviation)
                max_deviation = pka_dev;

            if (_model.adv_b_pkas.find(fpkey))
                _model.adv_b_pkas.remove(fpkey);
            _model.adv_b_pkas.insert(fpkey);
            _model.adv_b_pkas.at(fpkey).push(pka_sum / pkas.size());
            _model.adv_b_pkas.at(fpkey).push(pka_dev);

            if (pka_dev > threshold)
                model_ready = false;
        }

        if (model_ready)
        {
            break;
        }
        else
        {
            if (level >= max_level)
            {
                level = -1;
                break;
            }
            else
            {
                level++;
                _model.max_deviations.push(max_deviation);
            }
        }
    }

    _model.level = level;
    _model.advanced_model_ready = true;

    return level;
}