std::string SequenceSaver::saveHELM()

in core/indigo-core/molecule/src/sequence_saver.cpp [1235:1506]


std::string SequenceSaver::saveHELM(KetDocument& document, std::vector<std::deque<std::string>> sequences)
{
    std::string helm_string = "";
    int peptide_idx = 0;
    int rna_idx = 0;
    int chem_idx = 0;
    using MonomerInfo = std::tuple<HELMType, int, int>;
    std::map<std::string, MonomerInfo> monomer_id_to_monomer_info;
    const auto& monomers = document.monomers();
    const auto& templates = document.templates();
    const auto& variant_templates = document.ambiguousTemplates();
    std::map<std::string, std::map<int, std::string>> mol_atom_to_ap;
    for (auto& sequence : sequences)
    {
        int monomer_idx = 0;
        int polymer_idx = -1;
        std::string helm_polymer_class = "";
        HELMType helm_type = HELMType::Unknown;
        MonomerClass prev_monomer_class = MonomerClass::Unknown;
        for (auto monomer_id : sequence)
        {
            const auto& monomer = monomers.at(monomer_id);
            auto monomer_class = document.getMonomerClass(monomer_id);
            if (monomer_idx == 0)
            {
                if (helm_string.size() > 0)
                    helm_string += '|';
                // start new polymer
                std::string helm_polymer_class;
                if (monomer->monomerType() == KetBaseMonomer::MonomerType::Monomer && templates.at(monomer->templateId()).hasStringProp("classHELM"))
                    helm_polymer_class = templates.at(monomer->templateId()).getStringProp("classHELM");
                else
                    helm_polymer_class = get_helm_class(monomer_class);
                helm_string += helm_polymer_class;
                helm_type = getHELMTypeFromString(helm_polymer_class);
                if (helm_polymer_class == kHELMPolymerTypePEPTIDE)
                    polymer_idx = ++peptide_idx;
                else if (helm_polymer_class == kHELMPolymerTypeRNA)
                    polymer_idx = ++rna_idx;
                else if (helm_polymer_class == kHELMPolymerTypeCHEM)
                    polymer_idx = ++chem_idx;
                helm_string += std::to_string(polymer_idx);
                helm_string += '{';
            }
            if (monomer_idx && monomer_class != MonomerClass::Base &&
                !((prev_monomer_class == MonomerClass::Base || prev_monomer_class == MonomerClass::Sugar) && monomer_class == MonomerClass::Phosphate))
                helm_string += '.'; // separator between sugar and base and between sugar or base and phosphate
            if (monomer_class == MonomerClass::Base && prev_monomer_class != MonomerClass::Sugar)
                throw Error("Wrong monomer sequence: base monomer %s after %s monomer.", monomer->alias().c_str(),
                            MonomerTemplate::MonomerClassToStr(prev_monomer_class).c_str());
            if (monomer_class == MonomerClass::Base)
                helm_string += '(';
            if (monomer->monomerType() == KetBaseMonomer::MonomerType::Monomer)
            {
                if (templates.at(monomer->templateId()).unresolved())
                    helm_string += '*';
                else
                    add_monomer(document, monomer, helm_string);
            }
            else if (monomer->monomerType() == KetBaseMonomer::MonomerType::AmbiguousMonomer)
            {
                const auto& templ = variant_templates.at(monomer->templateId());
                if (monomer_class != MonomerClass::Base)
                    helm_string += '(';
                std::string variants;
                bool mixture = (templ.subtype() == "mixture");
                for (const auto& option : templ.options())
                {
                    if (variants.size() > 0)
                        variants += mixture ? '+' : ',';
                    auto alias = templates.at(option.templateId()).getStringProp("alias");
                    if (alias.size() > 1)
                        variants += '[';
                    variants += alias;
                    if (alias.size() > 1)
                        variants += ']';
                    auto num = mixture ? option.ratio() : option.probability();
                    if (num.has_value())
                    {
                        variants += ':';
                        std::ostringstream ss;
                        ss << num.value();
                        variants += ss.str();
                    }
                }
                helm_string += variants;
                if (monomer_class != MonomerClass::Base)
                    helm_string += ')';
            }
            if (monomer_class == MonomerClass::Base)
                helm_string += ')';
            monomer_idx++;
            monomer_id_to_monomer_info.emplace(std::make_pair(monomer_id, std::make_tuple(helm_type, polymer_idx, monomer_idx)));
            prev_monomer_class = monomer_class;
        }
        if (monomer_idx)
            helm_string += '}'; // Finish polymer
    }
    auto& molecules = document.jsonMolecules();
    int molecule_idx = 0;
    rapidjson::Document json{};
    std::map<std::string, std::vector<int>> molecules_connections;
    if (molecules.Size() > 0)
    {
        auto process_ep = [&molecules_connections](const KetConnectionEndPoint& ep) {
            if (ep.hasStringProp("moleculeId"))
            {
                const auto& mol_id = ep.getStringProp("moleculeId");
                if (molecules_connections.count(mol_id) == 0)
                    molecules_connections.try_emplace(mol_id);
                if (ep.hasStringProp("atomId"))
                    molecules_connections.at(mol_id).push_back(std::stoi(ep.getStringProp("atomId")));
            }
        };
        for (const auto& connection : document.nonSequenceConnections())
        {
            process_ep(connection.ep1());
            process_ep(connection.ep2());
        }
    }
    for (rapidjson::SizeType i = 0; i < molecules.Size(); i++)
    {
        const auto& molecule = molecules[i];
        std::string mol_id = "mol" + std::to_string(molecule_idx++);
        rapidjson::Value marr(rapidjson::kArrayType);
        marr.PushBack(json.CopyFrom(molecule, json.GetAllocator()), json.GetAllocator());
        MoleculeJsonLoader loader(marr);
        BaseMolecule* pbmol;
        Molecule mol;
        QueryMolecule qmol;
        try
        {
            loader.loadMolecule(mol);
            pbmol = &mol;
        }
        catch (...)
        {
            loader.loadMolecule(qmol);
            pbmol = &qmol;
        }
        // convert Sup sgroup without name attachment points to rg-labels
        auto& sgroups = pbmol->sgroups;
        int ap_count = 0;
        for (int i = sgroups.begin(); i != sgroups.end(); i = sgroups.next(i))
        {
            auto& sgroup = sgroups.getSGroup(i);
            if (sgroup.sgroup_type != SGroup::SG_TYPE_SUP)
                continue;
            Superatom& sa = static_cast<Superatom&>(sgroup);
            if (sa.subscript.size() != 0 && sa.subscript.ptr()[0] != 0)
                continue;
            // convert leaving atom H to rg-ref
            auto res = mol_atom_to_ap.try_emplace(mol_id);
            auto& atom_to_ap = res.first;
            static std::string apid_prefix{'R'};
            for (int ap_id = sa.attachment_points.begin(); ap_id != sa.attachment_points.end(); ap_id = sa.attachment_points.next(ap_id))
            {
                ap_count++;
                auto& ap = sa.attachment_points.at(ap_id);
                std::string apid = apid_prefix + ap.apid.ptr();
                atom_to_ap->second.emplace(ap.aidx, apid);
                int leaving_atom = ap.lvidx;
                int ap_idx = std::stoi(ap.apid.ptr());
                if (pbmol == &mol)
                {
                    mol.resetAtom(leaving_atom, ELEM_RSITE);
                    mol.allowRGroupOnRSite(leaving_atom, ap_idx);
                }
                else
                {
                    auto rsite = std::make_unique<QueryMolecule::Atom>(QueryMolecule::ATOM_RSITE, 0);
                    qmol.resetAtom(leaving_atom, rsite.release());
                    qmol.allowRGroupOnRSite(leaving_atom, ap_idx);
                }
            }
            sgroups.remove(i);
        }
        // check direct monomer to molecule connections without attachment point
        if (molecules_connections.count(mol_id) > 0 && ap_count == 0)
        {
            int ap_idx = 1;
            auto res = mol_atom_to_ap.try_emplace(mol_id);
            auto& atom_to_ap = res.first;
            static std::string apid_prefix{'R'};
            for (auto atom_id : molecules_connections.at(mol_id))
            {
                std::string apid = apid_prefix + std::to_string(ap_idx);
                atom_to_ap->second.emplace(atom_id, apid);
                // add leaving atom and set it as R-site
                auto leaving_atom = pbmol->addAtom(ELEM_RSITE);
                pbmol->addBond(atom_id, leaving_atom, BOND_SINGLE);
                pbmol->allowRGroupOnRSite(leaving_atom, ap_idx++);
            }
        }
        // generate smiles
        std::string smiles;
        StringOutput s_out(smiles);
        SmilesSaver saver(s_out);
        saver.separate_rsites = false;
        if (pbmol == &mol)
            saver.saveMolecule(mol);
        else
            saver.saveQueryMolecule(qmol);
        // save as chem
        if (helm_string.size() > 0)
            helm_string += '|';
        helm_string += "CHEM";
        int polymer_idx = ++chem_idx;
        helm_string += std::to_string(polymer_idx);
        helm_string += "{[";
        helm_string += smiles;
        helm_string += "]}";
        monomer_id_to_monomer_info.emplace(std::make_pair(mol_id, std::make_tuple(HELMType::Chem, polymer_idx, 1)));
    }
    helm_string += '$';
    // Add connections
    int connections_count = 0;
    for (const auto& connection : document.nonSequenceConnections())
    {
        // add connection
        if (connections_count)
            helm_string += '|';
        const auto& ep_1 = connection.ep1();
        const auto& ep_2 = connection.ep2();
        if (!(ep_1.hasStringProp("monomerId") || ep_1.hasStringProp("moleculeId")) || !(ep_2.hasStringProp("monomerId") || ep_2.hasStringProp("moleculeId")))
            throw Error("Endpoint without monomer or molecule id");
        bool has_mon_id1 = ep_1.hasStringProp("monomerId");
        bool has_mon_id2 = ep_2.hasStringProp("monomerId");
        const auto& monomer_id_1 = has_mon_id1 ? ep_1.getStringProp("monomerId") : ep_1.getStringProp("moleculeId");
        const auto& monomer_id_2 = has_mon_id2 ? ep_2.getStringProp("monomerId") : ep_2.getStringProp("moleculeId");
        const auto& id1 = has_mon_id1 ? document.monomerIdByRef(monomer_id_1) : monomer_id_1;
        const auto& id2 = has_mon_id2 ? document.monomerIdByRef(monomer_id_2) : monomer_id_2;
        auto [type_1, pol_num_1, mon_num_1] = monomer_id_to_monomer_info.at(id1);
        auto [type_2, pol_num_2, mon_num_2] = monomer_id_to_monomer_info.at(id2);
        connections_count++;
        helm_string += getStringFromHELMType(type_1);
        helm_string += std::to_string(pol_num_1);
        helm_string += ',';
        helm_string += getStringFromHELMType(type_2);
        helm_string += std::to_string(pol_num_2);
        helm_string += ',';
        helm_string += std::to_string(mon_num_1);
        helm_string += ":";
        if (ep_1.hasStringProp("atomId"))
            helm_string += mol_atom_to_ap.at(id1).at(std::stoi(ep_1.getStringProp("atomId")));
        else if (ep_1.hasStringProp("attachmentPointId"))
            helm_string += ep_1.getStringProp("attachmentPointId");
        else if (connection.connType() == KetConnection::TYPE::HYDROGEN)
            helm_string += HelmHydrogenPair;
        else
            helm_string += '?';
        helm_string += '-';
        helm_string += std::to_string(mon_num_2);
        helm_string += ':';
        if (ep_2.hasStringProp("atomId"))
            helm_string += mol_atom_to_ap.at(id2).at(std::stoi(ep_2.getStringProp("atomId")));
        else if (ep_2.hasStringProp("attachmentPointId"))
            helm_string += ep_2.getStringProp("attachmentPointId");
        else if (connection.connType() == KetConnection::TYPE::HYDROGEN)
            helm_string += HelmHydrogenPair;
        else
            helm_string += '?';
    }
    helm_string += '$';
    // Add polymer groups
    helm_string += '$';
    // Add ExtendedAnnotation
    helm_string += '$';
    // Add helm version
    helm_string += "V2.0";
    return helm_string;
}