void SequenceLoader::loadIdt()

in core/indigo-core/molecule/src/sequence_loader.cpp [882:1291]


void SequenceLoader::loadIdt(KetDocument& document)
{
    const auto IDT_DEF_SUGAR = "dR";
    const auto IDT_DEF_PHOSPHATE = "P";
    const auto IDT_MODIFIED_PHOSPHATE = "sP";
    constexpr int MAX_STD_TOKEN_SIZE = 2;
    _row = 0;
    std::string invalid_symbols;
    while (!_scanner.isEOF())
    {
        _seq_id = 0;
        _last_monomer_idx = -1;
        _col = 0;

        using token_t = std::pair<std::string, bool>;
        std::queue<token_t> tokens; // second=true if token folowed by *
        std::string cur_token;

        while (true)
        {
            if (_scanner.isEOL())
            {
                if (cur_token.size())
                    tokens.emplace(cur_token, false);
                break;
            }
            auto ch = _scanner.readChar();

            switch (ch)
            {
            case ' ':
                if (cur_token.size())
                    tokens.emplace(cur_token, false);
                continue;
            case '/': {
                if (cur_token.size())
                    throw Error("Sugar prefix could not be used with modified monomer.");
                // read till next '/'
                ch = 0;
                while (!_scanner.isEOL())
                {
                    ch = _scanner.readChar();
                    if (ch == '/')
                        break;
                    cur_token += ch;
                }
                if (ch != '/')
                    throw Error("Unexpected end of data");
                if (cur_token == "")
                    throw Error("Invalid modification: empty string.");
                if (cur_token.size() < 3)
                    throw Error("Invalid modification: %s.", cur_token.c_str());
                cur_token += ch;
                break;
            }
            case '(': { // read till ')'
                cur_token += ch;
                ch = 0;
                while (!_scanner.isEOL())
                {
                    ch = _scanner.readChar();
                    if (ch == ')')
                        break;
                    cur_token += ch;
                }
                if (ch != ')')
                    throw Error("Unexpected end of data");
                if (cur_token == "")
                    throw Error("Invalid ambiguous monomer: empty string.");
                cur_token += ch;
                break;
            }
            case 'A':
            case 'T':
            case 'C':
            case 'G':
            case 'U':
            case 'I':
            case 'R':
            case 'Y':
            case 'M':
            case 'K':
            case 'S':
            case 'W':
            case 'H':
            case 'B':
            case 'V':
            case 'D':
            case 'N':
                cur_token += ch;
                break;
            case 'r':
            case '+':
            case 'm':
                if (cur_token.size())
                    throw Error("Sugar prefix '%s' whithout base.", cur_token.c_str());
                else
                    cur_token += ch;
                continue;
                break;
            default:
                if (invalid_symbols.size())
                    invalid_symbols += ',';
                invalid_symbols += ch;
                continue;
                break;
            }

            if (_scanner.lookNext() == '*')
            {
                tokens.emplace(cur_token, true);
                _scanner.skip(1);
            }
            else
                tokens.emplace(cur_token, false);
            cur_token = "";
        }
        while (!_scanner.isEOF() && _scanner.isEOL()) // Skip EOL characters
            _scanner.skip(1);

        IdtModification modification = IdtModification::FIVE_PRIME_END;

        token_t prev_token;
        while (tokens.size() > 0)
        {
            token_t token = tokens.front();
            tokens.pop();

            std::string phosphate = IDT_DEF_PHOSPHATE;
            std::string sugar = IDT_DEF_SUGAR;
            std::string idt_alias = "";
            std::string base = "";
            std::string single_monomer = "";
            std::string single_monomer_alias = "";
            std::string single_monomer_class;
            bool unresolved = false;
            bool ambiguous_monomer = false;

            if (token.first.back() == '/')
            {
                token.first.pop_back();
                idt_alias = token.first;
                if ((idt_alias == "5Phos" || idt_alias == "3Phos") && (token.second || prev_token.second))
                    throw Error("Symbol '*' could be placed only between two nucleotides/nucleosides.");
            }
            else
            {

                if (token.first.size() > MAX_STD_TOKEN_SIZE)
                    if (token.first.back() == ')')
                        idt_alias = token.first;
                    else
                        throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
                else
                    idt_alias = token.first.back();
                if (token.first.size() > 1)
                {
                    auto ch = token.first[0];
                    if (ch != '(')
                    {
                        switch (ch)
                        {
                        case 'r':
                            sugar = "R";
                            break;
                        case '+':
                            sugar = "LR";
                            break;
                        case 'm':
                            sugar = "mR";
                            break;
                        default:
                            throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
                        }
                        if (idt_alias.back() == ')')
                            idt_alias.erase(0, 1);
                    }
                }
            }

            if (STANDARD_MIXED_BASES.count(idt_alias) != 0 || idt_alias.back() == ')')
                ambiguous_monomer = true;

            if (idt_alias.size() == 1 || ambiguous_monomer)
            {
                if (IDT_STANDARD_BASES.count(idt_alias) == 0 && !ambiguous_monomer)
                {
                    if (invalid_symbols.size())
                        invalid_symbols += ',';
                    invalid_symbols += idt_alias[0];
                    continue;
                }

                if (ambiguous_monomer)
                {
                    auto mixed_base = idt_alias;
                    std::optional<std::array<float, 4>> ratios;
                    if (mixed_base.back() == ')')
                    {
                        mixed_base = idt_alias.substr(1, idt_alias.size() - 2);
                        auto check_mixed_base = [](const std::string& base) {
                            if (base.size() < 2)
                                return;
                            auto count = base.substr(1, base.size() - 1);
                            for (auto ch : count)
                            {
                                if (!std::isdigit(ch))
                                    throw Error("Invalid mixed base - only numerical index allowed.");
                            }
                        };
                        if (auto pos = mixed_base.find(':'); pos != std::string::npos)
                        {
                            auto ratios_str = mixed_base.substr(pos + 1, mixed_base.size() - pos - 1);
                            mixed_base = mixed_base.substr(0, pos);
                            check_mixed_base(mixed_base);
                            if (ratios_str.size() != 8)
                                throw Exception("Invalid IDT ambiguous monomer %s", idt_alias.c_str());
                            auto stof = [](const std::string& arg) -> float {
                                try
                                {
                                    return std::stof(arg);
                                }
                                catch (...)
                                {
                                    throw Error("Invalid number '%s'", arg.c_str());
                                }
                            };
                            ratios.emplace(std::array<float, 4>{stof(ratios_str.substr(0, 2)), stof(ratios_str.substr(2, 2)), stof(ratios_str.substr(4, 2)),
                                                                stof(ratios_str.substr(6, 2))});
                            idt_alias = '(' + mixed_base + ')';
                            mixed_base = mixed_base[0];
                        }
                        else
                        {
                            check_mixed_base(mixed_base);
                        }
                    }
                    if (sugar == "R" && RNA_DNA_MIXED_BASES.count(mixed_base) == 0)
                        idt_alias = 'r' + idt_alias;
                    if (!document.hasAmbiguousMonomerTemplate(idt_alias))
                    {
                        auto it = STANDARD_MIXED_BASES.find(mixed_base);
                        if (it == STANDARD_MIXED_BASES.end())
                            throw Error("Unknown mixed base '%s'", mixed_base.c_str());

                        std::vector<KetAmbiguousMonomerOption> options;
                        for (auto template_alias : it->second)
                        {
                            if (sugar == "R" && template_alias == "T") // U instead of T for RNA
                                template_alias = "U";
                            auto& template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Base, template_alias);
                            if (template_id.size() == 0)
                                throw Error("Monomer base template '%s' not found", template_alias.c_str());
                            auto& option = options.emplace_back(template_id);
                            if (ratios.has_value())
                            {
                                option.setRatio(ratios.value()[IDT_BASE_TO_RATIO_IDX.at(template_alias)]);
                            }
                            auto& monomer_template = _library.getMonomerTemplateById(template_id);
                            checkAddTemplate(document, monomer_template);
                            _alias_to_id.emplace(template_alias, template_id);
                        }
                        auto& templ = document.addAmbiguousMonomerTemplate("mixture", idt_alias, idt_alias, IdtAlias(), options);
                        static const std::map<std::string, KetAttachmentPoint> aps{{"R1", -1}};
                        templ.setAttachmentPoints(aps);
                        _var_alias_to_id.emplace(idt_alias, idt_alias);
                    }
                    else
                    {
                        if (ratios.has_value())
                            throw Error("Ambiguous monomer %s redefinion", idt_alias.c_str());
                    }
                }
                base = idt_alias;

                if (base == "I")
                    base = "In"; // use correct alias for Inosine

                if (tokens.size() == 0)
                {
                    if (token.second)
                        throw Error("Invalid IDT sequence: '*' couldn't be the last symbol.");
                    modification = IdtModification::THREE_PRIME_END;
                    phosphate = "";
                }
                else if (token.second)
                {
                    phosphate = IDT_MODIFIED_PHOSPHATE;
                }

                _alias_to_id.emplace(sugar, checkAddTemplate(document, MonomerClass::Sugar, sugar));
                if (base.size() > 0 && !ambiguous_monomer)
                    _alias_to_id.emplace(base, checkAddTemplate(document, MonomerClass::Base, base));
                if (phosphate.size() > 0)
                    _alias_to_id.emplace(phosphate, checkAddTemplate(document, MonomerClass::Phosphate, phosphate));
            }
            else
            {
                if (tokens.size() == 0)
                {
                    modification = IdtModification::THREE_PRIME_END;
                    // Corner case: /3Phos/ after standard monomer - no additional P should be added
                    if (prev_token.first.size() > 0 && prev_token.first.size() <= MAX_STD_TOKEN_SIZE && idt_alias == "3Phos")
                        continue;
                }

                sugar = "";
                IdtModification alias_mod;
                const std::string& mgt_id = _library.getMGTidByIdtAlias(idt_alias, alias_mod);
                if (mgt_id.size())
                {
                    // Check that alias modification can be used in current position
                    check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
                    MonomerGroupTemplate& mgt = _library.getMonomerGroupTemplateById(mgt_id);
                    const MonomerTemplate& sugar_template = mgt.getTemplateByClass(MonomerClass::Sugar);
                    sugar = sugar_template.getStringProp("alias");
                    _alias_to_id.emplace(sugar, sugar_template.id());
                    checkAddTemplate(document, sugar_template);
                    if (alias_mod == IdtModification::THREE_PRIME_END)
                    {
                        if (token.second)
                            throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
                        phosphate = "";
                    }
                    else
                    {
                        if (mgt.hasTemplateClass(MonomerClass::Phosphate))
                        {
                            if (token.second) // * means that 'sP' should be used
                            {
                                phosphate = IDT_MODIFIED_PHOSPHATE;
                                _alias_to_id.emplace(phosphate, checkAddTemplate(document, MonomerClass::Phosphate, phosphate));
                            }
                            else // use phosphate from template
                            {
                                const MonomerTemplate& phosphate_template = mgt.getTemplateByClass(MonomerClass::Phosphate);
                                phosphate = phosphate_template.getStringProp("alias");
                                _alias_to_id.emplace(phosphate, phosphate_template.id());
                                checkAddTemplate(document, phosphate_template);
                            }
                        }
                        else
                        {
                            if (token.second)
                                throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
                            phosphate = "";
                        }
                    }
                    if (mgt.hasTemplateClass(MonomerClass::Base))
                    {
                        const MonomerTemplate& base_template = mgt.getTemplateByClass(MonomerClass::Base);
                        base = base_template.getStringProp("alias");
                        _alias_to_id.emplace(base, base_template.id());
                        checkAddTemplate(document, base_template);
                    }
                }
                else
                {
                    IdtModification alias_mod;
                    auto& monomer_template_id = _library.getMonomerTemplateIdByIdtAlias(idt_alias, alias_mod);
                    if (monomer_template_id.size())
                    {
                        if (token.second)
                            throw Error("'*' couldn't be applied to monomer /%s/.", idt_alias.c_str());
                        check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
                        const MonomerTemplate& monomer_template = _library.getMonomerTemplateById(monomer_template_id);
                        checkAddTemplate(document, monomer_template);
                        single_monomer = monomer_template_id;
                        single_monomer_alias = monomer_template.getStringProp("alias");
                    }
                    else // IDT alias not found
                    {
                        single_monomer = "unknown_monomer_with_idt_alias_" + idt_alias;
                        single_monomer_alias = idt_alias;
                        auto monomer_class = MonomerClass::CHEM;
                        // Unresoved monomer could be in any position
                        MonomerTemplate monomer_template(single_monomer, monomer_class, IdtAlias(idt_alias, idt_alias, idt_alias, idt_alias), true);
                        monomer_template.setStringProp("alias", idt_alias);
                        for (auto ap : {"R1", "R2", "R3", "R4"})
                            monomer_template.AddAttachmentPoint(ap, -1);
                        checkAddTemplate(document, monomer_template);
                    }
                }
            }

            if (single_monomer.size())
            {
                auto monomer_idx = document.monomers().size();
                auto& monomer = document.addMonomer(single_monomer_alias, single_monomer);
                monomer->setIntProp("seqid", _seq_id);
                monomer->setPosition(getBackboneMonomerPosition());
                if (_last_monomer_idx >= 0)
                    addMonomerConnection(document, _last_monomer_idx, monomer_idx);
                _last_monomer_idx = static_cast<int>(monomer_idx);
            }
            else
                addNucleotide(document, base, sugar, phosphate, false, ambiguous_monomer);

            _seq_id++;
            _col++;

            prev_token = token; // save to check */3Phos/ case
            modification = IdtModification::INTERNAL;
        }
        _row += 2;
    }

    if (invalid_symbols.size())
        throw Error("Invalid symbols in the sequence: %s", invalid_symbols.c_str());
}