void SequenceLoader::loadIdt()

in core/indigo-core/molecule/src/sequence_loader.cpp [600:880]


// Reads IDT-notation text from _scanner and appends the described monomers to mol.
//
// mol is cleared first. Every input line is processed as an independent sequence
// (_seq_id, _last_monomer_idx and _col are reset per line) in two passes:
//
//  1. Tokenizing. Characters are grouped into tokens stored as (text, starred):
//       - a base letter A/T/C/G/U/I, optionally preceded by one sugar prefix 'r', '+' or 'm';
//       - a modification written as /name/ - stored as "name/" (the trailing '/' is the marker
//         used by the second pass to tell modifications from standard tokens).
//     'starred' is true when the token is immediately followed by '*'.
//     Spaces separate tokens. Any other character is appended to the invalid-symbols list.
//
//  2. Building. Each token is resolved to sugar/base/phosphate templates (standard tokens and
//     monomer-group-template aliases) or to a single monomer (monomer-template aliases and
//     unresolved aliases) and added to mol.
//
// Throws Error on malformed input (misplaced sugar prefix, unterminated or too short /name/,
// misplaced '*', unknown templates). Invalid symbols are accumulated over the whole input and
// reported in a single Error after all lines have been processed.
void SequenceLoader::loadIdt(BaseMolecule& mol)
{
    const auto IDT_DEF_SUGAR = "dR";
    const auto IDT_DEF_PHOSPHATE = "P";
    const auto IDT_MODIFIED_PHOSPHATE = "sP";
    // Longest standard (non-modification) token: one sugar prefix plus one base letter.
    constexpr int MAX_STD_TOKEN_SIZE = 2;
    _row = 0;
    mol.clear();
    std::string invalid_symbols; // comma-separated, reported once at the very end
    while (!_scanner.isEOF())
    {
        _seq_id = 0;
        _last_monomer_idx = -1;
        _col = 0;

        using token_t = std::pair<std::string, bool>;
        std::queue<token_t> tokens; // second=true if token followed by *
        std::string cur_token;

        // ---- Pass 1: split the current line into tokens ----
        while (true)
        {
            if (_scanner.isEOL())
            {
                // A sugar prefix left pending at end of line becomes a token of its own;
                // the second pass records it as an invalid symbol.
                if (cur_token.size())
                    tokens.emplace(cur_token, false);
                break;
            }
            auto ch = _scanner.readChar();
            switch (ch)
            {
            case ' ':
                // A space terminates a pending sugar prefix. The buffer must be reset here,
                // otherwise the same prefix would also be glued onto the next base.
                if (cur_token.size())
                {
                    tokens.emplace(cur_token, false);
                    cur_token = "";
                }
                continue;
            case '/': {
                if (cur_token.size())
                    throw Error("Sugar prefix could not be used with modified monomer.");
                // read till next '/'
                ch = 0;
                while (!_scanner.isEOL())
                {
                    ch = _scanner.readChar();
                    if (ch == '/')
                        break;
                    cur_token += ch;
                }
                if (ch != '/')
                    throw Error("Unexpected end of data");
                if (cur_token == "")
                    throw Error("Invalid modification: empty string.");
                if (cur_token.size() < 3)
                    throw Error("Invalid modification: %s.", cur_token.c_str());
                // Keep the closing '/' as a marker that this token is a modification.
                cur_token += ch;
                break;
            }
            case 'A':
            case 'T':
            case 'C':
            case 'G':
            case 'U':
            case 'I':
                // A base letter completes the token (with or without a pending prefix).
                cur_token += ch;
                break;
            case 'r':
            case '+':
            case 'm':
                // Sugar prefix: only one is allowed and it must be followed by a base.
                if (cur_token.size())
                    throw Error("Sugar prefix '%s' whithout base.", cur_token.c_str());
                else
                    cur_token += ch;
                continue; // token is not complete yet - wait for the base
            default:
                if (invalid_symbols.size())
                    invalid_symbols += ',';
                invalid_symbols += ch;
                continue;
            }

            // Reached only when a token has just been completed (base letter or /name/).
            if (_scanner.lookNext() == '*')
            {
                tokens.emplace(cur_token, true);
                _scanner.skip(1);
            }
            else
                tokens.emplace(cur_token, false);
            cur_token = "";
        }
        while (!_scanner.isEOF() && _scanner.isEOL()) // Skip EOL characters
            _scanner.skip(1);

        // ---- Pass 2: turn tokens into monomers ----
        // The first token of a line is at the five-prime end; set to INTERNAL after each added
        // token and to THREE_PRIME_END when the token queue becomes empty.
        IdtModification modification = IdtModification::FIVE_PRIME_END;

        token_t prev_token;
        while (tokens.size() > 0)
        {
            token_t token = tokens.front();
            tokens.pop();

            std::string phosphate = IDT_DEF_PHOSPHATE;
            std::string sugar = IDT_DEF_SUGAR;
            std::string idt_alias = "";
            std::string base = "";
            std::string single_monomer = "";
            std::string single_monomer_class;
            bool unresolved = false;

            if (token.first.back() == '/')
            {
                // Modification token: strip the marker, the rest is the IDT alias.
                token.first.pop_back();
                idt_alias = token.first;
                if ((idt_alias == "5Phos" || idt_alias == "3Phos") && (token.second || prev_token.second))
                    throw Error("Symbol '*' could be placed only between two nucleotides/nucleosides.");
            }
            else
            {
                // Standard token: [prefix]base. The last character is the alias, the optional
                // first character selects the sugar.
                if (token.first.size() > MAX_STD_TOKEN_SIZE)
                    throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
                idt_alias = token.first.back();
                if (token.first.size() > 1)
                {
                    switch (token.first[0])
                    {
                    case 'r':
                        sugar = "R";
                        break;
                    case '+':
                        sugar = "LR";
                        break;
                    case 'm':
                        sugar = "mR";
                        break;
                    default:
                        throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
                    }
                }
            }

            if (idt_alias.size() == 1)
            {
                if (IDT_STANDARD_BASES.count(idt_alias) == 0)
                {
                    // Not a standard base (e.g. a sugar prefix without a base): remember it for
                    // the final error report.
                    if (invalid_symbols.size())
                        invalid_symbols += ',';
                    invalid_symbols += idt_alias[0];
                }
                else
                {
                    base = idt_alias;

                    if (tokens.size() == 0)
                    {
                        // Last token of the line: no trailing phosphate, hence no '*' allowed.
                        if (token.second)
                            throw Error("Invalid IDT sequence: '*' couldn't be the last symbol.");
                        modification = IdtModification::THREE_PRIME_END;
                        phosphate = "";
                    }
                    else if (token.second)
                    {
                        phosphate = IDT_MODIFIED_PHOSPHATE;
                    }

                    if (!checkAddTemplate(mol, MonomerClass::Sugar, sugar))
                        throw Error("Unknown sugar '%s'", sugar.c_str());
                    if (idt_alias.size() > 0 && !checkAddTemplate(mol, MonomerClass::Base, base))
                        throw Error("Unknown base '%s'", idt_alias.c_str());
                    if (phosphate.size() > 0 && !checkAddTemplate(mol, MonomerClass::Phosphate, phosphate))
                        throw Error("Unknown phosphate '%s'", phosphate.c_str());
                }
            }
            else
            {
                if (tokens.size() == 0)
                {
                    modification = IdtModification::THREE_PRIME_END;
                    // Corner case: /3Phos/ after standard monomer - no additional P should be added
                    if (prev_token.first.size() > 0 && prev_token.first.size() <= MAX_STD_TOKEN_SIZE && idt_alias == "3Phos")
                        continue;
                }

                sugar = "";
                IdtModification alias_mod; // filled by the library lookups below
                // First try to resolve the alias as a monomer group template (sugar/base/phosphate set).
                const std::string& mgt_id = _library.getMGTidByIdtAlias(idt_alias, alias_mod);
                if (mgt_id.size())
                {
                    // Check that alias modification can be used in current position
                    check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
                    MonomerGroupTemplate& mgt = _library.getMonomerGroupTemplateById(mgt_id);
                    const MonomerTemplate& sugar_template = mgt.getTemplateByClass(MonomerClass::Sugar);
                    sugar = sugar_template.getStringProp("alias");
                    checkAddTemplate(mol, sugar_template);
                    if (alias_mod == IdtModification::THREE_PRIME_END)
                    {
                        // Three-prime-end alias: added without phosphate, so '*' is rejected.
                        if (token.second)
                            throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
                        phosphate = "";
                    }
                    else
                    {
                        if (mgt.hasTemplateClass(MonomerClass::Phosphate))
                        {
                            if (token.second) // * means that 'sP' should be used
                            {
                                phosphate = IDT_MODIFIED_PHOSPHATE;
                                checkAddTemplate(mol, MonomerClass::Phosphate, phosphate);
                            }
                            else // use phosphate from template
                            {
                                const MonomerTemplate& phosphate_template = mgt.getTemplateByClass(MonomerClass::Phosphate);
                                phosphate = phosphate_template.getStringProp("alias");
                                checkAddTemplate(mol, phosphate_template);
                            }
                        }
                        else
                        {
                            if (token.second)
                                throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
                            phosphate = "";
                        }
                    }
                    if (mgt.hasTemplateClass(MonomerClass::Base))
                    {
                        const MonomerTemplate& base_template = mgt.getTemplateByClass(MonomerClass::Base);
                        base = base_template.getStringProp("alias");
                        checkAddTemplate(mol, base_template);
                    }
                }
                else
                {
                    // Not a group template: try a single monomer template with this alias.
                    // alias_mod is reused as the out-parameter (no shadowing redeclaration).
                    auto monomer_template_id = _library.getMonomerTemplateIdByIdtAlias(idt_alias, alias_mod);
                    if (monomer_template_id.size())
                    {
                        if (token.second)
                            throw Error("'*' couldn't be applied to monomer /%s/.", idt_alias.c_str());
                        check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
                        const MonomerTemplate& monomer_template = _library.getMonomerTemplateById(monomer_template_id);
                        checkAddTemplate(mol, monomer_template);
                        single_monomer = monomer_template.getStringProp("alias");
                        single_monomer_class = MonomerTemplates::classToStr(monomer_template.monomerClass());
                    }
                    else // IDT alias not found
                    {
                        // Keep the unknown alias as a placeholder CHEM monomer instead of failing.
                        unresolved = true;
                        single_monomer = "unknown_monomer_with_idt_alias_" + idt_alias;
                        auto monomer_class = MonomerClass::CHEM;
                        single_monomer_class = MonomerTemplates::classToStr(monomer_class);
                        // Unresolved monomer could be in any position
                        MonomerTemplate monomer_template(single_monomer, monomer_class, IdtAlias(idt_alias, idt_alias, idt_alias, idt_alias), true);
                        monomer_template.setStringProp("alias", idt_alias);
                        for (auto ap : {"R1", "R2", "R3", "R4"})
                            monomer_template.AddAttachmentPoint(ap, -1);
                        checkAddTemplate(mol, monomer_template);
                    }
                }
            }

            if (single_monomer.size())
            {
                // Single monomer: add one template atom (named by the IDT alias when unresolved)
                // and bond it to the previously added monomer, if any.
                int monomer_idx = addTemplateAtom(mol, unresolved ? idt_alias.c_str() : single_monomer.c_str(), single_monomer_class.c_str(), _seq_id);
                mol.asMolecule().setAtomXyz(monomer_idx, getBackboneMonomerPosition());
                if (_last_monomer_idx >= 0)
                    addTemplateBond(mol, _last_monomer_idx, monomer_idx);
                _last_monomer_idx = monomer_idx;
            }
            else
                addNucleotide(mol, base, sugar, phosphate, false);

            _seq_id++;
            _col++;

            prev_token = token; // save to check */3Phos/ case
            modification = IdtModification::INTERNAL;
        }
        // NOTE(review): the row counter advances by two per input line - presumably to leave a
        // gap between sequences in the generated layout; confirm against getBackboneMonomerPosition().
        _row += 2;
    }

    if (invalid_symbols.size())
        throw Error("Invalid symbols in the sequence: %s", invalid_symbols.c_str());
}