void SequenceLoader::loadHELM()

in core/indigo-core/molecule/src/sequence_loader.cpp [1692:1982]


// Parses a HELM string from _scanner and populates `document` with the monomers and
// connections it describes.
//
// The input is handled as a small state machine over the '$'-separated HELM sections:
//   simple polymers $ connections $ polymer groups $ extended annotation $ version
// Simple polymers and connections are converted into document content. Polymer groups,
// inline annotations and the extended annotation are read and discarded; the trailing
// version signature is extracted but not validated.
//
// Monomers are laid out on a grid: one row per simple polymer (RNA polymers take an
// additional row for bases), one column per backbone monomer.
//
// Throws Error on malformed input or when the data ends before all sections were read.
void SequenceLoader::loadHELM(KetDocument& document)
{
    _row = 0;
    _seq_id = 1;
    std::string simple_polymer_name = "";
    std::string simple_polymer_type = "";
    int monomer_idx = 0; // 1-based number of the current monomer inside the current simple polymer
    int unknown_count = 0;
    _unknown_ambiguous_count = 0;
    // polymer name -> (monomer number inside the polymer -> monomer index returned by addKetMonomer)
    using polymer_map = std::map<std::string, std::map<int, size_t>>;
    polymer_map used_polymer_nums;
    polymer_map::iterator cur_polymer_map; // assigned when a polymer name is read, before any monomer is stored
    _opts_to_template_id.clear();
    enum class helm_parts
    {
        ListOfSimplePolymers,
        ListOfConnections,
        ListOfPolymerGroups,
        ExtendedAnnotation,
        End
    };

    // Converts a connection position into a monomer number. std::stoi throws
    // std::invalid_argument / std::out_of_range for non-numeric or oversized input
    // (e.g. a monomer name or '*'), so translate that into the loader's own Error
    // instead of leaking a standard-library exception to the caller.
    auto parse_monomer_position = [](const std::string& text) -> int {
        size_t parsed_len = 0;
        int value = 0;
        try
        {
            value = std::stoi(text, &parsed_len);
        }
        catch (const std::exception&)
        {
            throw Error("Only direct connections supported now.");
        }
        if (parsed_len != text.size())
            throw Error("Only direct connections supported now.");
        return value;
    };

    helm_parts helm_part = helm_parts::ListOfSimplePolymers;
    while (!_scanner.isEOF())
    {
        if (helm_part == helm_parts::ListOfSimplePolymers)
        {
            auto ch = _scanner.lookNext();
            if (simple_polymer_name.size() == 0) // Read simple polymer_name
            {
                _col = 0;
                simple_polymer_type = readHelmSimplePolymerName(simple_polymer_name);
                if (used_polymer_nums.count(simple_polymer_name))
                    throw Error("Simple polymer '%s' defined more than once.", simple_polymer_name.c_str());
                if (simple_polymer_name == simple_polymer_type)
                    throw Error("Polymer '%s' without number not allowed.", simple_polymer_name.c_str());
                ch = _scanner.lookNext();
                if (ch != '{')
                    throw Error("Unexpected symbol. Expected '{' but found '%c'.", ch);
                _scanner.skip(1); // skip '{'
                auto res = used_polymer_nums.emplace(std::make_pair(simple_polymer_name, std::map<int, size_t>()));
                if (res.second)
                    cur_polymer_map = res.first;
                else
                    throw Error("Internal error - cannot emplace polymer map.");
            }
            else if (ch != '}')
            {
                monomer_idx++;
                Vec3f pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0);
                _col++;
                if (simple_polymer_type == kHELMPolymerTypeUnknown)
                {
                    std::string name;
                    _scanner.readWord(name, reserved_helm_chars);
                    // skip blob for now
                    ch = _scanner.lookNext();
                    if (ch != '}')
                        throw Error("Unexpected symbol. Expected '}' but found '%c'.", ch);
                    continue;
                }
                const auto& monomer_class = MonomerTemplates::getStrToMonomerType().at(simple_polymer_type);
                // The first monomer of an RNA group is read as a sugar; the RNA branch below
                // re-checks whether its alias is actually a phosphate or a whole nucleotide.
                auto monomer_info = readHelmMonomer(document, monomer_class == MonomerClass::RNA ? MonomerClass::Sugar : monomer_class);
                if (monomer_class == MonomerClass::CHEM)
                {
                    ch = _scanner.lookNext();
                    if (ch != '}')
                        throw Error("Unexpected symbol. Expected '}' but found '%c'.", ch); // only one monomer in chem

                    auto& alias = std::get<0>(monomer_info);
                    if (alias == "*") // if monomer_alias == "*"
                    {
                        // Wildcard CHEM: register a placeholder template with a generated
                        // alias and attachment points R1..R4.
                        alias = "unknown_monomer_" + std::to_string(unknown_count++);
                        MonomerTemplate monomer_template(alias, MonomerClass::CHEM, IdtAlias(alias, alias, alias, alias), true);
                        monomer_template.setStringProp("alias", alias);
                        for (auto ap : {"R1", "R2", "R3", "R4"})
                            monomer_template.AddAttachmentPoint(ap, -1);
                        checkAddTemplate(document, monomer_template);
                        _added_templates.emplace(monomer_class, alias);
                    }
                    cur_polymer_map->second[monomer_idx] = addKetMonomer(document, monomer_info, monomer_class, pos);
                }
                else if (monomer_class == MonomerClass::AminoAcid)
                {
                    auto amino_idx = addKetMonomer(document, monomer_info, monomer_class, pos);
                    cur_polymer_map->second[monomer_idx] = amino_idx;
                    // Connect to the previously added monomer (presumably indices returned by
                    // addKetMonomer are consecutive - verify against addKetMonomer).
                    if (monomer_idx > 1)
                        addMonomerConnection(document, amino_idx - 1, amino_idx);
                    ch = _scanner.lookNext();
                    if (ch == '.')
                        _scanner.skip(1);
                    else if (ch != '}')
                        throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch);
                }
                else // kHELMPolymerTypeRNA
                {
                    const std::string& phosphate_lib_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Phosphate, std::get<0>(monomer_info));
                    const std::string& nucleotide_id = _library.getMonomerTemplateIdByAlias(MonomerClass::RNA, std::get<0>(monomer_info));
                    if (phosphate_lib_id.size() || nucleotide_id.size())
                    {
                        // The alias is a standalone phosphate or a whole nucleotide: add it as a
                        // single monomer (nucleotide takes precedence when both match).
                        auto added_idx = addKetMonomer(document, monomer_info, nucleotide_id.size() > 0 ? MonomerClass::RNA : MonomerClass::Phosphate, pos);
                        cur_polymer_map->second[monomer_idx] = added_idx;
                        if (monomer_idx > 1)
                            addMonomerConnection(document, added_idx - 1, added_idx);
                        ch = _scanner.lookNext();
                        if (ch != '.' && ch != '}')
                            throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch);
                        if (ch == '.')
                            _scanner.skip(1);
                        continue;
                    }
                    auto sugar_idx = addKetMonomer(document, monomer_info, MonomerClass::Sugar, pos);
                    cur_polymer_map->second[monomer_idx] = sugar_idx;
                    if (monomer_idx > 1)
                        addMonomerConnection(document, sugar_idx - 1, sugar_idx);
                    ch = _scanner.lookNext();
                    if (ch == '(') // In RNA after sugar could be base in ()
                    {
                        monomer_idx++;
                        auto base_info = readHelmMonomer(document, MonomerClass::Base);
                        ch = _scanner.lookNext();
                        // The base is placed one row below its sugar.
                        Vec3f base_pos(pos.x, pos.y - LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, 0);
                        auto base_idx = addKetMonomer(document, base_info, MonomerClass::Base, base_pos);
                        cur_polymer_map->second[monomer_idx] = base_idx;
                        if (monomer_idx > 1)
                            addMonomerConnection(document, sugar_idx, base_idx, true);
                    }
                    // NOTE(review): after a '.' the next token is still read as this nucleotide's
                    // phosphate; confirm that inputs such as "R(A).R(C)" (no phosphate between
                    // nucleosides) are handled as intended by readHelmMonomer.
                    if (ch == '.')
                    {
                        _scanner.skip(1);
                    }
                    if (ch == '}')
                        continue;
                    auto phosphate_info = readHelmMonomer(document, MonomerClass::Phosphate);
                    monomer_idx++;
                    Vec3f phosphate_pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0);
                    _col++;
                    auto phosphate_idx = addKetMonomer(document, phosphate_info, MonomerClass::Phosphate, phosphate_pos);
                    cur_polymer_map->second[monomer_idx] = phosphate_idx;
                    if (monomer_idx > 1)
                        addMonomerConnection(document, sugar_idx, phosphate_idx);
                    ch = _scanner.lookNext();
                    if (ch != '.' && ch != '}')
                        throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch);
                    if (ch == '.')
                        _scanner.skip(1);
                }
            }
            else // end of polymer - }
            {
                _scanner.skip(1); // skip '}'
                ch = _scanner.lookNext();
                if (ch == '"')
                {
                    Array<char> annotation;
                    _scanner.skip(1);
                    _scanner.readWord(annotation, "\"");
                    if (_scanner.lookNext() != '"')
                        throw Error("Unexpected symbol. Expected '\"' but found '%c'.", _scanner.lookNext());
                    _scanner.skip(1);
                    // skip annotation for now
                    ch = _scanner.lookNext();
                }
                _row++;
                _col = 0;
                monomer_idx = 0;
                if (simple_polymer_type == kHELMPolymerTypeRNA)
                    _row++; // additional row for bases in RNA
                if (ch == '|')
                {
                    // cleanup to go to next simple polymer
                    simple_polymer_name = "";
                    simple_polymer_type = "";
                }
                else if (ch == '$')
                {
                    helm_part = helm_parts::ListOfConnections;
                }
                else if (ch == -1)
                {
                    throw Error(unexpected_eod);
                }
                else
                {
                    throw Error("Unexpected symbol. Expected '|' or '$' but found '%c'.", ch);
                }
                _scanner.skip(1); // skip '|' or '$'
            }
        }
        else if (helm_part == helm_parts::ListOfConnections)
        {
            auto ch = _scanner.lookNext();
            if (ch == '$') // empty (or exhausted) list of connections
            {
                helm_part = helm_parts::ListOfPolymerGroups;
                _scanner.skip(1);
                continue;
            }
            // CHEM1,RNA1,32:R1-12:R2"annotation"|.....
            std::string left_polymer, right_polymer;
            std::ignore = readHelmSimplePolymerName(left_polymer);
            auto left_polymer_nums = used_polymer_nums.find(left_polymer);
            if (left_polymer_nums == used_polymer_nums.end())
                throw Error("Polymer '%s' not found.", left_polymer.c_str());
            ch = _scanner.lookNext();
            if (ch != ',')
                throw Error("Unexpected symbol. Expected ',' but found '%c'.", ch);
            _scanner.skip(1);
            std::ignore = readHelmSimplePolymerName(right_polymer);
            auto right_polymer_nums = used_polymer_nums.find(right_polymer);
            if (right_polymer_nums == used_polymer_nums.end())
                throw Error("Polymer '%s' not found.", right_polymer.c_str());
            ch = _scanner.lookNext();
            if (ch != ',')
                throw Error("Unexpected symbol. Expected ',' but found '%c'.", ch);
            _scanner.skip(1);
            // read monomer position
            std::string left_ap, right_ap;
            std::string position;
            _scanner.readWord(position, ":");
            _scanner.skip(1); // skip ':'
            const int left_monomer_idx = parse_monomer_position(position);
            _scanner.readWord(left_ap, "-");
            _scanner.skip(1); // skip '-'
            position.clear();
            _scanner.readWord(position, ":");
            _scanner.skip(1); // skip ':'
            const int right_monomer_idx = parse_monomer_position(position);
            _scanner.readWord(right_ap, "\"|$");
            auto left_mon_it = left_polymer_nums->second.find(left_monomer_idx);
            if (left_mon_it == left_polymer_nums->second.end())
                throw Error("Polymer '%s' does not contains monomer with number %d.", left_polymer.c_str(), left_monomer_idx);
            auto right_mon_it = right_polymer_nums->second.find(right_monomer_idx);
            if (right_mon_it == right_polymer_nums->second.end())
                throw Error("Polymer '%s' does not contains monomer with number %d.", right_polymer.c_str(), right_monomer_idx);
            document.addConnection(document.monomers().at(std::to_string(left_mon_it->second))->ref(), left_ap,
                                   document.monomers().at(std::to_string(right_mon_it->second))->ref(), right_ap);
            if (_scanner.isEOF())
                throw Error(unexpected_eod);
            ch = _scanner.readChar();
            if (ch == '"')
            {
                // connection annotation is read and discarded
                std::string annotation;
                _scanner.readWord(annotation, "\"");
                if (_scanner.isEOF())
                    throw Error(unexpected_eod);
                if (_scanner.lookNext() != '"')
                    throw Error("Unexpected symbol. Expected '\"' but found '%c'.", _scanner.lookNext());
                _scanner.skip(1); // skip '"'
                if (_scanner.isEOF())
                    throw Error(unexpected_eod);
                ch = _scanner.readChar();
            }
            // `ch` has already been consumed, so report it rather than the character after it.
            if (ch != '|' && ch != '$')
                throw Error("Unexpected symbol. Expected '|' or '$' but found '%c'.", ch);
            // The '$' just consumed terminates the list of connections, exactly like the '$'
            // handled at the top of this branch, so the polymer groups section starts here.
            if (ch == '$')
                helm_part = helm_parts::ListOfPolymerGroups;
        }
        else if (helm_part == helm_parts::ListOfPolymerGroups)
        {
            std::string groups;
            _scanner.readWord(groups, "$");
            // skip groups for now
            helm_part = helm_parts::ExtendedAnnotation;
        }
        else // helm_parts::ExtendedAnnotation
        {
            // read rest of data
            std::string rest_of_helm;
            _scanner.readAll(rest_of_helm);
            auto it = rest_of_helm.find_last_of('$');
            if (it == rest_of_helm.npos)
                throw Error("Incorrect format. Last '$' not found.");
            std::string signature = rest_of_helm.substr(it + 1);
            // split by last '$' and check if right part eq "V2.0"
            // if (signature != "v2.0")
            //     throw Error("Expected HELM V2.0 but got '%s'.", signature.c_str());
            // check that left part is valid json - TODO
            helm_part = helm_parts::End;
        }
    }
    // Input ended before the final section was reached.
    if (helm_part != helm_parts::End)
        throw Error(unexpected_eod);
}