in core/indigo-core/molecule/src/sequence_loader.cpp [1692:1982]
/**
 * Parses a HELM string from _scanner and adds the monomers and connections it
 * describes to `document`.
 *
 * The input is consumed as consecutive '$'-terminated sections:
 *   1. simple polymers, '|'-separated:  NAME{monomer.monomer...}"annotation"
 *   2. connections, '|'-separated:      POLYMER,POLYMER,num:AP-num:AP"annotation"
 *   3. polymer groups                   (read and ignored)
 *   4. extended annotation and the tail after the last '$' (read, only the
 *      presence of a '$' is verified)
 * Quoted annotations after polymers and connections are read and discarded.
 *
 * @param document  KET document receiving monomers, templates and connections.
 * @throws Error    on unexpected symbols, duplicated or unknown polymer names,
 *                  monomer numbers that are not plain decimal numbers or do not
 *                  exist in the referenced polymer, and when the data ends
 *                  before the final section has been read.
 */
void SequenceLoader::loadHELM(KetDocument& document)
{
    _row = 0;
    _seq_id = 1;
    std::string simple_polymer_name = "";
    std::string simple_polymer_type = "";
    int monomer_idx = 0; // number of the current monomer inside the current simple polymer (first is 1)
    int unknown_count = 0;
    _unknown_ambiguous_count = 0;
    // polymer name -> (monomer number inside that polymer -> value returned by addKetMonomer)
    using polymer_map = std::map<std::string, std::map<int, size_t>>;
    polymer_map used_polymer_nums;
    polymer_map::iterator cur_polymer_map;
    _opts_to_template_id.clear();
    enum class helm_parts
    {
        ListOfSimplePolymers,
        ListOfConnections,
        ListOfPolymerGroups,
        ExtendedAnnotation,
        End
    };
    helm_parts helm_part = helm_parts::ListOfSimplePolymers;

    // Converts the monomer number of a connection end to int.
    // Only plain decimal numbers are accepted; anything else ("K", "*", "1+2", empty text)
    // is reported with Error instead of letting std::stoi throw a std:: exception.
    auto parse_monomer_number = [](const std::string& text, const std::string& polymer) -> int {
        if (text.empty() || text.find_first_not_of("0123456789") != std::string::npos)
            throw Error("Only direct connections supported now.");
        // Up to 9 decimal digits always fit into int, so std::stoi below cannot overflow.
        if (text.size() > 9)
            throw Error("Polymer '%s' does not contains monomer with number %s.", polymer.c_str(), text.c_str());
        return std::stoi(text);
    };

    while (!_scanner.isEOF())
    {
        if (helm_part == helm_parts::ListOfSimplePolymers)
        {
            auto ch = _scanner.lookNext();
            if (simple_polymer_name.size() == 0) // Read simple polymer_name
            {
                _col = 0;
                simple_polymer_type = readHelmSimplePolymerName(simple_polymer_name);
                if (used_polymer_nums.count(simple_polymer_name))
                    throw Error("Simple polymer '%s' defined more than once.", simple_polymer_name.c_str());
                if (simple_polymer_name == simple_polymer_type)
                    throw Error("Polymer '%s' without number not allowed.", simple_polymer_name.c_str());
                ch = _scanner.lookNext();
                if (ch != '{')
                    throw Error("Unexpected symbol. Expected '{' but found '%c'.", ch);
                _scanner.skip(1); // skip '{'
                auto res = used_polymer_nums.emplace(std::make_pair(simple_polymer_name, std::map<int, size_t>()));
                if (res.second)
                    cur_polymer_map = res.first;
                else
                    throw Error("Internal error - cannot emplace polymer map.");
            }
            else if (ch != '}')
            {
                monomer_idx++;
                // Monomers of one polymer are laid out left to right on the polymer's row.
                Vec3f pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0);
                _col++;
                if (simple_polymer_type == kHELMPolymerTypeUnknown)
                {
                    std::string name;
                    _scanner.readWord(name, reserved_helm_chars);
                    // skip blob for now
                    ch = _scanner.lookNext();
                    if (ch != '}')
                        throw Error("Unexpected symbol. Expected '}' but found '%c'.", ch);
                    continue;
                }
                const auto& monomer_class = MonomerTemplates::getStrToMonomerType().at(simple_polymer_type);
                // The first monomer of an RNA group is read as a sugar; it is re-classified below if needed.
                auto monomer_info = readHelmMonomer(document, monomer_class == MonomerClass::RNA ? MonomerClass::Sugar : monomer_class);
                if (monomer_class == MonomerClass::CHEM)
                {
                    ch = _scanner.lookNext();
                    if (ch != '}')
                        throw Error("Unexpected symbol. Expected '}' but found '%c'.", ch); // only one monomer in chem
                    auto& alias = std::get<0>(monomer_info);
                    if (alias == "*") // if monomer_alias == "*"
                    {
                        // Replace the wildcard with a generated template exposing R1..R4.
                        alias = "unknown_monomer_" + std::to_string(unknown_count++);
                        MonomerTemplate monomer_template(alias, MonomerClass::CHEM, IdtAlias(alias, alias, alias, alias), true);
                        monomer_template.setStringProp("alias", alias);
                        for (auto ap : {"R1", "R2", "R3", "R4"})
                            monomer_template.AddAttachmentPoint(ap, -1);
                        checkAddTemplate(document, monomer_template);
                        _added_templates.emplace(monomer_class, alias);
                    }
                    cur_polymer_map->second[monomer_idx] = addKetMonomer(document, monomer_info, monomer_class, pos);
                }
                else if (monomer_class == MonomerClass::AminoAcid)
                {
                    auto amino_idx = addKetMonomer(document, monomer_info, monomer_class, pos);
                    cur_polymer_map->second[monomer_idx] = amino_idx;
                    if (monomer_idx > 1)
                        addMonomerConnection(document, amino_idx - 1, amino_idx);
                    ch = _scanner.lookNext();
                    if (ch == '.')
                        _scanner.skip(1);
                    else if (ch != '}')
                        throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch);
                }
                else // kHELMPolymerTypeRNA
                {
                    const std::string& phosphate_lib_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Phosphate, std::get<0>(monomer_info));
                    const std::string& nucleotide_id = _library.getMonomerTemplateIdByAlias(MonomerClass::RNA, std::get<0>(monomer_info));
                    if (phosphate_lib_id.size() || nucleotide_id.size())
                    {
                        // The group is a single monomer known to the library as a nucleotide or a phosphate.
                        auto added_idx = addKetMonomer(document, monomer_info, nucleotide_id.size() > 0 ? MonomerClass::RNA : MonomerClass::Phosphate, pos);
                        cur_polymer_map->second[monomer_idx] = added_idx;
                        // NOTE(review): "added_idx - 1" is the monomer added just before this one; if that
                        // was a base (nucleoside without phosphate) the link may start at the base - confirm.
                        if (monomer_idx > 1)
                            addMonomerConnection(document, added_idx - 1, added_idx);
                        ch = _scanner.lookNext();
                        if (ch != '.' && ch != '}')
                            throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch);
                        if (ch == '.')
                            _scanner.skip(1);
                        continue;
                    }
                    auto sugar_idx = addKetMonomer(document, monomer_info, MonomerClass::Sugar, pos);
                    cur_polymer_map->second[monomer_idx] = sugar_idx;
                    // NOTE(review): same "previous monomer" assumption as above - confirm.
                    if (monomer_idx > 1)
                        addMonomerConnection(document, sugar_idx - 1, sugar_idx);
                    ch = _scanner.lookNext();
                    if (ch == '(') // In RNA after sugar could be base in ()
                    {
                        monomer_idx++;
                        auto base_info = readHelmMonomer(document, MonomerClass::Base);
                        ch = _scanner.lookNext();
                        // The base is placed one bond length below its sugar.
                        Vec3f base_pos(pos.x, pos.y - LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, 0);
                        auto base_idx = addKetMonomer(document, base_info, MonomerClass::Base, base_pos);
                        cur_polymer_map->second[monomer_idx] = base_idx;
                        if (monomer_idx > 1)
                            addMonomerConnection(document, sugar_idx, base_idx, true);
                    }
                    if (ch == '.')
                    {
                        // '.' closes this group: what follows belongs to the next group and must be
                        // read by the next loop iteration, not as the phosphate of this sugar.
                        _scanner.skip(1);
                        continue;
                    }
                    if (ch == '}')
                        continue;
                    auto phosphate_info = readHelmMonomer(document, MonomerClass::Phosphate);
                    monomer_idx++;
                    Vec3f phosphate_pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0);
                    _col++;
                    auto phosphate_idx = addKetMonomer(document, phosphate_info, MonomerClass::Phosphate, phosphate_pos);
                    cur_polymer_map->second[monomer_idx] = phosphate_idx;
                    if (monomer_idx > 1)
                        addMonomerConnection(document, sugar_idx, phosphate_idx);
                    ch = _scanner.lookNext();
                    if (ch != '.' && ch != '}')
                        throw Error("Unexpected symbol. Expected '.' or '}' but found '%c'.", ch);
                    if (ch == '.')
                        _scanner.skip(1);
                }
            }
            else // end of polymer - }
            {
                _scanner.skip(1); // skip '}'
                ch = _scanner.lookNext();
                if (ch == '"')
                {
                    Array<char> annotation;
                    _scanner.skip(1);
                    _scanner.readWord(annotation, "\"");
                    if (_scanner.lookNext() != '"')
                        throw Error("Unexpected symbol. Expected '\"' but found '%c'.", _scanner.lookNext());
                    _scanner.skip(1);
                    // skip annotation for now
                    ch = _scanner.lookNext();
                }
                _row++;
                _col = 0;
                monomer_idx = 0;
                if (simple_polymer_type == kHELMPolymerTypeRNA)
                    _row++; // additional row for bases in RNA
                if (ch == '|')
                {
                    // cleanup to go to next simple polymer
                    simple_polymer_name = "";
                    simple_polymer_type = "";
                }
                else if (ch == '$')
                {
                    helm_part = helm_parts::ListOfConnections;
                }
                else if (ch == -1)
                {
                    throw Error(unexpected_eod);
                }
                else
                {
                    throw Error("Unexpected symbol. Expected '|' or '$' but found '%c'.", ch);
                }
                _scanner.skip(1); // skip '|' or '$'
            }
        }
        else if (helm_part == helm_parts::ListOfConnections)
        {
            auto ch = _scanner.lookNext();
            if (ch == '$') // empty list of connections
            {
                helm_part = helm_parts::ListOfPolymerGroups;
                _scanner.skip(1);
                continue;
            }
            // CHEM1,RNA1,32:R1-12:R2"annotation"|.....
            std::string left_polymer, right_polymer;
            std::ignore = readHelmSimplePolymerName(left_polymer);
            auto left_polymer_nums = used_polymer_nums.find(left_polymer);
            if (left_polymer_nums == used_polymer_nums.end())
                throw Error("Polymer '%s' not found.", left_polymer.c_str());
            ch = _scanner.lookNext();
            if (ch != ',')
                throw Error("Unexpected symbol. Expected ',' but found '%c'.", ch);
            _scanner.skip(1);
            std::ignore = readHelmSimplePolymerName(right_polymer);
            auto right_polymer_nums = used_polymer_nums.find(right_polymer);
            if (right_polymer_nums == used_polymer_nums.end())
                throw Error("Polymer '%s' not found.", right_polymer.c_str());
            ch = _scanner.lookNext();
            if (ch != ',')
                throw Error("Unexpected symbol. Expected ',' but found '%c'.", ch);
            _scanner.skip(1);
            // read monomer position
            std::string left_ap, right_ap;
            std::string position;
            _scanner.readWord(position, ":");
            _scanner.skip(1); // skip ':'
            const int left_monomer_idx = parse_monomer_number(position, left_polymer);
            _scanner.readWord(left_ap, "-");
            _scanner.skip(1); // skip '-'
            position.clear();
            _scanner.readWord(position, ":");
            _scanner.skip(1); // skip ':'
            const int right_monomer_idx = parse_monomer_number(position, right_polymer);
            _scanner.readWord(right_ap, "\"|$");
            auto left_mon_it = left_polymer_nums->second.find(left_monomer_idx);
            if (left_mon_it == left_polymer_nums->second.end())
                throw Error("Polymer '%s' does not contains monomer with number %d.", left_polymer.c_str(), left_monomer_idx);
            auto right_mon_it = right_polymer_nums->second.find(right_monomer_idx);
            if (right_mon_it == right_polymer_nums->second.end())
                throw Error("Polymer '%s' does not contains monomer with number %d.", right_polymer.c_str(), right_monomer_idx);
            document.addConnection(document.monomers().at(std::to_string(left_mon_it->second))->ref(), left_ap,
                                   document.monomers().at(std::to_string(right_mon_it->second))->ref(), right_ap);
            if (_scanner.isEOF())
                throw Error(unexpected_eod);
            ch = _scanner.readChar();
            if (ch == '"')
            {
                // Connection annotation: read and discard.
                std::string annotation;
                _scanner.readWord(annotation, "\"");
                if (_scanner.isEOF())
                    throw Error(unexpected_eod);
                if (_scanner.lookNext() != '"')
                    throw Error("Unexpected symbol. Expected '\"' but found '%c'.", _scanner.lookNext());
                _scanner.skip(1); // skip '"'
                if (_scanner.isEOF())
                    throw Error(unexpected_eod);
                ch = _scanner.readChar();
            }
            // The separator has already been consumed by readChar(), so the state must be
            // advanced here when it was the section terminator, and errors must report `ch`.
            if (ch == '$')
                helm_part = helm_parts::ListOfPolymerGroups;
            else if (ch != '|')
                throw Error("Unexpected symbol. Expected '|' or '$' but found '%c'.", ch);
        }
        else if (helm_part == helm_parts::ListOfPolymerGroups)
        {
            std::string groups;
            _scanner.readWord(groups, "$"); // stops before '$'; it is left for the next section
            // skip groups for now
            helm_part = helm_parts::ExtendedAnnotation;
        }
        else // helm_parts::ExtendedAnnotation
        {
            // read rest of data
            std::string rest_of_helm;
            _scanner.readAll(rest_of_helm);
            auto it = rest_of_helm.find_last_of('$');
            if (it == rest_of_helm.npos)
                throw Error("Incorrect format. Last '$' not found.");
            // TODO: split by last '$' and check that the right part equals "V2.0":
            // std::string signature = rest_of_helm.substr(it + 1);
            // if (signature != "v2.0")
            //     throw Error("Expected HELM V2.0 but got '%s'.", signature.c_str());
            // TODO: check that left part is valid json
            helm_part = helm_parts::End;
        }
    }
    // Input ended before the extended annotation section was reached.
    if (helm_part != helm_parts::End)
        throw Error(unexpected_eod);
}