in core/indigo-core/molecule/src/sequence_loader.cpp [882:1291]
void SequenceLoader::loadIdt(KetDocument& document)
{
const auto IDT_DEF_SUGAR = "dR";
const auto IDT_DEF_PHOSPHATE = "P";
const auto IDT_MODIFIED_PHOSPHATE = "sP";
constexpr int MAX_STD_TOKEN_SIZE = 2;
_row = 0;
std::string invalid_symbols;
while (!_scanner.isEOF())
{
_seq_id = 0;
_last_monomer_idx = -1;
_col = 0;
using token_t = std::pair<std::string, bool>;
std::queue<token_t> tokens; // second=true if token folowed by *
std::string cur_token;
while (true)
{
if (_scanner.isEOL())
{
if (cur_token.size())
tokens.emplace(cur_token, false);
break;
}
auto ch = _scanner.readChar();
switch (ch)
{
case ' ':
if (cur_token.size())
tokens.emplace(cur_token, false);
continue;
case '/': {
if (cur_token.size())
throw Error("Sugar prefix could not be used with modified monomer.");
// read till next '/'
ch = 0;
while (!_scanner.isEOL())
{
ch = _scanner.readChar();
if (ch == '/')
break;
cur_token += ch;
}
if (ch != '/')
throw Error("Unexpected end of data");
if (cur_token == "")
throw Error("Invalid modification: empty string.");
if (cur_token.size() < 3)
throw Error("Invalid modification: %s.", cur_token.c_str());
cur_token += ch;
break;
}
case '(': { // read till ')'
cur_token += ch;
ch = 0;
while (!_scanner.isEOL())
{
ch = _scanner.readChar();
if (ch == ')')
break;
cur_token += ch;
}
if (ch != ')')
throw Error("Unexpected end of data");
if (cur_token == "")
throw Error("Invalid ambiguous monomer: empty string.");
cur_token += ch;
break;
}
case 'A':
case 'T':
case 'C':
case 'G':
case 'U':
case 'I':
case 'R':
case 'Y':
case 'M':
case 'K':
case 'S':
case 'W':
case 'H':
case 'B':
case 'V':
case 'D':
case 'N':
cur_token += ch;
break;
case 'r':
case '+':
case 'm':
if (cur_token.size())
throw Error("Sugar prefix '%s' whithout base.", cur_token.c_str());
else
cur_token += ch;
continue;
break;
default:
if (invalid_symbols.size())
invalid_symbols += ',';
invalid_symbols += ch;
continue;
break;
}
if (_scanner.lookNext() == '*')
{
tokens.emplace(cur_token, true);
_scanner.skip(1);
}
else
tokens.emplace(cur_token, false);
cur_token = "";
}
while (!_scanner.isEOF() && _scanner.isEOL()) // Skip EOL characters
_scanner.skip(1);
IdtModification modification = IdtModification::FIVE_PRIME_END;
token_t prev_token;
while (tokens.size() > 0)
{
token_t token = tokens.front();
tokens.pop();
std::string phosphate = IDT_DEF_PHOSPHATE;
std::string sugar = IDT_DEF_SUGAR;
std::string idt_alias = "";
std::string base = "";
std::string single_monomer = "";
std::string single_monomer_alias = "";
std::string single_monomer_class;
bool unresolved = false;
bool ambiguous_monomer = false;
if (token.first.back() == '/')
{
token.first.pop_back();
idt_alias = token.first;
if ((idt_alias == "5Phos" || idt_alias == "3Phos") && (token.second || prev_token.second))
throw Error("Symbol '*' could be placed only between two nucleotides/nucleosides.");
}
else
{
if (token.first.size() > MAX_STD_TOKEN_SIZE)
if (token.first.back() == ')')
idt_alias = token.first;
else
throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
else
idt_alias = token.first.back();
if (token.first.size() > 1)
{
auto ch = token.first[0];
if (ch != '(')
{
switch (ch)
{
case 'r':
sugar = "R";
break;
case '+':
sugar = "LR";
break;
case 'm':
sugar = "mR";
break;
default:
throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
}
if (idt_alias.back() == ')')
idt_alias.erase(0, 1);
}
}
}
if (STANDARD_MIXED_BASES.count(idt_alias) != 0 || idt_alias.back() == ')')
ambiguous_monomer = true;
if (idt_alias.size() == 1 || ambiguous_monomer)
{
if (IDT_STANDARD_BASES.count(idt_alias) == 0 && !ambiguous_monomer)
{
if (invalid_symbols.size())
invalid_symbols += ',';
invalid_symbols += idt_alias[0];
continue;
}
if (ambiguous_monomer)
{
auto mixed_base = idt_alias;
std::optional<std::array<float, 4>> ratios;
if (mixed_base.back() == ')')
{
mixed_base = idt_alias.substr(1, idt_alias.size() - 2);
auto check_mixed_base = [](const std::string& base) {
if (base.size() < 2)
return;
auto count = base.substr(1, base.size() - 1);
for (auto ch : count)
{
if (!std::isdigit(ch))
throw Error("Invalid mixed base - only numerical index allowed.");
}
};
if (auto pos = mixed_base.find(':'); pos != std::string::npos)
{
auto ratios_str = mixed_base.substr(pos + 1, mixed_base.size() - pos - 1);
mixed_base = mixed_base.substr(0, pos);
check_mixed_base(mixed_base);
if (ratios_str.size() != 8)
throw Exception("Invalid IDT ambiguous monomer %s", idt_alias.c_str());
auto stof = [](const std::string& arg) -> float {
try
{
return std::stof(arg);
}
catch (...)
{
throw Error("Invalid number '%s'", arg.c_str());
}
};
ratios.emplace(std::array<float, 4>{stof(ratios_str.substr(0, 2)), stof(ratios_str.substr(2, 2)), stof(ratios_str.substr(4, 2)),
stof(ratios_str.substr(6, 2))});
idt_alias = '(' + mixed_base + ')';
mixed_base = mixed_base[0];
}
else
{
check_mixed_base(mixed_base);
}
}
if (sugar == "R" && RNA_DNA_MIXED_BASES.count(mixed_base) == 0)
idt_alias = 'r' + idt_alias;
if (!document.hasAmbiguousMonomerTemplate(idt_alias))
{
auto it = STANDARD_MIXED_BASES.find(mixed_base);
if (it == STANDARD_MIXED_BASES.end())
throw Error("Unknown mixed base '%s'", mixed_base.c_str());
std::vector<KetAmbiguousMonomerOption> options;
for (auto template_alias : it->second)
{
if (sugar == "R" && template_alias == "T") // U instead of T for RNA
template_alias = "U";
auto& template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::Base, template_alias);
if (template_id.size() == 0)
throw Error("Monomer base template '%s' not found", template_alias.c_str());
auto& option = options.emplace_back(template_id);
if (ratios.has_value())
{
option.setRatio(ratios.value()[IDT_BASE_TO_RATIO_IDX.at(template_alias)]);
}
auto& monomer_template = _library.getMonomerTemplateById(template_id);
checkAddTemplate(document, monomer_template);
_alias_to_id.emplace(template_alias, template_id);
}
auto& templ = document.addAmbiguousMonomerTemplate("mixture", idt_alias, idt_alias, IdtAlias(), options);
static const std::map<std::string, KetAttachmentPoint> aps{{"R1", -1}};
templ.setAttachmentPoints(aps);
_var_alias_to_id.emplace(idt_alias, idt_alias);
}
else
{
if (ratios.has_value())
throw Error("Ambiguous monomer %s redefinion", idt_alias.c_str());
}
}
base = idt_alias;
if (base == "I")
base = "In"; // use correct alias for Inosine
if (tokens.size() == 0)
{
if (token.second)
throw Error("Invalid IDT sequence: '*' couldn't be the last symbol.");
modification = IdtModification::THREE_PRIME_END;
phosphate = "";
}
else if (token.second)
{
phosphate = IDT_MODIFIED_PHOSPHATE;
}
_alias_to_id.emplace(sugar, checkAddTemplate(document, MonomerClass::Sugar, sugar));
if (base.size() > 0 && !ambiguous_monomer)
_alias_to_id.emplace(base, checkAddTemplate(document, MonomerClass::Base, base));
if (phosphate.size() > 0)
_alias_to_id.emplace(phosphate, checkAddTemplate(document, MonomerClass::Phosphate, phosphate));
}
else
{
if (tokens.size() == 0)
{
modification = IdtModification::THREE_PRIME_END;
// Corner case: /3Phos/ after standard monomer - no additional P should be added
if (prev_token.first.size() > 0 && prev_token.first.size() <= MAX_STD_TOKEN_SIZE && idt_alias == "3Phos")
continue;
}
sugar = "";
IdtModification alias_mod;
const std::string& mgt_id = _library.getMGTidByIdtAlias(idt_alias, alias_mod);
if (mgt_id.size())
{
// Check that alias modification can be used in current position
check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
MonomerGroupTemplate& mgt = _library.getMonomerGroupTemplateById(mgt_id);
const MonomerTemplate& sugar_template = mgt.getTemplateByClass(MonomerClass::Sugar);
sugar = sugar_template.getStringProp("alias");
_alias_to_id.emplace(sugar, sugar_template.id());
checkAddTemplate(document, sugar_template);
if (alias_mod == IdtModification::THREE_PRIME_END)
{
if (token.second)
throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
phosphate = "";
}
else
{
if (mgt.hasTemplateClass(MonomerClass::Phosphate))
{
if (token.second) // * means that 'sP' should be used
{
phosphate = IDT_MODIFIED_PHOSPHATE;
_alias_to_id.emplace(phosphate, checkAddTemplate(document, MonomerClass::Phosphate, phosphate));
}
else // use phosphate from template
{
const MonomerTemplate& phosphate_template = mgt.getTemplateByClass(MonomerClass::Phosphate);
phosphate = phosphate_template.getStringProp("alias");
_alias_to_id.emplace(phosphate, phosphate_template.id());
checkAddTemplate(document, phosphate_template);
}
}
else
{
if (token.second)
throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
phosphate = "";
}
}
if (mgt.hasTemplateClass(MonomerClass::Base))
{
const MonomerTemplate& base_template = mgt.getTemplateByClass(MonomerClass::Base);
base = base_template.getStringProp("alias");
_alias_to_id.emplace(base, base_template.id());
checkAddTemplate(document, base_template);
}
}
else
{
IdtModification alias_mod;
auto& monomer_template_id = _library.getMonomerTemplateIdByIdtAlias(idt_alias, alias_mod);
if (monomer_template_id.size())
{
if (token.second)
throw Error("'*' couldn't be applied to monomer /%s/.", idt_alias.c_str());
check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
const MonomerTemplate& monomer_template = _library.getMonomerTemplateById(monomer_template_id);
checkAddTemplate(document, monomer_template);
single_monomer = monomer_template_id;
single_monomer_alias = monomer_template.getStringProp("alias");
}
else // IDT alias not found
{
single_monomer = "unknown_monomer_with_idt_alias_" + idt_alias;
single_monomer_alias = idt_alias;
auto monomer_class = MonomerClass::CHEM;
// Unresoved monomer could be in any position
MonomerTemplate monomer_template(single_monomer, monomer_class, IdtAlias(idt_alias, idt_alias, idt_alias, idt_alias), true);
monomer_template.setStringProp("alias", idt_alias);
for (auto ap : {"R1", "R2", "R3", "R4"})
monomer_template.AddAttachmentPoint(ap, -1);
checkAddTemplate(document, monomer_template);
}
}
}
if (single_monomer.size())
{
auto monomer_idx = document.monomers().size();
auto& monomer = document.addMonomer(single_monomer_alias, single_monomer);
monomer->setIntProp("seqid", _seq_id);
monomer->setPosition(getBackboneMonomerPosition());
if (_last_monomer_idx >= 0)
addMonomerConnection(document, _last_monomer_idx, monomer_idx);
_last_monomer_idx = static_cast<int>(monomer_idx);
}
else
addNucleotide(document, base, sugar, phosphate, false, ambiguous_monomer);
_seq_id++;
_col++;
prev_token = token; // save to check */3Phos/ case
modification = IdtModification::INTERNAL;
}
_row += 2;
}
if (invalid_symbols.size())
throw Error("Invalid symbols in the sequence: %s", invalid_symbols.c_str());
}