in core/indigo-core/molecule/src/sequence_loader.cpp [600:880]
// Reads sequences written in IDT notation from _scanner and appends the resulting
// monomers to `mol` (which is cleared first).
//
// Every input line is handled on its own, in two passes:
//   1. Tokenizing. A token is either a standard base letter (A, T, C, G, U, I) with an
//      optional one-character sugar prefix ('r', '+' or 'm'), or a modification written
//      between slashes (/name/, stored with a trailing '/'). The flag stored with a token
//      is true when the token is immediately followed by '*'.
//   2. Building. A standard token becomes a nucleotide: sugar "dR" by default, "R" for 'r',
//      "LR" for '+', "mR" for 'm'; phosphate "P", or "sP" when the token is followed by '*';
//      the last token of a line gets no phosphate. A modification is looked up in _library,
//      first as a monomer group template, then as a single monomer template; an alias that
//      is found in neither is added as an unresolved CHEM monomer.
//
// Characters that do not belong to the notation are collected over the whole input and
// reported in a single Error after every line has been processed.
//
// Throws Error on malformed input: unterminated or too short /modification/, misplaced
// sugar prefix, misplaced '*', unknown sugar/base/phosphate, invalid symbols.
void SequenceLoader::loadIdt(BaseMolecule& mol)
{
    const auto IDT_DEF_SUGAR = "dR";
    const auto IDT_DEF_PHOSPHATE = "P";
    const auto IDT_MODIFIED_PHOSPHATE = "sP";
    constexpr int MAX_STD_TOKEN_SIZE = 2; // optional sugar prefix + base letter
    _row = 0;
    mol.clear();

    std::string invalid_symbols; // comma separated, reported once at the very end
    auto add_invalid_symbol = [&invalid_symbols](char symbol) {
        if (invalid_symbols.size())
            invalid_symbols += ',';
        invalid_symbols += symbol;
    };

    while (!_scanner.isEOF())
    {
        _seq_id = 0;
        _last_monomer_idx = -1;
        _col = 0;

        // ---- Pass 1: split the current line into tokens ----
        using token_t = std::pair<std::string, bool>;
        std::queue<token_t> tokens; // second=true if token followed by *
        std::string cur_token;
        while (true)
        {
            if (_scanner.isEOL())
            {
                // Only a dangling sugar prefix can still be pending here; it is queued as a
                // token of its own so that the second pass can examine it.
                if (cur_token.size())
                    tokens.emplace(cur_token, false);
                break;
            }
            auto ch = _scanner.readChar();
            switch (ch)
            {
            case ' ':
                // cur_token is not reset here, so a pending sugar prefix is queued as its own
                // token and also stays in front of the next base letter.
                // NOTE(review): confirm whether the prefix should be dropped after a space.
                if (cur_token.size())
                    tokens.emplace(cur_token, false);
                continue;
            case '/': {
                if (cur_token.size())
                    throw Error("Sugar prefix could not be used with modified monomer.");
                // read till next '/'
                ch = 0;
                while (!_scanner.isEOL())
                {
                    ch = _scanner.readChar();
                    if (ch == '/')
                        break;
                    cur_token += ch;
                }
                if (ch != '/')
                    throw Error("Unexpected end of data");
                if (cur_token == "")
                    throw Error("Invalid modification: empty string.");
                if (cur_token.size() < 3)
                    throw Error("Invalid modification: %s.", cur_token.c_str());
                // The trailing '/' marks the token as a modification for the second pass.
                cur_token += ch;
                break;
            }
            case 'A':
            case 'T':
            case 'C':
            case 'G':
            case 'U':
            case 'I':
                cur_token += ch;
                break;
            case 'r':
            case '+':
            case 'm':
                if (cur_token.size())
                    throw Error("Sugar prefix '%s' whithout base.", cur_token.c_str());
                // Keep the prefix and wait for the base letter that completes the token.
                cur_token += ch;
                continue;
            default:
                add_invalid_symbol(ch);
                continue;
            }
            // Reached only when a token has been completed: consume an optional '*' suffix.
            if (_scanner.lookNext() == '*')
            {
                tokens.emplace(cur_token, true);
                _scanner.skip(1);
            }
            else
                tokens.emplace(cur_token, false);
            cur_token = "";
        }
        while (!_scanner.isEOF() && _scanner.isEOL()) // Skip EOL characters
            _scanner.skip(1);

        // ---- Pass 2: turn the tokens into monomers ----
        IdtModification modification = IdtModification::FIVE_PRIME_END; // position of the current token
        token_t prev_token;
        while (tokens.size() > 0)
        {
            token_t token = tokens.front();
            tokens.pop();
            std::string phosphate = IDT_DEF_PHOSPHATE;
            std::string sugar = IDT_DEF_SUGAR;
            std::string idt_alias = "";
            std::string base = "";
            std::string single_monomer = "";
            std::string single_monomer_class;
            bool unresolved = false;
            bool invalid_token = false; // true when the token is only reported, not built

            if (token.first.back() == '/')
            {
                // Modification: strip the marker, the rest is the IDT alias.
                token.first.pop_back();
                idt_alias = token.first;
                if ((idt_alias == "5Phos" || idt_alias == "3Phos") && (token.second || prev_token.second))
                    throw Error("Symbol '*' could be placed only between two nucleotides/nucleosides.");
            }
            else
            {
                if (token.first.size() > MAX_STD_TOKEN_SIZE)
                    throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
                idt_alias = token.first.back();
                if (token.first.size() > 1)
                {
                    switch (token.first[0])
                    {
                    case 'r':
                        sugar = "R";
                        break;
                    case '+':
                        sugar = "LR";
                        break;
                    case 'm':
                        sugar = "mR";
                        break;
                    default:
                        throw Error("Wrong IDT syntax: '%s'", token.first.c_str());
                    }
                }
            }

            if (idt_alias.size() == 1)
            {
                if (IDT_STANDARD_BASES.count(idt_alias) == 0)
                {
                    // Not a base letter: remember it for the final error and build nothing.
                    // No templates were registered for this token, so no nucleotide may be
                    // created from the default sugar/phosphate.
                    add_invalid_symbol(idt_alias[0]);
                    invalid_token = true;
                }
                else
                {
                    base = idt_alias;
                    if (tokens.size() == 0)
                    {
                        if (token.second)
                            throw Error("Invalid IDT sequence: '*' couldn't be the last symbol.");
                        modification = IdtModification::THREE_PRIME_END;
                        phosphate = ""; // the last nucleotide of a line carries no phosphate
                    }
                    else if (token.second)
                    {
                        phosphate = IDT_MODIFIED_PHOSPHATE;
                    }
                    if (!checkAddTemplate(mol, MonomerClass::Sugar, sugar))
                        throw Error("Unknown sugar '%s'", sugar.c_str());
                    if (!checkAddTemplate(mol, MonomerClass::Base, base))
                        throw Error("Unknown base '%s'", idt_alias.c_str());
                    if (phosphate.size() > 0 && !checkAddTemplate(mol, MonomerClass::Phosphate, phosphate))
                        throw Error("Unknown phosphate '%s'", phosphate.c_str());
                }
            }
            else
            {
                if (tokens.size() == 0)
                {
                    modification = IdtModification::THREE_PRIME_END;
                    // Corner case: /3Phos/ after standard monomer - no additional P should be added
                    if (prev_token.first.size() > 0 && prev_token.first.size() <= MAX_STD_TOKEN_SIZE && idt_alias == "3Phos")
                        continue;
                }
                sugar = "";
                IdtModification alias_mod = IdtModification::INTERNAL; // filled by the library lookup
                const std::string& mgt_id = _library.getMGTidByIdtAlias(idt_alias, alias_mod);
                if (mgt_id.size())
                {
                    // Check that alias modification can be used in current position
                    check_monomer_place(idt_alias, modification, alias_mod, prev_token.first.size() > 0);
                    MonomerGroupTemplate& mgt = _library.getMonomerGroupTemplateById(mgt_id);
                    const MonomerTemplate& sugar_template = mgt.getTemplateByClass(MonomerClass::Sugar);
                    sugar = sugar_template.getStringProp("alias");
                    checkAddTemplate(mol, sugar_template);
                    if (alias_mod == IdtModification::THREE_PRIME_END)
                    {
                        if (token.second)
                            throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
                        phosphate = "";
                    }
                    else if (mgt.hasTemplateClass(MonomerClass::Phosphate))
                    {
                        if (token.second) // * means that 'sP' should be used
                        {
                            phosphate = IDT_MODIFIED_PHOSPHATE;
                            checkAddTemplate(mol, MonomerClass::Phosphate, phosphate);
                        }
                        else // use phosphate from template
                        {
                            const MonomerTemplate& phosphate_template = mgt.getTemplateByClass(MonomerClass::Phosphate);
                            phosphate = phosphate_template.getStringProp("alias");
                            checkAddTemplate(mol, phosphate_template);
                        }
                    }
                    else
                    {
                        if (token.second)
                            throw Error("Monomer /%s/ doesn't have phosphate, so '*' couldn't be applied.", idt_alias.c_str());
                        phosphate = "";
                    }
                    if (mgt.hasTemplateClass(MonomerClass::Base))
                    {
                        const MonomerTemplate& base_template = mgt.getTemplateByClass(MonomerClass::Base);
                        base = base_template.getStringProp("alias");
                        checkAddTemplate(mol, base_template);
                    }
                }
                else
                {
                    // Not a monomer group: try a single monomer template with this alias.
                    IdtModification monomer_mod = IdtModification::INTERNAL; // filled by the library lookup
                    auto monomer_template_id = _library.getMonomerTemplateIdByIdtAlias(idt_alias, monomer_mod);
                    if (monomer_template_id.size())
                    {
                        if (token.second)
                            throw Error("'*' couldn't be applied to monomer /%s/.", idt_alias.c_str());
                        check_monomer_place(idt_alias, modification, monomer_mod, prev_token.first.size() > 0);
                        const MonomerTemplate& monomer_template = _library.getMonomerTemplateById(monomer_template_id);
                        checkAddTemplate(mol, monomer_template);
                        single_monomer = monomer_template.getStringProp("alias");
                        single_monomer_class = MonomerTemplates::classToStr(monomer_template.monomerClass());
                    }
                    else // IDT alias not found
                    {
                        unresolved = true;
                        single_monomer = "unknown_monomer_with_idt_alias_" + idt_alias;
                        auto monomer_class = MonomerClass::CHEM;
                        single_monomer_class = MonomerTemplates::classToStr(monomer_class);
                        // Unresolved monomer could be in any position
                        MonomerTemplate monomer_template(single_monomer, monomer_class, IdtAlias(idt_alias, idt_alias, idt_alias, idt_alias), true);
                        monomer_template.setStringProp("alias", idt_alias);
                        for (auto ap : {"R1", "R2", "R3", "R4"})
                            monomer_template.AddAttachmentPoint(ap, -1);
                        checkAddTemplate(mol, monomer_template);
                    }
                }
            }

            if (!invalid_token)
            {
                if (single_monomer.size())
                {
                    // Single monomer: add one template atom and chain it to the previous monomer.
                    int monomer_idx = addTemplateAtom(mol, unresolved ? idt_alias.c_str() : single_monomer.c_str(), single_monomer_class.c_str(), _seq_id);
                    mol.asMolecule().setAtomXyz(monomer_idx, getBackboneMonomerPosition());
                    if (_last_monomer_idx >= 0)
                        addTemplateBond(mol, _last_monomer_idx, monomer_idx);
                    _last_monomer_idx = monomer_idx;
                }
                else
                    addNucleotide(mol, base, sugar, phosphate, false);
            }
            // Bookkeeping is done for invalid tokens as well, so that the checks applied to
            // the following tokens see the same position and previous token as before.
            _seq_id++;
            _col++;
            prev_token = token; // save to check */3Phos/ case
            modification = IdtModification::INTERNAL;
        }
        _row += 2;
    }

    if (invalid_symbols.size())
        throw Error("Invalid symbols in the sequence: %s", invalid_symbols.c_str());
}