in core/indigo-core/molecule/src/molecule_auto_loader.cpp [200:577]
/**
 * Detects the format of the data behind _scanner and loads it into `mol`.
 *
 * `properties` is cleared first. Formats are then probed in the order below and
 * the first probe that matches performs the load and returns:
 *   1. base64 payload (optionally prefixed with "base64::") - only redirects the
 *      two binary probes (GZip, CDX) to a scanner over the base64 data
 *   2. GZip (magic bytes 0x1f 0x8b) - unpacked and re-dispatched through a nested
 *      MoleculeAutoLoader that inherits the loading options
 *   3. binary CDX (kCDX_HeaderString)
 *   4. MDLCT
 *   5. ICM (non-query molecules only)
 *   6. CML (input starts with '<' and contains "<molecule")
 *   7. CDXML (input starts with '<' and contains "CDXML")
 *   8. JSON (input starts with '{' and contains "root" and "nodes")
 *   9. single-line input: RNA:/DNA:/IDT:/PEPTIDE: sequences, InChI,
 *      SMILES (SMARTS as a second attempt for queries), IUPAC name (non-query only)
 *  10. Molfile/SDF - the default when nothing else matched
 *
 * Every failed probe restores the scanner position it started from, so the
 * order of the statements below matters.
 *
 * @param mol molecule to fill; whether it is a query molecule is taken from
 *            mol.isQueryMolecule() and selects the loader entry points.
 * @throws Error if the input is InChI and `mol` is a query molecule, or if a
 *               single-line input could not be parsed by any single-line loader.
 *               Exceptions thrown by the individual format loaders propagate.
 */
void MoleculeAutoLoader::_loadMolecule(BaseMolecule& mol)
{
    const bool query = mol.isQueryMolecule();
    properties.clear();

    auto local_scanner = _scanner; // local scanner only for binary format

    // check for base64
    uint8_t base64_id[] = "base64::";
    std::unique_ptr<BufferScanner> base64_scanner;
    Array<char> base64_data; // backing storage for base64_scanner, must outlive it
    if (local_scanner->length() >= (sizeof(base64_id) - 1))
    {
        byte id[sizeof(base64_id) - 1];
        long long pos = local_scanner->tell();
        local_scanner->readCharsFix(sizeof(base64_id) - 1, reinterpret_cast<char*>(id));
        bool is_base64 = (std::equal(std::begin(id), std::end(id), std::begin(base64_id)));
        // Without the explicit prefix the whole input is still tested as base64
        if (!is_base64)
            local_scanner->seek(pos, SEEK_SET);

        std::string base64_str;
        local_scanner->readAll(base64_str);
        // Line breaks are stripped before validation
        base64_str.erase(std::remove_if(base64_str.begin(), base64_str.end(), [](char c) { return c == '\n' || c == '\r'; }), base64_str.end());
        if (validate_base64(base64_str))
        {
            base64_data.copy(base64_str.data(), static_cast<int>(base64_str.size()));
            // NOTE(review): the `true` flag presumably makes BufferScanner decode base64 - confirm
            base64_scanner = std::make_unique<BufferScanner>(base64_data, true);
            local_scanner = base64_scanner.get();
        }
        // Rewind both scanners: the text probes below always read from _scanner
        local_scanner->seek(pos, SEEK_SET);
        _scanner->seek(pos, SEEK_SET);
    }

    // check for GZip format
    if (local_scanner->length() >= 2LL)
    {
        byte id[2];
        long long pos = local_scanner->tell();

        local_scanner->readCharsFix(2, reinterpret_cast<char*>(id));
        local_scanner->seek(pos, SEEK_SET);

        if (id[0] == 0x1f && id[1] == 0x8b)
        {
            GZipScanner gzscanner(*local_scanner);
            QS_DEF(Array<char>, buf);

            gzscanner.readAll(buf);
            // Run format detection again on the unpacked data with the same options
            MoleculeAutoLoader loader2(buf);

            loader2.stereochemistry_options = stereochemistry_options;
            loader2.ignore_noncritical_query_features = ignore_noncritical_query_features;
            loader2.treat_x_as_pseudoatom = treat_x_as_pseudoatom;
            loader2.skip_3d_chirality = skip_3d_chirality;
            loader2.ignore_no_chiral_flag = ignore_no_chiral_flag;
            loader2.treat_stereo_as = treat_stereo_as;
            loader2.loadMolecule(mol);
            return;
        }
    }

    // check for binary CDX format
    if (local_scanner->startsWith(kCDX_HeaderString))
    {
        local_scanner->seek(kCDX_HeaderLength, SEEK_CUR);
        MoleculeCdxmlLoader loader(*local_scanner, true);
        loader.stereochemistry_options = stereochemistry_options;
        loader.loadMolecule(mol);
        return;
    }

    // From here on only text formats are probed, always on the original scanner
    _scanner->skipBom();

    // check for MDLCT format
    {
        QS_DEF(Array<char>, buf);
        if (tryMDLCT(*_scanner, buf))
        {
            BufferScanner scanner2(buf);
            MolfileLoader loader(scanner2);
            loader.stereochemistry_options = stereochemistry_options;
            loader.ignore_noncritical_query_features = ignore_noncritical_query_features;
            loader.skip_3d_chirality = skip_3d_chirality;
            loader.treat_x_as_pseudoatom = treat_x_as_pseudoatom;
            loader.ignore_no_chiral_flag = ignore_no_chiral_flag;
            loader.treat_stereo_as = treat_stereo_as;

            if (query)
                loader.loadQueryMolecule(static_cast<QueryMolecule&>(mol));
            else
                loader.loadMolecule(static_cast<Molecule&>(mol));
            return;
        }
    }

    // check for ICM format; the probe is not attempted for query molecules
    if (!query && _scanner->length() >= 4LL)
    {
        char id[3];
        long long pos = _scanner->tell();

        _scanner->readCharsFix(3, id);
        _scanner->seek(pos, SEEK_SET);
        if (IcmSaver::checkVersion(id))
        {
            IcmLoader loader(*_scanner);
            loader.loadMolecule(static_cast<Molecule&>(mol));
            return;
        }
    }

    // check for CML format
    {
        long long pos = _scanner->tell();
        _scanner->skipSpace();

        if (_scanner->lookNext() == '<')
        {
            if (_scanner->findWord("<molecule"))
            {
                CmlLoader loader(*_scanner);
                loader.stereochemistry_options = stereochemistry_options;
                if (query)
                    loader.loadQueryMolecule(static_cast<QueryMolecule&>(mol));
                else
                    loader.loadMolecule(static_cast<Molecule&>(mol));
                return;
            }
        }

        _scanner->seek(pos, SEEK_SET);
    }

    // check for CDXML format
    {
        long long pos = _scanner->tell();
        _scanner->skipSpace();
        if (_scanner->lookNext() == '<' && _scanner->findWord("CDXML"))
        {
            // The CDXML loader gets the document from its very beginning
            _scanner->seek(pos, SEEK_SET);
            MoleculeCdxmlLoader loader(*_scanner);
            loader.stereochemistry_options = stereochemistry_options;
            loader.loadMolecule(mol);
            return;
        }
        _scanner->seek(pos, SEEK_SET);
    }

    // check json format
    {
        long long pos = _scanner->tell();
        if (_scanner->lookNext() == '{')
        {
            if (_scanner->findWord("root") && _scanner->findWord("nodes"))
            {
                _scanner->seek(pos, SEEK_SET);

                Array<char> buf;
                _scanner->readAll(buf);
                buf.push(0); // the parser needs a zero-terminated string

                rapidjson::Document data;
                if (!data.Parse(buf.ptr()).HasParseError() && data.HasMember("root"))
                {
                    MoleculeJsonLoader loader(data);
                    loader.stereochemistry_options = stereochemistry_options;
                    loader.ignore_noncritical_query_features = ignore_noncritical_query_features;
                    loader.treat_x_as_pseudoatom = treat_x_as_pseudoatom;
                    loader.skip_3d_chirality = skip_3d_chirality;
                    loader.ignore_no_chiral_flag = ignore_no_chiral_flag;
                    loader.treat_stereo_as = treat_stereo_as;
                    loader.loadMolecule(mol);
                    return;
                }
            }
        }
        _scanner->seek(pos, SEEK_SET);
    }

    // check for single line formats
    if (Scanner::isSingleLine(*_scanner))
    {
        // for debug purposes: check for sequence
        {
            const std::string kPeptide = "PEPTIDE:";
            const std::string kRNA = "RNA:";
            const std::string kDNA = "DNA:";
            const std::string kIDT = "IDT:";
            // A "HELM:" tag is not recognised here: it is longer than the
            // kRNA.size() characters read below and no HELM loader is wired in.

            long long start_pos = _scanner->tell();
            if (_scanner->length() > static_cast<long long>(kRNA.size()))
            {
                MonomerTemplateLibrary lib;
                // Zero-filled and one char longer than the longest tag, so that
                // tag.data() is always a terminated C string
                std::vector<char> tag(kPeptide.size() + 1, 0);
                // "RNA:", "DNA:" and "IDT:" share the same length, one read serves all three
                _scanner->readCharsFix(static_cast<int>(kRNA.size()), tag.data());
                SequenceLoader sl(*_scanner, lib);
                if (kRNA == tag.data())
                {
                    sl.loadSequence(mol, SequenceLoader::SeqType::RNASeq);
                    return;
                }
                else if (kDNA == tag.data())
                {
                    sl.loadSequence(mol, SequenceLoader::SeqType::DNASeq);
                    return;
                }
                else if (kIDT == tag.data())
                {
                    sl.loadIdt(mol);
                    return;
                }
                else
                {
                    // Re-read from the start with the longer "PEPTIDE:" tag
                    _scanner->seek(start_pos, SEEK_SET);
                    if (_scanner->length() > static_cast<long long>(kPeptide.size()))
                    {
                        _scanner->readCharsFix(static_cast<int>(kPeptide.size()), tag.data());
                        if (kPeptide == tag.data())
                        {
                            sl.loadSequence(mol, SequenceLoader::SeqType::PEPTIDESeq);
                            return;
                        }
                    }
                }
            }
            _scanner->seek(start_pos, SEEK_SET);
        }

        // check for InChI format
        {
            char prefix[6] = {'\0'};
            long long start = _scanner->tell();

            bool inchi = false;
            {
                char* ptr = prefix;
                while (!_scanner->isEOF() && ptr - prefix < 6)
                {
                    *ptr = _scanner->readChar();
                    ptr++;
                }
                inchi = (strncmp(prefix, "InChI=", 6) == 0);
                _scanner->seek(start, SEEK_SET);
            }

            if (inchi)
            {
                if (query)
                {
                    throw Error("InChI input doesn't support query molecules");
                }

                Array<char> inchi_data;
                _scanner->readWord(inchi_data, " ");

                InchiWrapper loader;
                loader.loadMoleculeFromInchi(inchi_data.ptr(), static_cast<Molecule&>(mol));
                return;
            }
        }

        // If not InChI then SMILES or IUPAC name
        Array<char> err_buf; // keeps the SMILES/SMARTS error in case every fallback fails too

        try
        {
            SmilesLoader loader(*_scanner);
            long long start = _scanner->tell();

            loader.ignore_closing_bond_direction_mismatch = ignore_closing_bond_direction_mismatch;
            loader.stereochemistry_options = stereochemistry_options;
            loader.ignore_cistrans_errors = ignore_cistrans_errors;
            loader.ignore_no_chiral_flag = ignore_no_chiral_flag;

            /*
            If exception is thrown, try the SMARTS, if exception thrown again - the string is rather an IUPAC name than a SMILES string
            We catch it and pass down to IUPAC name conversion
            */
            if (query)
            {
                try
                {
                    loader.loadQueryMolecule(static_cast<QueryMolecule&>(mol));
                }
                catch (Exception&)
                {
                    _scanner->seek(start, SEEK_SET);
                    loader.loadSMARTS(static_cast<QueryMolecule&>(mol));
                }
            }
            else
            {
                loader.loadMolecule(static_cast<Molecule&>(mol));
            }
            return;
        }
        catch (Exception& e)
        {
            err_buf.appendString(e.message(), true);
        }

        // We fall down to IUPAC name conversion if SMILES loading threw an exception.
        // The name parser is handed a Molecule, so this step is skipped for query
        // molecules: casting a query molecule to Molecule would be an invalid downcast.
        if (!query)
        {
            try
            {
                Array<char> name;
                _scanner->seek(0, SEEK_SET);
                _scanner->readLine(name, true);
                MoleculeNameParser parser;
                parser.parseMolecule(name.ptr(), static_cast<Molecule&>(mol));
                return;
            }
            catch (Exception&)
            {
                // Deliberately ignored: the SMILES error kept in err_buf is reported below
            }
        }

        if (err_buf.size() > 0)
        {
            // The captured text is passed as an argument, not as the format string,
            // because it may contain '%'.
            // NOTE(review): assumes Error takes a printf-style format - confirm
            throw Error("%s", err_buf.ptr());
        }
    }

    // default is Molfile format
    {
        SdfLoader sdf_loader(*_scanner);
        bool is_first = true;
        while (!sdf_loader.isEOF())
        {
            sdf_loader.readNext();

            // Copy properties
            properties.copy(sdf_loader.properties);

            BufferScanner scanner2(sdf_loader.data);

            MolfileLoader loader(scanner2);
            loader.stereochemistry_options = stereochemistry_options;
            loader.ignore_noncritical_query_features = ignore_noncritical_query_features;
            loader.skip_3d_chirality = skip_3d_chirality;
            loader.treat_x_as_pseudoatom = treat_x_as_pseudoatom;
            loader.ignore_no_chiral_flag = ignore_no_chiral_flag;
            loader.treat_stereo_as = treat_stereo_as;

            if (is_first && sdf_loader.isEOF())
            {
                // The input holds a single record: load it straight into the target
                if (query)
                    loader.loadQueryMolecule(static_cast<QueryMolecule&>(mol));
                else
                    loader.loadMolecule(static_cast<Molecule&>(mol));
            }
            else
            {
                // Several records: each one is loaded into a fresh molecule of the
                // same kind as `mol` and then merged into `mol`
                std::unique_ptr<BaseMolecule> mol_fragment(mol.neu());
                if (query)
                    loader.loadQueryMolecule(static_cast<QueryMolecule&>(*mol_fragment));
                else
                    loader.loadMolecule(static_cast<Molecule&>(*mol_fragment));

                if (!properties.is_empty() && mol_fragment->vertexCount())
                    mol_fragment->properties().insert(0).copy(properties);

                Array<int> mapping;
                mol.mergeWithMolecule(*mol_fragment, &mapping, 0);
            }
            is_first = false;
        }
    }
}