void MoleculeAutoLoader::_loadMolecule()

in core/indigo-core/molecule/src/molecule_auto_loader.cpp [200:577]


// Detects the format of the data behind _scanner and loads it into `mol`.
//
// Formats are probed in this order; the first probe that matches performs the load and returns:
//   1. optional base64 wrapper (only redirects the two binary probes below)
//   2. GZip (magic bytes 0x1f 0x8b) - unpacked and re-dispatched through a nested MoleculeAutoLoader
//   3. binary CDX (kCDX_HeaderString)
//   4. MDLCT, ICM (non-query molecules only), CML, CDXML, JSON containing "root" and "nodes"
//   5. single-line input: sequence tags, InChI, SMILES/SMARTS, then IUPAC name (non-query only)
//   6. Molfile / SDF (default)
//
// `properties` is cleared on entry and is filled from the SDF records in the default branch.
//
// Throws Error if InChI input is given for a query molecule, or if single-line input can be parsed
// neither as SMILES/SMARTS nor as an IUPAC name. Exceptions raised by the delegate loaders in the
// other branches are not caught here.
void MoleculeAutoLoader::_loadMolecule(BaseMolecule& mol)
{
    const bool query = mol.isQueryMolecule();
    properties.clear();

    // Copies the six loading options that several delegate loaders expose under the same names.
    const auto apply_common_options = [this](auto& loader) {
        loader.stereochemistry_options = stereochemistry_options;
        loader.ignore_noncritical_query_features = ignore_noncritical_query_features;
        loader.treat_x_as_pseudoatom = treat_x_as_pseudoatom;
        loader.skip_3d_chirality = skip_3d_chirality;
        loader.ignore_no_chiral_flag = ignore_no_chiral_flag;
        loader.treat_stereo_as = treat_stereo_as;
    };

    // Dispatches to the query or non-query entry point of a loader, depending on the kind of `mol`.
    const auto load_with = [query](auto& loader, BaseMolecule& target) {
        if (query)
            loader.loadQueryMolecule(static_cast<QueryMolecule&>(target));
        else
            loader.loadMolecule(static_cast<Molecule&>(target));
    };

    // Scanner used by the binary probes (GZip, CDX) only; it is redirected to a buffer scanner
    // when the whole input validates as base64. All text probes keep reading from _scanner.
    auto local_scanner = _scanner;

    // check for base64
    const uint8_t base64_id[] = "base64::";
    constexpr int base64_id_len = static_cast<int>(sizeof(base64_id)) - 1; // without the terminating zero
    std::unique_ptr<BufferScanner> base64_scanner;
    Array<char> base64_data; // must outlive base64_scanner's use below, hence function scope
    if (local_scanner->length() >= base64_id_len)
    {
        byte id[base64_id_len];
        const long long pos = local_scanner->tell();
        local_scanner->readCharsFix(base64_id_len, reinterpret_cast<char*>(id));
        const bool is_base64 = std::equal(std::begin(id), std::end(id), std::begin(base64_id));
        // Without the "base64::" prefix the payload starts at `pos`; with it, right after the prefix.
        if (!is_base64)
            local_scanner->seek(pos, SEEK_SET);

        std::string base64_str;
        local_scanner->readAll(base64_str);
        // Line breaks are stripped before validation.
        base64_str.erase(std::remove_if(base64_str.begin(), base64_str.end(), [](char c) { return c == '\n' || c == '\r'; }), base64_str.end());
        if (validate_base64(base64_str))
        {
            base64_data.copy(base64_str.data(), static_cast<int>(base64_str.size()));
            // NOTE(review): the `true` flag presumably makes BufferScanner decode base64 - confirm against BufferScanner.
            base64_scanner = std::make_unique<BufferScanner>(base64_data, true);
            local_scanner = base64_scanner.get();
        }
        // Rewind both scanners so the probes below start from the original position.
        local_scanner->seek(pos, SEEK_SET);
        _scanner->seek(pos, SEEK_SET);
    }

    // check for GZip format
    if (local_scanner->length() >= 2LL)
    {
        byte id[2];
        const long long pos = local_scanner->tell();

        local_scanner->readCharsFix(2, reinterpret_cast<char*>(id));
        local_scanner->seek(pos, SEEK_SET);

        if (id[0] == 0x1f && id[1] == 0x8b)
        {
            GZipScanner gzscanner(*local_scanner);
            QS_DEF(Array<char>, buf);

            gzscanner.readAll(buf);

            // Run format detection again on the unpacked data with the same options.
            MoleculeAutoLoader loader2(buf);
            apply_common_options(loader2);
            loader2.loadMolecule(mol);
            return;
        }
    }

    // check for binary CDX format
    if (local_scanner->startsWith(kCDX_HeaderString))
    {
        local_scanner->seek(kCDX_HeaderLength, SEEK_CUR);
        MoleculeCdxmlLoader loader(*local_scanner, true);
        loader.stereochemistry_options = stereochemistry_options;
        loader.loadMolecule(mol);
        return;
    }

    _scanner->skipBom();

    // check for MDLCT format
    {
        QS_DEF(Array<char>, buf);
        if (tryMDLCT(*_scanner, buf))
        {
            BufferScanner scanner2(buf);
            MolfileLoader loader(scanner2);
            apply_common_options(loader);
            load_with(loader, mol);
            return;
        }
    }

    // check for ICM format (skipped for query molecules)
    if (!query && _scanner->length() >= 4LL)
    {
        char id[3];
        const long long pos = _scanner->tell();

        _scanner->readCharsFix(3, id);
        _scanner->seek(pos, SEEK_SET);
        if (IcmSaver::checkVersion(id))
        {
            IcmLoader loader(*_scanner);
            loader.loadMolecule(static_cast<Molecule&>(mol));
            return;
        }
    }

    // check for CML format
    {
        const long long pos = _scanner->tell();
        _scanner->skipSpace();

        if (_scanner->lookNext() == '<' && _scanner->findWord("<molecule"))
        {
            CmlLoader loader(*_scanner);
            loader.stereochemistry_options = stereochemistry_options;
            load_with(loader, mol);
            return;
        }

        _scanner->seek(pos, SEEK_SET);
    }

    // check for CDXML format
    {
        const long long pos = _scanner->tell();
        _scanner->skipSpace();
        if (_scanner->lookNext() == '<' && _scanner->findWord("CDXML"))
        {
            _scanner->seek(pos, SEEK_SET);
            MoleculeCdxmlLoader loader(*_scanner);
            loader.stereochemistry_options = stereochemistry_options;
            loader.loadMolecule(mol);
            return;
        }
        _scanner->seek(pos, SEEK_SET);
    }

    // Position of the text content once the BOM has been skipped and the probes above have
    // restored the scanner; every later text probe starts from (and rewinds to) this offset.
    const long long content_start = _scanner->tell();

    // check json format
    {
        if (_scanner->lookNext() == '{' && _scanner->findWord("root") && _scanner->findWord("nodes"))
        {
            _scanner->seek(content_start, SEEK_SET);

            Array<char> buf;
            _scanner->readAll(buf);
            buf.push(0); // zero-terminate for the parser

            rapidjson::Document data;
            if (!data.Parse(buf.ptr()).HasParseError() && data.HasMember("root"))
            {
                MoleculeJsonLoader loader(data);
                apply_common_options(loader);
                loader.loadMolecule(mol);
                return;
            }
        }
        _scanner->seek(content_start, SEEK_SET);
    }

    // check for single line formats

    if (Scanner::isSingleLine(*_scanner))
    {
        // for debug purposes: check for sequence
        {
            const std::string kPeptide = "PEPTIDE:";
            const std::string kRNA = "RNA:";
            const std::string kDNA = "DNA:";
            const std::string kIDT = "IDT:";
            const std::string kHELM = "HELM:";

            const long long start_pos = _scanner->tell();
            if (_scanner->length() > static_cast<long long>(kRNA.size()))
            {
                MonomerTemplateLibrary lib;
                // Sized for the longest tag plus a terminating zero; zero-filled so that
                // tag.data() is a valid C string after a shorter read.
                std::vector<char> tag(kPeptide.size() + 1, 0);
                _scanner->readCharsFix(static_cast<int>(kRNA.size()), tag.data());
                SequenceLoader sl(*_scanner, lib);
                if (kRNA == tag.data())
                {
                    sl.loadSequence(mol, SequenceLoader::SeqType::RNASeq);
                    return;
                }
                else if (kDNA == tag.data())
                {
                    sl.loadSequence(mol, SequenceLoader::SeqType::DNASeq);
                    return;
                }
                else if (kIDT == tag.data())
                {
                    sl.loadIdt(mol);
                    return;
                }
                else if (kHELM == tag.data())
                {
                    // NOTE(review): only kRNA.size() (4) characters were read into `tag`, so the
                    // 5-character "HELM:" tag can never match here; the loader call is disabled anyway.
                    // sl.loadHelm(mol);
                    return;
                }
                else
                {
                    // The peptide tag is longer, so re-read from the start with its own length.
                    _scanner->seek(start_pos, SEEK_SET);
                    if (_scanner->length() > static_cast<long long>(kPeptide.size()))
                    {
                        _scanner->readCharsFix(static_cast<int>(kPeptide.size()), tag.data());
                        if (kPeptide == tag.data())
                        {
                            sl.loadSequence(mol, SequenceLoader::SeqType::PEPTIDESeq);
                            return;
                        }
                    }
                }
            }
            _scanner->seek(start_pos, SEEK_SET);
        }

        // check for InChI format
        {
            char prefix[6] = {'\0'};
            const long long start = _scanner->tell();

            // Read up to six characters (fewer if the input is shorter), then rewind.
            for (char* ptr = prefix; !_scanner->isEOF() && ptr - prefix < 6; ++ptr)
                *ptr = _scanner->readChar();
            const bool inchi = (strncmp(prefix, "InChI=", 6) == 0);
            _scanner->seek(start, SEEK_SET);

            if (inchi)
            {
                if (query)
                {
                    throw Error("InChI input doesn't support query molecules");
                }

                Array<char> inchi_data;
                _scanner->readWord(inchi_data, " ");

                InchiWrapper loader;
                loader.loadMoleculeFromInchi(inchi_data.ptr(), static_cast<Molecule&>(mol));
                return;
            }
        }

        // If not InChI then SMILES or IUPAC name
        Array<char> err_buf; // message of the SMILES/SMARTS failure, rethrown if the name fallback fails too

        try
        {
            SmilesLoader loader(*_scanner);
            const long long start = _scanner->tell();

            loader.ignore_closing_bond_direction_mismatch = ignore_closing_bond_direction_mismatch;
            loader.stereochemistry_options = stereochemistry_options;
            loader.ignore_cistrans_errors = ignore_cistrans_errors;
            loader.ignore_no_chiral_flag = ignore_no_chiral_flag;

            /*
            If exception is thrown, try the SMARTS, if exception thrown again - the string is rather an IUPAC name than a SMILES string
            We catch it and pass down to IUPAC name conversion
            */
            if (query)
            {
                try
                {
                    loader.loadQueryMolecule(static_cast<QueryMolecule&>(mol));
                }
                catch (Exception&)
                {
                    _scanner->seek(start, SEEK_SET);
                    loader.loadSMARTS(static_cast<QueryMolecule&>(mol));
                }
            }
            else
            {
                loader.loadMolecule(static_cast<Molecule&>(mol));
            }
            return;
        }
        catch (Exception& e)
        {
            err_buf.appendString(e.message(), true);
        }

        // We fall down to IUPAC name conversion if SMILES loading threw an exception.
        // The name parser fills a Molecule, so it is attempted for non-query molecules only:
        // casting a query molecule to Molecule& would be undefined behaviour.
        if (!query)
        {
            try
            {
                Array<char> name;
                // Rewind to where the text content starts (after any BOM), not to absolute offset 0.
                _scanner->seek(content_start, SEEK_SET);
                _scanner->readLine(name, true);
                MoleculeNameParser parser;
                parser.parseMolecule(name.ptr(), static_cast<Molecule&>(mol));
                return;
            }
            catch (Exception&)
            {
                // Not a recognizable name either; report the SMILES/SMARTS error below.
            }
        }

        if (err_buf.size() > 0)
        {
            throw Error(err_buf.ptr());
        }
    }

    // default is Molfile format

    {
        SdfLoader sdf_loader(*_scanner);
        bool is_first = true;
        while (!sdf_loader.isEOF())
        {
            sdf_loader.readNext();

            // Copy properties
            properties.copy(sdf_loader.properties);

            BufferScanner scanner2(sdf_loader.data);

            MolfileLoader loader(scanner2);
            apply_common_options(loader);

            if (is_first && sdf_loader.isEOF())
            {
                // Exactly one record: load it straight into the target molecule.
                load_with(loader, mol);
            }
            else
            {
                // Several records: load each into a fresh molecule of the same kind and merge it in.
                std::unique_ptr<BaseMolecule> mol_fragment(mol.neu());
                load_with(loader, *mol_fragment);
                if (!properties.is_empty() && mol_fragment->vertexCount())
                    mol_fragment->properties().insert(0).copy(properties);
                Array<int> mapping;
                mol.mergeWithMolecule(*mol_fragment, &mapping, 0);
            }
            is_first = false;
        }
    }
}