core/indigo-core/molecule/molecule_cdxml_loader.h (735 lines of code) (raw):

/**************************************************************************** * Copyright (C) from 2009 to Present EPAM Systems. * * This file is part of Indigo toolkit. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ***************************************************************************/ #ifndef __cdxml_loader__ #define __cdxml_loader__ #include <functional> #include <regex> #include <sstream> #include <string> #include <tinyxml2.h> #include <unordered_map> #include <unordered_set> #include <vector> #include "base_cpp/array.h" #include "base_cpp/exception.h" #include "common/utils/emf_utils.h" #include "elements.h" #include "molecule/base_molecule.h" #include "molecule/meta_commons.h" #include "molecule/molecule_stereocenter_options.h" #include "molecule/query_molecule.h" typedef unsigned short int UINT16; typedef int INT32; typedef unsigned int UINT32; #include "CDXCommons.h" #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 4201) #endif namespace tinyxml2 { class XMLHandle; class XMLElement; class XMLNode; class XMLAttribute; } namespace indigo { class Scanner; class Molecule; class QueryMolecule; class AutoInt { public: AutoInt() : val(0){}; AutoInt(int v) : val(v){}; AutoInt(const std::string& v) : val(std::stoi(v)) { } operator int() const { return val; } operator std::string() const { return std::to_string(val); } private: int val; }; union CDXMLFontStyle { CDXMLFontStyle(unsigned int val) : face(val) { } struct { unsigned int is_bold : 1; unsigned int is_italic : 1; unsigned int is_underline : 1; unsigned int is_outline : 1; unsigned int is_shadow : 1; unsigned int is_subscript : 1; unsigned int is_superscript : 1; }; unsigned int face; }; struct _ExtConnection { int bond_id; int point_id; int atom_idx; }; struct CdxmlNode { CdxmlNode() : element(ELEM_C), type(kCDXNodeType_Element), enchanced_stereo(EnhancedStereoType::UNSPECIFIED), is_not_list(false), has_fragment(false) // Carbon by default { } AutoInt id; std::string label; AutoInt element; Vec3f pos; int type; AutoInt isotope; AutoInt charge; AutoInt radical; AutoInt valence; AutoInt hydrogens; AutoInt stereo; EnhancedStereoType enchanced_stereo; AutoInt enhanced_stereo_group; AutoInt index; AutoInt geometry; AutoInt alt_group_id; AutoInt rg_index; bool is_not_list; bool has_fragment; std::vector<AutoInt> element_list; std::unordered_map<int, std::size_t> bond_id_to_connection_idx; std::unordered_map<int, std::size_t> node_id_to_connection_idx; std::vector<_ExtConnection> connections; std::vector<int> ext_connections; std::vector<int> inner_nodes; }; struct CdxmlBond { CdxmlBond() : order(1), stereo(0), dir(0), display(0), display2(0), topology(0), reaction_center(0), swap_bond(false) { } AutoInt id; std::pair<AutoInt, AutoInt> be; AutoInt order; AutoInt stereo; AutoInt dir; AutoInt display; AutoInt display2; AutoInt topology; AutoInt reaction_center; bool swap_bond; }; struct CdxmlBracket { CdxmlBracket() : repeat_pattern(RepeatingUnit::HEAD_TO_TAIL), usage(kCDXBracketUsage_Generic), is_superatom(false) { } std::vector<AutoInt> bracketed_list; int usage; AutoInt repeat_count; int repeat_pattern; std::string label; bool is_superatom; }; struct CdxmlText { CdxmlText(const Vec3f& pos, const Vec2f& size, const std::string& text) : pos(pos), size(size), text(text) { } std::string text; Vec3f pos; Vec2f size; }; class BaseCDXProperty { public: virtual bool hasContent() const = 0; virtual std::unique_ptr<BaseCDXProperty> copy() = 0; virtual std::unique_ptr<BaseCDXProperty> next() = 0; virtual std::string name() const = 0; virtual std::string value() const = 0; }; static constexpr uint32_t tag_size = sizeof(uint16_t); static constexpr uint32_t id_size = sizeof(uint32_t); class CDXMLProperty : public BaseCDXProperty { public: DECL_ERROR; CDXMLProperty(const tinyxml2::XMLAttribute* attribute) : _attribute(attribute){}; virtual bool hasContent() const override { return _attribute != nullptr; } virtual std::unique_ptr<BaseCDXProperty> copy() override { return std::make_unique<CDXMLProperty>(_attribute); } virtual std::unique_ptr<BaseCDXProperty> next() override { return std::make_unique<CDXMLProperty>(attribute()->Next()); } virtual std::string name() const override { return attribute()->Name(); } virtual std::string value() const override { return attribute()->Value(); } protected: const tinyxml2::XMLAttribute* attribute() const { if (_attribute == nullptr) throw Error("Null property"); return _attribute; } private: const tinyxml2::XMLAttribute* _attribute; }; class CDXElement; class CDXProperty : public BaseCDXProperty { public: DECL_ERROR; CDXProperty(CDXElement* parent) : CDXProperty(parent, 0, nullptr) { } CDXProperty(CDXElement* parent, uint16_t tag, const uint8_t* data, uint32_t size = 0); bool hasContent() const override { return _data != nullptr; } std::unique_ptr<BaseCDXProperty> copy() override { return std::make_unique<CDXProperty>(_parent, _tag, _data, _size); } std::unique_ptr<BaseCDXProperty> next() override { return nextProp(); } std::unique_ptr<CDXProperty> nextProp(); std::string name() const override { auto it = KCDXPropToName.find(_tag); return it == KCDXPropToName.end() ? std::string{} : it->second.first; } std::string value() const override { auto it = KCDXPropToName.find(_tag); if (it != KCDXPropToName.end()) { if (_size == 0) return ""; auto prop_type = it->second.second; return formatValue(prop_type); } std::stringstream ss; std::vector<uint8_t> val_dump(_data, _data + _size); ss << "raw value:" << std::hex; for (auto val : val_dump) ss << std::setw(2) << std::setfill('0') << (int)val << " "; return ss.str(); } uint16_t tag() { return _tag; } // protected: std::string formatValue(ECDXType cdx_type) const; std::string parseCDXUINT16(uint16_t val) const; std::string parseCDXINT16(int16_t val) const; std::string parseCDXINT32(int32_t val) const; std::string parseCDXINT8(int8_t val) const; CDXElement* _parent; uint16_t _tag; const uint8_t* _data; uint32_t _size; }; class CDXIdProperty : public CDXProperty { public: CDXIdProperty(CDXElement* parent, const uint8_t* data) : CDXProperty(parent, 0, data, id_size){}; std::unique_ptr<BaseCDXProperty> copy() override { return std::make_unique<CDXIdProperty>(_parent, _data); } std::string name() const override { return "id"; } std::string value() const override { return formatValue(ECDXType::CDXObjectID); } }; class CDXStyleProperty : public CDXProperty { public: CDXStyleProperty(CDXElement* parent, const uint8_t* data, uint8_t prop_index) : CDXProperty(parent, 0xffff, data, sizeof(uint16_t)), _prop_index(prop_index){}; std::unique_ptr<BaseCDXProperty> copy() override { return std::make_unique<CDXStyleProperty>(_parent, _data, _prop_index); } std::unique_ptr<BaseCDXProperty> next() { uint8_t next_index = _prop_index + 1; if (next_index < KStyleProperties.size()) return std::make_unique<CDXStyleProperty>(_parent, _data + sizeof(uint16_t), next_index); else return std::make_unique<CDXProperty>(_parent); } std::string name() const override { return KStyleProperties[_prop_index]; } std::string value() const override { uint16_t style_prop = *reinterpret_cast<const uint16_t*>(_data); if (_prop_index == kCDXMLStyleSizeIndex) style_prop /= kCDXMLSizeMultiplier; return parseCDXUINT16(style_prop); } protected: uint8_t _prop_index; }; class BaseCDXElement { public: virtual bool hasContent() = 0; virtual std::unique_ptr<BaseCDXElement> copy() = 0; virtual std::unique_ptr<BaseCDXProperty> firstProperty() = 0; virtual std::unique_ptr<BaseCDXProperty> findProperty(const std::string& name) = 0; virtual std::unique_ptr<BaseCDXElement> firstChildElement() = 0; virtual std::unique_ptr<BaseCDXElement> nextSiblingElement() = 0; virtual std::string name() = 0; virtual std::string value() = 0; virtual std::string getText() = 0; }; class CDXMLElement : public BaseCDXElement { public: DECL_ERROR; CDXMLElement(const tinyxml2::XMLElement* xml) : _xml(xml){}; bool hasContent() override { return _xml != nullptr; } std::unique_ptr<BaseCDXElement> copy() override { return std::make_unique<CDXMLElement>(_xml); } std::unique_ptr<BaseCDXProperty> firstProperty() override { return std::make_unique<CDXMLProperty>(xml()->FirstAttribute()); }; std::unique_ptr<BaseCDXProperty> findProperty(const std::string& name) override { return std::make_unique<CDXMLProperty>(xml()->FindAttribute(name.c_str())); } std::unique_ptr<BaseCDXElement> firstChildElement() override { return std::make_unique<CDXMLElement>(xml()->FirstChildElement()); } std::unique_ptr<BaseCDXElement> nextSiblingElement() override { return std::make_unique<CDXMLElement>(xml()->NextSiblingElement()); } std::string name() override { return std::string(xml()->Name()); } std::string value() override { return std::string(xml()->Value()); } std::string getText() override { std::string result; auto ptext = xml()->GetText(); if (ptext) result = ptext; return result; } protected: const tinyxml2::XMLElement* xml() { if (_xml == nullptr) throw Error("Null element"); return _xml; } private: const tinyxml2::XMLElement* _xml; }; class CDXElement : public BaseCDXElement { public: DECL_ERROR; CDXElement() : CDXElement(0, nullptr) { } CDXElement(uint16_t tag, const uint8_t* data, uint32_t size = 0) : _tag(tag), _data(data), _data_size(size) { } CDXElement(const void* data, size_t size = 0) : _data_size(static_cast<uint32_t>(size)) { _data = get_uint16_t(static_cast<const uint8_t*>(data), _tag); } bool hasContent() override { return _data != nullptr; } std::unique_ptr<BaseCDXElement> copy() override { return std::make_unique<CDXElement>(_tag, _data, _data_size); } std::unique_ptr<BaseCDXProperty> firstProperty() override { return std::make_unique<CDXIdProperty>(this, _data); } static const uint8_t* get_property_size(const uint8_t* data, uint32_t& size) { size = *reinterpret_cast<const uint16_t*>(data); data += sizeof(uint16_t); if (0xFFFF == size) { size = *reinterpret_cast<const uint32_t*>(data); data += sizeof(uint32_t); } return data; } static const uint8_t* get_uint16_t(const uint8_t* data, uint16_t& tag) { tag = *reinterpret_cast<const uint16_t*>(data); return data + sizeof(uint16_t); } std::unique_ptr<CDXProperty> getProperty(const uint8_t* data) { uint16_t tag; const uint8_t* ptr = get_uint16_t(data, tag); while (tag >= kCDXTag_Object || tag == kCDXProp_Text) // skip child objects { if (tag == kCDXProp_Text) { uint32_t size; ptr = get_property_size(ptr, size); ptr += size; } else ptr = skipObject(ptr); ptr = get_uint16_t(ptr, tag); } if (tag == 0) // End of object - return empty property return std::make_unique<CDXProperty>(this); uint32_t size; ptr = get_property_size(ptr, size); return std::make_unique<CDXProperty>(this, tag, ptr, size); } std::unique_ptr<BaseCDXProperty> findProperty(const std::string& name) override { auto first_prop = firstProperty(); if (first_prop->name() == name) return first_prop; auto it = KCDXNameToProp.find(name); if (it != KCDXNameToProp.end()) return findProperty(it->second.first); throw Error("Property %s not found", name.c_str()); } std::unique_ptr<CDXProperty> findProperty(uint16_t tag) { for (auto prop = getProperty(_data + id_size); prop->hasContent(); prop = prop->nextProp()) if (prop->tag() == tag) return prop; return std::make_unique<CDXProperty>(this); } std::unique_ptr<BaseCDXElement> firstChildElement() override { return getChild(_data + id_size); // _data pointed to object id, object content just after id } std::unique_ptr<BaseCDXElement> nextSiblingElement() override { return next(); } std::unique_ptr<CDXElement> next() { return getChild(skipObject(_data)); // return first object after this } std::unique_ptr<CDXElement> getChild(const uint8_t* ptr); std::string name() override { if (_tag == 0) return "CDXML"; auto it = KCDXObjToName.find(_tag); if (it != KCDXObjToName.end()) return it->second; return std::string{}; } std::string value() override { return name(); } std::string getText() override { switch (_tag) { case kCDXObj_Text: { for (auto child = getChild(_data + id_size); child->hasContent(); child = child->next()) if (child->tag() == kCDXProp_Text) return child->getText(); auto text_prop = findProperty(kCDXProp_Text); if (text_prop->hasContent()) return text_prop->value(); } default: return name(); break; } return std::string{}; } uint16_t tag() { return _tag; } protected: static const uint8_t* skipProperty(const uint8_t* ptr) { uint32_t size = 0; ptr = get_property_size(ptr, size); // skip size ptr += size; // skip content return ptr; // points to the next property or object } static const uint8_t* skipObject(const uint8_t* ptr) { ptr += id_size; // skip tag and id uint16_t tag; while (tag = *reinterpret_cast<const uint16_t*>(ptr)) { if (tag < kCDXTag_Object) ptr = skipProperty(ptr + tag_size); else ptr = skipObject(ptr + tag_size); } return ptr + tag_size; // skip terminating zero tag } uint16_t _tag; const uint8_t* _data; uint32_t _data_size; }; class CDXTextElement : public CDXElement { public: CDXTextElement(uint16_t tag, const uint8_t* data, uint32_t size, uint16_t style_index) : CDXElement(tag, data, size), _style_index(style_index) { _style_count = *reinterpret_cast<const uint16_t*>(_data); _text_start = reinterpret_cast<const char*>(data); _text_len = size; if (_style_count > 0) { uint32_t styles_size = _style_count * sizeof(CDXTextStyle); if (styles_size < size + sizeof(_style_count)) // Some CDXString contains no style and no style-count property { _text_start += styles_size + sizeof(_style_count); _text_len -= styles_size + sizeof(_style_count); } else _style_count = 0; } else { _text_start += sizeof(_style_count); _text_len -= sizeof(_style_count); } }; std::unique_ptr<BaseCDXElement> copy() override { return std::make_unique<CDXTextElement>(_tag, _data, _data_size, _style_index); } std::unique_ptr<BaseCDXProperty> firstProperty() override { if (style_count() > 0 && _style_index < style_count()) { // offset = sizeof(style_count)+sizeof(previous styles)+sizeof(text_offset) size_t offset = sizeof(uint16_t) + _style_index * sizeof(CDXTextStyle) + sizeof(uint16_t); return std::make_unique<CDXStyleProperty>(this, _data + offset, 0); } return std::make_unique<CDXProperty>(this); } std::unique_ptr<BaseCDXElement> firstChildElement() override { return std::make_unique<CDXElement>(); // no child objects in } uint16_t style_count() { return _style_count; } std::unique_ptr<BaseCDXElement> nextSiblingElement() override { if (style_count() > _style_index + 1) return std::make_unique<CDXTextElement>(_tag, _data, _data_size, _style_index + 1); return getChild(_data + _data_size); // return first object after this } std::string name() override { return "s"; } std::string getText() override { long text_len = _text_len; const char* text_start = _text_start; if (_style_count > 0) { const CDXTextStyle* ptext_styles = reinterpret_cast<const CDXTextStyle*>(get_uint16_t(_data, _style_count)); text_start += ptext_styles[_style_index].offset; if ((_style_index + 1) < _style_count) text_len = ptext_styles[_style_index + 1].offset - ptext_styles[_style_index].offset; else text_len -= ptext_styles[_style_index].offset; } return std::string(text_start, text_len); } protected: int _style_index; uint16_t _style_count; const char* _text_start; uint32_t _text_len; }; class CDXReader { public: CDXReader(Scanner& scanner); virtual std::unique_ptr<BaseCDXElement> rootElement() { return std::make_unique<CDXElement>(_buffer.data(), _buffer.size()); } virtual void process() { } Scanner& scanner() { return _scanner; } virtual ~CDXReader(){}; protected: std::string _buffer; Scanner& _scanner; }; class CDXMLReader : public CDXReader { public: DECL_ERROR; CDXMLReader(Scanner& scanner) : CDXReader(scanner) { } void process() override { _xml.Parse(_buffer.c_str()); if (_xml.Error()) throw Error("XML parsing error: %s", _xml.ErrorStr()); } std::unique_ptr<BaseCDXElement> rootElement() override { return std::make_unique<CDXMLElement>(_xml.RootElement()); } ~CDXMLReader() { } private: tinyxml2::XMLDocument _xml; }; class MoleculeCdxmlLoader { public: struct ImageDescriptor { ImageDescriptor(EmbeddedImageObject::ImageFormat iformat, Rect2f& rc, const std::string& raw_data) : image_format(iformat), bbox(rc), data(raw_data) { } EmbeddedImageObject::ImageFormat image_format; Rect2f bbox; std::string data; }; struct EnhancedStereoCenter { EnhancedStereoCenter(int atom, int type_id, int group_num) : atom_idx(atom), type(type_id), group(group_num) { } int atom_idx; int type; int group; }; DECL_ERROR; MoleculeCdxmlLoader(Scanner& scanner, bool is_binary = false, bool is_fragment = false); void loadMolecule(BaseMolecule& mol, bool load_arrows = false); void loadMoleculeFromFragment(BaseMolecule& mol, BaseCDXElement& elem); static void applyDispatcher(BaseCDXProperty& prop, const std::unordered_map<std::string, std::function<void(const std::string&)>>& dispatcher); void parseCDXMLAttributes(BaseCDXProperty& prop); void parseBBox(const std::string& data, Rect2f& bbox); void parsePos(const std::string& data, Vec3f& bbox); void parseSeg(const std::string& data, Vec2f& v1, Vec2f& v2); void parseHex(const std::string& hex, std::string& binary); StereocentersOptions stereochemistry_options; bool ignore_bad_valence; Rect2f cdxml_bbox; AutoInt cdxml_bond_length; std::vector<CdxmlNode> nodes; std::vector<CdxmlBond> bonds; std::vector<CdxmlBracket> brackets; std::vector<CdxmlText> text_objects; static const int SCALE = 30; protected: void _initMolecule(BaseMolecule& mol); void _parseCollections(BaseMolecule& mol); void _checkFragmentConnection(int node_id, int bond_id); void _parseNode(CdxmlNode& node, BaseCDXElement& elem); void _addNode(CdxmlNode& node); void _parseBond(CdxmlBond& bond, BaseCDXProperty& prop); void _parseBracket(CdxmlBracket& bracket, BaseCDXProperty& prop); void _parseText(BaseCDXElement& elem, std::vector<CdxmlText>& text_parsed); void _parseLabel(BaseCDXElement& elem, std::string& label); void _parseGraphic(BaseCDXElement& elem); void _parseArrow(BaseCDXElement& elem); void _parseAltGroup(BaseCDXElement& elem); void _parseEmbeddedObject(BaseCDXElement& elem); int _addBond(Molecule& mol, const CdxmlBond& bond, int begin, int end); void _addAtomsAndBonds(BaseMolecule& mol, const std::vector<int>& atoms, const std::vector<CdxmlBond>& new_bonds); void _addBracket(BaseMolecule& mol, const CdxmlBracket& bracket); void _handleSGroup(SGroup& sgroup, const std::unordered_set<int>& atoms, BaseMolecule& bmol); void _processEnhancedStereo(BaseMolecule& mol); void _parseCDXMLPage(BaseCDXElement& elem); void _parseCDXMLElements(BaseCDXElement& elem, bool no_siblings = false, bool inside_fragment_node = false); void _parseFragmentAttributes(BaseCDXProperty& prop); void _gunzip(Scanner& scanner, Array<char>& dataBuf); std::string _inflate(const char* data, size_t dataLength); void _appendQueryAtom(const char* atom_label, std::unique_ptr<QueryMolecule::Atom>& atom); void _updateConnection(const CdxmlNode& node, int atom_idx); Molecule* _pmol; QueryMolecule* _pqmol; std::unordered_map<int, int> _id_to_atom_idx; std::unordered_map<int, std::size_t> _id_to_node_index; std::unordered_map<int, std::size_t> _id_to_bond_index; std::vector<int> _fragment_nodes; std::vector<Vec2f> _pluses; std::vector<ImageDescriptor> _images; std::unordered_map<int, std::pair<std::pair<Vec3f, Vec3f>, int>> _arrows; std::vector<std::pair<std::pair<Vec3f, Vec3f>, int>> _graphic_arrows; std::unordered_set<int> _retro_arrows_graph_id; std::vector<std::pair<std::pair<Vec2f, Vec2f>, int>> _primitives; std::vector<EnhancedStereoCenter> _stereo_centers; Scanner& _scanner; bool _is_binary; bool _is_fragment; bool _has_bounding_box; bool _has_scheme; private: MoleculeCdxmlLoader(const MoleculeCdxmlLoader&); // no implicit copy }; } // namespace indigo #ifdef _MSC_VER #pragma warning(pop) #endif #endif