core/indigo-core/common/base_cpp/scanner.cpp (780 lines of code) (raw):

/**************************************************************************** * Copyright (C) from 2009 to Present EPAM Systems. * * This file is part of Indigo toolkit. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ***************************************************************************/ #include <algorithm> #include <ctype.h> #include <errno.h> #include <limits> #include <stdarg.h> #include <stdio.h> #include <string.h> #include <array> #include <cppcodec/base64_default_rfc4648.hpp> #include "base_c/defs.h" #include "base_cpp/scanner.h" #include "base_cpp/tlscont.h" #include "reusable_obj_array.h" using namespace indigo; enum { MAX_LINE_LENGTH = 1048576 }; IMPL_ERROR(Scanner, "scanner"); Scanner::~Scanner() { } bool Scanner::isEOL() { return isEOF() || lookNext() == '\n' || lookNext() == '\r'; } int Scanner::readIntFix(int digits) { int result; char buf[20]; if (digits >= NELEM(buf) - 1) throw Error("readIntFix(): digits = %d", digits); read(digits, buf); buf[digits] = 0; char* end; result = strtol(buf, &end, 10); // Check that some digits were read if (buf == end) throw Error("readIntFix(%d): invalid number representation: \"%s\"", digits, buf); // Check that the unread part contains only spaces while (end != buf + digits) { if (!isspace(*end)) throw Error("readIntFix(%d): invalid number representation: \"%s\"", digits, buf); end++; } return result; } int Scanner::readInt1(void) { QS_DEF(Array<char>, buf); char c; int result; buf.clear(); skipSpace(); while (!isEOF()) { c = readChar(); if (!isdigit(c) && c != '-' && c != '+') break; buf.push(c); if (buf.size() > MAX_LINE_LENGTH) throw Error("Line length is too long. Probably the file format is not correct."); } buf.push(0); if (sscanf(buf.ptr(), "%d", &result) < 1) throw Error("readInt(): error parsing %s", buf.ptr()); return result; } int Scanner::readInt(void) { QS_DEF(Array<char>, buf); char c; int result; buf.clear(); c = readChar(); if (c == '+' || c == '-' || isdigit(c)) buf.push(c); while (isdigit(lookNext())) { buf.push(readChar()); if (buf.size() > MAX_LINE_LENGTH) throw Error("Line length is too long. Probably the file format is not correct."); } buf.push(0); if (sscanf(buf.ptr(), "%d", &result) < 1) throw Error("readInt(): error parsing %s", buf.ptr()); return result; } // Try to read unsigned int. Return readed value, on error return -1 and restore position int Scanner::tryReadUnsigned() { int result = 0; bool was_digit = false; long long pos = tell(); while (!isEOF()) { char c = readChar(); if (isdigit(c)) { was_digit = true; result = (int)(c - '0') + result * 10; } else { seek(-1, SEEK_CUR); break; } } if (!was_digit) { seek(pos, SEEK_SET); return -1; } return result; } int Scanner::readUnsigned() { int result = tryReadUnsigned(); if (result < 0) throw Error("readUnsigned(): no digits"); return result; } // This very basic floating-point number parser was written // to avoid locale problems on various platforms. bool Scanner::_readDouble(double& res, int max) { res = 0; bool plus = false; bool minus = false; bool digit = false; bool e = false; double denom = 0; int cnt = 0; while (1) { if (max > 0 && cnt == max) break; char c = (char)lookNext(); if (c == -1) // EOF break; if (c == '+') { if (plus || minus || digit || denom > 1) return false; plus = true; } else if (c == '-') { if (plus || minus || digit || denom > 1) return false; minus = true; } else if (isdigit(c)) { if (denom > 1) { res += (c - '0') / (double)denom; denom *= 10; } else res = res * 10 + (c - '0'); digit = true; } else if (c == '.') { if (denom > 1) return false; denom = 10; } else if (c == 'E' || c == 'e') { skip(1); e = true; break; } else if (isspace(c)) { if (plus || minus || digit || denom > 1) break; } else break; skip(1); cnt++; } if (minus) res *= -1; if (e) { int exponent = readInt(); if (exponent > 0) { while (exponent-- > 0) res *= 10; } while (exponent++ < 0) res /= 10; } return digit; } float Scanner::readFloat(void) { double res; if (!_readDouble(res, 0)) throw Error("readFloat(): error parsing"); return (float)res; } bool Scanner::tryReadFloat(float& value) { long long pos = tell(); double res; if (!_readDouble(res, 0)) { seek(pos, SEEK_SET); return false; } value = (float)res; return true; } void Scanner::readWord(std::string& word, const char* delimiters) { Array<char> buf; readWord(buf, delimiters); word = buf.ptr(); } void Scanner::readWord(Array<char>& word, const char* delimiters) { word.clear(); if (isEOF()) throw Error("readWord(): end of stream"); do { int next = lookNext(); if (next == -1) break; if (delimiters == 0 && isspace((char)next)) break; if (delimiters != 0 && strchr(delimiters, (char)next) != NULL) break; word.push(readChar()); if (word.size() > MAX_LINE_LENGTH) throw Error("Line length is too long. Probably the file format is not correct."); } while (!isEOF()); word.push(0); } float Scanner::readFloatFix(int digits) { long long pos = tell(); double res; if (!_readDouble(res, digits)) throw Error("readFloatFix(): error parsing"); long long rest = tell() - pos - digits; // Check that the unread part contains only spaces while (rest-- > 0LL) { if (!isspace(readChar())) throw Error("readFloatFix(): garbage after the number"); } return (float)res; } char Scanner::readChar() { char c; read(sizeof(char), &c); return c; } byte Scanner::readByte() { byte c; read(1, &c); return c; } bool Scanner::skipLine() { char c; if (isEOF()) return false; while (!isEOF()) { c = readChar(); if (c == '\n') { if (lookNext() == '\r') skip(1); return true; } if (c == '\r') { if (lookNext() == '\n') skip(1); return true; } } return false; } void Scanner::read(int length, Array<char>& buf) { buf.resize(length); read(length, buf.ptr()); } void Scanner::skipSpace() { while (isspace(lookNext())) skip(1); } void Scanner::skipBom() { long long pos = tell(); const int kBOMSize = 3; const std::array<unsigned char, kBOMSize> kBOM = {0xEF, 0xBB, 0xBF}; if (length() >= kBOMSize) { std::array<unsigned char, kBOMSize> bom; readCharsFix(kBOMSize, (char*)bom.data()); if (bom != kBOM) seek(pos, SEEK_SET); } } bool Scanner::startsWith(const char* word) { if (word == nullptr) return false; long long pos = tell(); char ch = *word; while (ch != 0 && !isEOF() && ch == readChar()) { word++; ch = *word; } seek(pos, SEEK_SET); return ch == 0; } void Scanner::skipUntil(const char* delimiters) { while (strchr(delimiters, lookNext()) == nullptr) skip(1); } void Scanner::appendLine(Array<char>& out, bool append_zero) { if (isEOF()) throw Error("appendLine(): end of stream"); if (out.size() > 0) while (out.top() == 0) out.pop(); do { char c = readChar(); if (c == '\r') { if (lookNext() == '\n') continue; break; } if (c == '\n') break; out.push(c); if (out.size() > MAX_LINE_LENGTH) throw Error("Line length is too long. Probably the file format is not correct."); } while (!isEOF()); if (append_zero) out.push(0); } void Scanner::readLine(Array<char>& out, bool append_zero) { out.clear(); appendLine(out, append_zero); } void Scanner::readCharsFix(int n, char* chars_out) { read(n, chars_out); } int Scanner::readCharsFlexible(int n, char* chars_out) { int i = 0; while ((i < n) && !isEOF()) { chars_out[i++] = readChar(); } return i; } word Scanner::readBinaryWord() { word res; read(sizeof(word), &res); return res; } dword Scanner::readBinaryDword() { dword res; read(sizeof(dword), &res); return res; } int Scanner::readBinaryInt() { int res; read(sizeof(int), &res); return res; //*res = ntohl(*res); } float Scanner::readBinaryFloat() { float res; read(sizeof(float), &res); return res; } short Scanner::readPackedShort() { byte high = readByte(); if (high < 128) return high; byte low = readByte(); high -= 128; return high * (short)256 + low; } unsigned int Scanner::readPackedUInt() { unsigned int value = 0; int shift = 0; while (true) { byte cur = readByte(); value |= (cur & 0x7F) << shift; if (!(cur & 0x80)) return value; shift += 7; } } void Scanner::readAll(std::string& str) { const long long size = length() - tell(); const int max_int = std::numeric_limits<int>::max(); if (size > max_int) { throw Error("Cannot read more than %d into memory", max_int); } str.resize(static_cast<size_t>(size)); read(static_cast<int>(str.size()), &str[0]); } void Scanner::readAll(Array<char>& arr) { const long long size = length() - tell(); const int max_int = std::numeric_limits<int>::max(); if (size > max_int) { throw Error("Cannot read more than %d into memory", max_int); } arr.clear_resize(static_cast<int>(size)); read(arr.size(), arr.ptr()); } bool Scanner::isSingleLine(Scanner& scanner) { long long pos = scanner.tell(); scanner.skipLine(); bool res = scanner.isEOF(); scanner.seek(pos, SEEK_SET); return res; } // // FileScanner // FileScanner::FileScanner(Encoding filename_encoding, const char* filename) { _init(filename_encoding, filename); } FileScanner::FileScanner(const char* format, ...) { char filename[1024]; va_list args; va_start(args, format); vsnprintf(filename, sizeof(filename), format, args); va_end(args); _init(ENCODING_ASCII, filename); } void FileScanner::_init(Encoding filename_encoding, const char* filename) { _file = 0; _file_len = 0LL; if (filename == 0) throw Error("null filename"); _file = openFile(filename_encoding, filename, "rb"); if (_file == NULL) throw Error("can't open file %s. Error: %s", filename, strerror(errno)); #ifdef _WIN32 _fseeki64(_file, 0LL, SEEK_END); _file_len = _ftelli64(_file); _fseeki64(_file, 0LL, SEEK_SET); #else fseeko(_file, 0LL, SEEK_END); _file_len = ftello(_file); fseeko(_file, 0LL, SEEK_SET); #endif _invalidateCache(); } int FileScanner::lookNext() { _validateCache(); if (_cache_pos == _max_cache) return -1; return _cache[_cache_pos]; } void FileScanner::_invalidateCache() { _max_cache = 0; _cache_pos = 0; } void FileScanner::_validateCache() { if (_cache_pos < _max_cache) return; size_t nread = fread(_cache, 1, NELEM(_cache), _file); _max_cache = static_cast<int>(nread); _cache_pos = 0; } long long FileScanner::tell() { _validateCache(); #ifdef _WIN32 return _ftelli64(_file) - _max_cache + _cache_pos; #else return ftello(_file) - _max_cache + _cache_pos; #endif } void FileScanner::read(int length, void* res) { int to_read_from_cache = std::min(length, _max_cache - _cache_pos); memcpy(res, _cache + _cache_pos, to_read_from_cache); _cache_pos += to_read_from_cache; if (to_read_from_cache != length) { int left = length - to_read_from_cache; size_t nread = fread((char*)res + to_read_from_cache, 1, left, _file); if (nread != (size_t)left) throw Error("FileScanner::read() error"); } } bool FileScanner::isEOF() { if (_file == NULL) return true; if (_cache_pos < _max_cache) return false; return tell() == _file_len; } void FileScanner::skip(int n) { _validateCache(); _cache_pos += n; if (_cache_pos > _max_cache) { int delta = _cache_pos - _max_cache; #ifdef _WIN32 long long res = _fseeki64(_file, delta, SEEK_CUR); #else long long res = fseeko(_file, delta, SEEK_CUR); #endif _invalidateCache(); if (res != 0LL) throw Error("skip() passes after end of file"); } } void FileScanner::seek(long long pos, int from) { #ifdef _WIN32 if (from == SEEK_CUR) _fseeki64(_file, pos - _max_cache + _cache_pos, from); else _fseeki64(_file, pos, from); #else if (from == SEEK_CUR) fseeko(_file, pos - _max_cache + _cache_pos, from); else fseeko(_file, pos, from); #endif _invalidateCache(); } long long FileScanner::length() { return _file_len; } char FileScanner::readChar() { _validateCache(); if (_cache_pos == _max_cache) throw Error("readChar() passes after end of file"); return _cache[_cache_pos++]; } FileScanner::~FileScanner() { if (_file != NULL) fclose(_file); } // // BufferScanner // void BufferScanner::_init(const char* buffer, int size) { if (size < -1 || (size > 0 && buffer == 0)) throw Error("incorrect parameters in BufferScanner constructor"); if (_is_base64) { std::string encoded(buffer, size); auto decoded = base64::decode(encoded.c_str(), encoded.size()); _base64_buffer.copy(reinterpret_cast<const char*>(decoded.data()), static_cast<int>(decoded.size())); _buffer = _base64_buffer.ptr(); _size = _base64_buffer.size(); } else { _buffer = buffer; _size = size; } _offset = 0; } BufferScanner::BufferScanner(const char* buffer, int buffer_size, bool is_base64) : _is_base64(is_base64) { _init(buffer, buffer_size); } BufferScanner::BufferScanner(const byte* buffer, int buffer_size, bool is_base64) : _is_base64(is_base64) { _init((const char*)buffer, buffer_size); } BufferScanner::BufferScanner(const char* str, bool is_base64) : _is_base64(is_base64) { if (str == 0) throw Error("null input"); _init(str, (int)strlen(str)); } BufferScanner::BufferScanner(const Array<char>& arr, bool is_base64) : _is_base64(is_base64) { _init(arr.ptr(), arr.size()); } BufferScanner::~BufferScanner() { } bool BufferScanner::isEOF() { if (_size < 0) throw Error("isEOF() called to unlimited buffer"); return _offset >= _size; } void BufferScanner::read(int length, void* res) { if (_size >= 0 && _offset + length > _size) throw Error("BufferScanner::read() error"); memcpy(res, &_buffer[_offset], length); _offset += length; } int BufferScanner::lookNext() { if (_size >= 0 && _offset >= _size) return -1; return _buffer[_offset]; } long long BufferScanner::length() { return _size; } long long BufferScanner::tell() { return _offset; } const void* BufferScanner::curptr() { return _buffer + _offset; } void BufferScanner::skip(int n) { _offset += n; if (_size >= 0 && _offset > _size) throw Error("skip() passes after end of buffer"); } void BufferScanner::seek(long long pos, int from) { if (from == SEEK_SET) _offset = static_cast<int>(pos); else if (from == SEEK_CUR) _offset += static_cast<int>(pos); else // SEEK_END { if (_size < 0) throw Error("can not seek from end: buffer is unlimited"); _offset = _size - static_cast<int>(pos); } if ((_size >= 0 && _offset > _size) || _offset < 0) throw Error("size = %d, offset = %d after seek()", _size, _offset); } byte BufferScanner::readByte() { if (_size >= 0 && _offset >= _size) throw Error("readByte(): end of buffer"); return _buffer[_offset++]; } void Scanner::_prefixFunction(Array<char>& str, Array<int>& prefix) { prefix.clear(); prefix.push(0); int i, k = 0; for (i = 1; i < str.size(); i++) { while ((k > 0) && (str[k] != str[i])) k = prefix[k - 1]; if (str[k] == str[i]) k++; prefix.push(k); } } bool Scanner::findWord(const char* word) { QS_DEF(ReusableObjArray<Array<char>>, strs); strs.clear(); Array<char>& str = strs.push(); str.readString(word, false); return findWord(strs) == 0; } int Scanner::findWord(ReusableObjArray<Array<char>>& words) { if (isEOF()) return -1; QS_DEF(ReusableObjArray<Array<int>>, prefixes); QS_DEF(Array<int>, pos); int i; long long pos_saved = tell(); prefixes.clear(); pos.clear(); for (i = 0; i < words.size(); i++) { _prefixFunction(words[i], prefixes.push()); pos.push(0); } while (!isEOF()) { int c = readChar(); for (i = 0; i < words.size(); i++) { while (pos[i] > 0 && words[i][pos[i]] != c) pos[i] = prefixes[i][pos[i] - 1]; if (words[i][pos[i]] == c) pos[i]++; if (pos[i] == words[i].size()) { seek(-words[i].size(), SEEK_CUR); return i; } } } seek(pos_saved, SEEK_SET); return -1; } bool Scanner::findWordIgnoreCase(const char* word) { QS_DEF(ReusableObjArray<Array<char>>, strs); strs.clear(); Array<char>& str = strs.push(); str.readString(word, false); return findWordIgnoreCase(strs) == 0; } int Scanner::findWordIgnoreCase(ReusableObjArray<Array<char>>& words) { if (isEOF()) return -1; QS_DEF(ReusableObjArray<Array<int>>, prefixes); QS_DEF(Array<int>, pos); int i; long long pos_saved = tell(); prefixes.clear(); pos.clear(); for (i = 0; i < words.size(); i++) { _prefixFunction(words[i], prefixes.push()); pos.push(0); } while (!isEOF()) { int c = readChar(); for (i = 0; i < words.size(); i++) { int c1 = ::tolower(words[i][pos[i]]); int c2 = ::tolower(c); while (pos[i] > 0 && c1 != c2) pos[i] = prefixes[i][pos[i] - 1]; if (c1 == c2) pos[i]++; if (pos[i] == words[i].size()) { seek(-words[i].size(), SEEK_CUR); return i; } } } seek(pos_saved, SEEK_SET); return -1; }