#pragma once #include "config.h" #if USE_SIMDJSON # include # include # include # include # include "ElementTypes.h" # include # include # include namespace DB { namespace ErrorCodes { extern const int CANNOT_ALLOCATE_MEMORY; } /// Format elements of basic types into string. /// The original implementation is mini_formatter in simdjson.h. But it is not public API, so we /// add a implementation here. class SimdJSONBasicFormatter { public: explicit SimdJSONBasicFormatter(PaddedPODArray & buffer_) : buffer(buffer_) {} void comma() { oneChar(','); } /** Start an array, prints [ **/ void startArray() { oneChar('['); } /** End an array, prints ] **/ void endArray() { oneChar(']'); } /** Start an array, prints { **/ void startObject() { oneChar('{'); } /** Start an array, prints } **/ void endObject() { oneChar('}'); } /** Prints a true **/ void trueAtom() { const char * s = "true"; buffer.insert(s, s + 4); } /** Prints a false **/ void falseAtom() { const char * s = "false"; buffer.insert(s, s + 5); } /** Prints a null **/ void nullAtom() { const char * s = "null"; buffer.insert(s, s + 4); } /** Prints a number **/ void number(int64_t x) { char number_buffer[24]; auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x); buffer.insert(number_buffer, res.ptr); } /** Prints a number **/ void number(uint64_t x) { char number_buffer[24]; auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x); buffer.insert(number_buffer, res.ptr); } /** Prints a number **/ void number(double x) { char number_buffer[24]; auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x); buffer.insert(number_buffer, res.ptr); } /** Prints a key (string + colon) **/ void key(std::string_view unescaped) { string(unescaped); oneChar(':'); } /** Prints a string. The string is escaped as needed. **/ void string(std::string_view unescaped) { oneChar('\"'); size_t i = 0; // Fast path for the case where we have no control character, no ", and no backslash. // This should include most keys. // // We would like to use 'bool' but some compilers take offense to bitwise operation // with bool types. constexpr static char needs_escaping[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; for (; i + 8 <= unescaped.length(); i += 8) { // Poor's man vectorization. This could get much faster if we used SIMD. // // It is not the case that replacing '|' with '||' would be neutral performance-wise. if (needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i + 1])] | needs_escaping[uint8_t(unescaped[i + 2])] | needs_escaping[uint8_t(unescaped[i + 3])] | needs_escaping[uint8_t(unescaped[i + 4])] | needs_escaping[uint8_t(unescaped[i + 5])] | needs_escaping[uint8_t(unescaped[i + 6])] | needs_escaping[uint8_t(unescaped[i + 7])]) { break; } } for (; i < unescaped.length(); i++) { if (needs_escaping[uint8_t(unescaped[i])]) { break; } } // The following is also possible and omits a 256-byte table, but it is slower: // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F) // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {} // At least for long strings, the following should be fast. We could // do better by integrating the checks and the insertion. buffer.insert(unescaped.data(), unescaped.data() + i); // We caught a control character if we enter this loop (slow). // Note that we are do not restart from the beginning, but rather we continue // from the point where we encountered something that requires escaping. for (; i < unescaped.length(); i++) { switch (unescaped[i]) { case '\"': { const char * s = "\\\""; buffer.insert(s, s + 2); } break; case '\\': { const char * s = "\\\\"; buffer.insert(s, s + 2); } break; default: if (uint8_t(unescaped[i]) <= 0x1F) { // If packed, this uses 8 * 32 bytes. // Note that we expect most compilers to embed this code in the data // section. constexpr static simdjson::escape_sequence escaped[32] = { {6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"}, {6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"}, {6, "\\u0007"}, {2, "\\b"}, {2, "\\t"}, {2, "\\n"}, {6, "\\u000b"}, {2, "\\f"}, {2, "\\r"}, {6, "\\u000e"}, {6, "\\u000f"}, {6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"}, {6, "\\u0014"}, {6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"}, {6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"}, {6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}}; auto u = escaped[uint8_t(unescaped[i])]; buffer.insert(u.string, u.string + u.length); } else { oneChar(unescaped[i]); } } // switch } // for oneChar('\"'); } void oneChar(char c) { buffer.push_back(c); } private: PaddedPODArray & buffer; }; /// Format object elements into string, element, array, object, kv-pair. /// Similar to string_builder in simdjson.h. class SimdJSONElementFormatter { public: explicit SimdJSONElementFormatter(PaddedPODArray & buffer_) : format(buffer_) {} /** Append an element to the builder (to be printed) **/ void append(simdjson::dom::element value) { switch (value.type()) { case simdjson::dom::element_type::UINT64: { format.number(value.get_uint64().value_unsafe()); break; } case simdjson::dom::element_type::INT64: { format.number(value.get_int64().value_unsafe()); break; } case simdjson::dom::element_type::DOUBLE: { format.number(value.get_double().value_unsafe()); break; } case simdjson::dom::element_type::STRING: { format.string(value.get_string().value_unsafe()); break; } case simdjson::dom::element_type::BOOL: { if (value.get_bool().value_unsafe()) format.trueAtom(); else format.falseAtom(); break; } case simdjson::dom::element_type::NULL_VALUE: { format.nullAtom(); break; } case simdjson::dom::element_type::ARRAY: { append(value.get_array().value_unsafe()); break; } case simdjson::dom::element_type::OBJECT: { append(value.get_object().value_unsafe()); break; } } } /** Append an array to the builder (to be printed) **/ void append(simdjson::dom::array value) { format.startArray(); auto iter = value.begin(); auto end = value.end(); if (iter != end) { append(*iter); for (++iter; iter != end; ++iter) { format.comma(); append(*iter); } } format.endArray(); } void append(simdjson::dom::object value) { format.startObject(); auto pair = value.begin(); auto end = value.end(); if (pair != end) { append(*pair); for (++pair; pair != end; ++pair) { format.comma(); append(*pair); } } format.endObject(); } void append(simdjson::dom::key_value_pair kv) { format.key(kv.key); append(kv.value); } private: SimdJSONBasicFormatter format; }; /// This class can be used as an argument for the template class FunctionJSON. /// It provides ability to parse JSONs using simdjson library. struct SimdJSONParser { class Array; class Object; /// References an element in a JSON document, representing a JSON null, boolean, string, number, /// array or object. class Element { public: ALWAYS_INLINE Element() {} /// NOLINT ALWAYS_INLINE Element(const simdjson::dom::element & element_) : element(element_) {} /// NOLINT ALWAYS_INLINE ElementType type() const { switch (element.type()) { case simdjson::dom::element_type::INT64: return ElementType::INT64; case simdjson::dom::element_type::UINT64: return ElementType::UINT64; case simdjson::dom::element_type::DOUBLE: return ElementType::DOUBLE; case simdjson::dom::element_type::STRING: return ElementType::STRING; case simdjson::dom::element_type::ARRAY: return ElementType::ARRAY; case simdjson::dom::element_type::OBJECT: return ElementType::OBJECT; case simdjson::dom::element_type::BOOL: return ElementType::BOOL; case simdjson::dom::element_type::NULL_VALUE: return ElementType::NULL_VALUE; } } ALWAYS_INLINE bool isInt64() const { return element.type() == simdjson::dom::element_type::INT64; } ALWAYS_INLINE bool isUInt64() const { return element.type() == simdjson::dom::element_type::UINT64; } ALWAYS_INLINE bool isDouble() const { return element.type() == simdjson::dom::element_type::DOUBLE; } ALWAYS_INLINE bool isString() const { return element.type() == simdjson::dom::element_type::STRING; } ALWAYS_INLINE bool isArray() const { return element.type() == simdjson::dom::element_type::ARRAY; } ALWAYS_INLINE bool isObject() const { return element.type() == simdjson::dom::element_type::OBJECT; } ALWAYS_INLINE bool isBool() const { return element.type() == simdjson::dom::element_type::BOOL; } ALWAYS_INLINE bool isNull() const { return element.type() == simdjson::dom::element_type::NULL_VALUE; } ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); } ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); } ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); } ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); } ALWAYS_INLINE std::string_view getString() const { return element.get_string().value_unsafe(); } ALWAYS_INLINE Array getArray() const; ALWAYS_INLINE Object getObject() const; ALWAYS_INLINE simdjson::dom::element getElement() const { return element; } private: simdjson::dom::element element; }; /// References an array in a JSON document. class Array { public: class Iterator { public: ALWAYS_INLINE Iterator(const simdjson::dom::array::iterator & it_) : it(it_) {} /// NOLINT ALWAYS_INLINE Element operator*() const { return *it; } ALWAYS_INLINE Iterator & operator++() { ++it; return *this; } ALWAYS_INLINE Iterator operator++(int) { auto res = *this; ++it; return res; } /// NOLINT ALWAYS_INLINE friend bool operator!=(const Iterator & left, const Iterator & right) { return left.it != right.it; } ALWAYS_INLINE friend bool operator==(const Iterator & left, const Iterator & right) { return !(left != right); } private: simdjson::dom::array::iterator it; }; ALWAYS_INLINE Array(const simdjson::dom::array & array_) : array(array_) {} /// NOLINT ALWAYS_INLINE Iterator begin() const { return array.begin(); } ALWAYS_INLINE Iterator end() const { return array.end(); } ALWAYS_INLINE size_t size() const { return array.size(); } ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).value_unsafe(); } private: simdjson::dom::array array; }; using KeyValuePair = std::pair; /// References an object in a JSON document. class Object { public: class Iterator { public: ALWAYS_INLINE Iterator(const simdjson::dom::object::iterator & it_) : it(it_) {} /// NOLINT ALWAYS_INLINE KeyValuePair operator*() const { const auto & res = *it; return {res.key, res.value}; } ALWAYS_INLINE Iterator & operator++() { ++it; return *this; } ALWAYS_INLINE Iterator operator++(int) { auto res = *this; ++it; return res; } /// NOLINT ALWAYS_INLINE friend bool operator!=(const Iterator & left, const Iterator & right) { return left.it != right.it; } ALWAYS_INLINE friend bool operator==(const Iterator & left, const Iterator & right) { return !(left != right); } private: simdjson::dom::object::iterator it; }; ALWAYS_INLINE Object(const simdjson::dom::object & object_) : object(object_) {} /// NOLINT ALWAYS_INLINE Iterator begin() const { return object.begin(); } ALWAYS_INLINE Iterator end() const { return object.end(); } ALWAYS_INLINE size_t size() const { return object.size(); } bool find(std::string_view key, Element & result) const { auto x = object.at_key(key); if (x.error()) return false; result = x.value_unsafe(); return true; } /// Optional: Provides access to an object's element by index. KeyValuePair operator[](size_t index) const { assert(index < size()); auto it = object.begin(); while (index--) ++it; const auto & res = *it; return {res.key, res.value}; } private: simdjson::dom::object object; }; /// Parses a JSON document, returns the reference to its root element if succeeded. bool parse(std::string_view json, Element & result) { auto document = parser.parse(json.data(), json.size()); if (document.error()) return false; result = document.value_unsafe(); return true; } /// Optional: Allocates memory to parse JSON documents faster. void reserve(size_t max_size) { if (parser.allocate(max_size) != simdjson::error_code::SUCCESS) throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Couldn't allocate {} bytes when parsing JSON", max_size); } private: simdjson::dom::parser parser; }; inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const { return element.get_array().value_unsafe(); } inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const { return element.get_object().value_unsafe(); } } #endif