#pragma once #include #include #include #include #include #include namespace DB { /* * Class for parsing data in BSON format. * Each row is parsed as a separate BSON document. * Each column is parsed as a single field with column name as a key. * It uses the following correspondence between BSON types and ClickHouse types: * * BSON Type | ClickHouse Type * \x01 double | Float32/Float64 * \x02 string | String/FixedString * \x03 document | Map/Named Tuple * \x04 array | Array/Tuple * \x05 binary, \x00 binary subtype | String/FixedString * \x05 binary, \x02 old binary subtype | String/FixedString * \x05 binary, \x03 old uuid subtype | UUID * \x05 binary, \x04 uuid subtype | UUID * \x07 ObjectId | String * \x08 boolean | Bool * \x09 datetime | DateTime64 * \x0A null value | NULL * \x0D JavaScript code | String * \x0E symbol | String/FixedString * \x10 int32 | Int32/Decimal32 * \x12 int64 | Int64/Decimal64/DateTime64 * \x11 uint64 | UInt64 * * Other BSON types are not supported. * Also, we perform conversion between different integer types * (for example, you can insert BSON int32 value into ClickHouse UInt8) * Big integers and decimals Int128/UInt128/Int256/UInt256/Decimal128/Decimal256 * can be parsed from BSON Binary value with \x00 binary subtype. In this case * we validate that the size of binary data equals the size of expected value. * * Note: this format will not work on Big-Endian platforms. */ class ReadBuffer; class BSONEachRowRowInputFormat final : public IRowInputFormat { public: BSONEachRowRowInputFormat( ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_); String getName() const override { return "BSONEachRowRowInputFormat"; } void resetParser() override; private: bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool allowSyncAfterError() const override { return true; } void syncAfterError() override; bool supportsCountRows() const override { return true; } size_t countRows(size_t max_block_size) override; size_t columnIndex(const StringRef & name, size_t key_index); using ColumnReader = std::function; bool readField(IColumn & column, const DataTypePtr & data_type, BSONType bson_type); void skipUnknownField(BSONType type, const String & key_name); void readTuple(IColumn & column, const DataTypePtr & data_type, BSONType bson_type); void readArray(IColumn & column, const DataTypePtr & data_type, BSONType bson_type); void readMap(IColumn & column, const DataTypePtr & data_type, BSONType bson_type); const FormatSettings format_settings; /// Buffer for the read from the stream field name. Used when you have to copy it. String current_key_name; /// Set of columns for which the values were read. The rest will be filled with default values. std::vector read_columns; /// Set of columns which already met in row. Exception is thrown if there are more than one column with the same name. std::vector seen_columns; /// These sets may be different, because if null_as_default=1 read_columns[i] will be false and seen_columns[i] will be true /// for row like {..., "non-nullable column name" : null, ...} /// Hash table match `field name -> position in the block`. Block::NameMap name_map; /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. std::vector prev_positions; DataTypes types; size_t current_document_start; BSONSizeT current_document_size; }; class BSONEachRowSchemaReader : public IRowWithNamesSchemaReader { public: BSONEachRowSchemaReader(ReadBuffer & in_, const FormatSettings & settings_); private: NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; NamesAndTypesList getDataTypesFromBSONDocument(bool skip_unsupported_types); DataTypePtr getDataTypeFromBSONField(BSONType type, bool skip_unsupported_types, bool & skip); }; }