#pragma once

// NOTE(review): in this copy of the file the targets of the six includes below are missing,
// and several template argument lists (after std::shared_ptr, std::unique_ptr, std::vector,
// std::optional, std::pair, the base class names and the `template` keyword) appear to have
// been stripped. Restore them from the original header before compiling.
#include
#include
#include
#include
#include
#include

namespace DB
{

class CSVFormatReader;

/** Row input format that reads data in CSV format.
  * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
  */
class CSVRowInputFormat : public RowInputFormatWithNamesAndTypes
{
public:
    /** with_names - the first line is a header with column names
      * with_types - the next line is a header with type names
      */
    CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);

    /// Name of this format; always "CSVRowInputFormat".
    String getName() const override { return "CSVRowInputFormat"; }

    /// Buffer management overrides; their behaviour is defined out of line.
    void setReadBuffer(ReadBuffer & in_) override;
    void resetReadBuffer() override;

protected:
    /// Constructor taking a shared buffer together with a caller-supplied format reader.
    CSVRowInputFormat(const Block & header_, std::shared_ptr in_, const Params & params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_, std::unique_ptr format_reader_);

    /// Constructor taking only a shared buffer.
    /// NOTE(review): presumably the public constructor delegates here - confirm in the .cpp.
    CSVRowInputFormat(const Block & header_, std::shared_ptr in_buf_, const Params & params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);

private:
    /// This format always reports that syncing after an error is allowed.
    bool allowSyncAfterError() const override { return true; }
    void syncAfterError() override;

    /// This format always reports that it supports counting rows.
    bool supportsCountRows() const override { return true; }

protected:
    /// Shared buffer held by the format.
    /// NOTE(review): the pointee type is not visible in this copy - confirm against the original.
    std::shared_ptr buf;
};

/// Reader that implements the field/row level CSV operations declared by FormatWithNamesAndTypesReader.
class CSVFormatReader : public FormatWithNamesAndTypesReader
{
public:
    CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings & format_settings_);

    bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
    bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;

    /// Returns true when the character at `pos` is none of: '\n', '\r', the configured CSV
    /// delimiter, a space, or a tab. The first (unnamed) argument is ignored.
    bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t'; }

    bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;

    void skipRow() override;

    /// The column index is ignored; forwards to the parameterless skipField().
    void skipField(size_t /*file_column*/) override { skipField(); }
    void skipField();

    /// Skipping the names row and skipping the types row both delegate to skipHeaderRow().
    void skipHeaderRow();
    void skipNames() override { skipHeaderRow(); }
    void skipTypes() override { skipHeaderRow(); }

    void skipFieldDelimiter() override;
    void skipRowEndDelimiter() override;
    void skipPrefixBeforeHeader() override;

    bool checkForEndOfRow() override;
    bool allowVariableNumberOfColumns() const override;

    /// Names, types and the header-detection row are all read via readHeaderRow().
    std::vector readNames() override { return readHeaderRow(); }
    std::vector readTypes() override { return readHeaderRow(); }

    /// NOTE(review): as written here readHeaderRow() and readRow() make the same call to
    /// readRowImpl(); presumably they differed by a template argument that was lost - confirm.
    std::vector readHeaderRow() { return readRowImpl(); }
    std::vector readRow() { return readRowImpl(); }
    std::vector readRowForHeaderDetection() override { return readHeaderRow(); }

    bool checkForSuffix() override;

    /// Template helpers; their template parameter lists are not visible in this copy.
    template std::vector readRowImpl();

    template String readCSVFieldIntoString();

    void setReadBuffer(ReadBuffer & in_) override;

    /// This reader always uses the CSV escaping rule.
    FormatSettings::EscapingRule getEscapingRule() const override { return FormatSettings::EscapingRule::CSV; }

    bool readFieldImpl(ReadBuffer & istr, DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization);
    bool readFieldOrDefault(DB::IColumn & column, const DB::DataTypePtr & type, const DB::SerializationPtr & serialization);

protected:
    /// Raw pointer to the peekable buffer used by this reader.
    /// NOTE(review): presumably non-owning and re-pointed by setReadBuffer() - confirm in the .cpp.
    PeekableReadBuffer * buf;
};

/// Schema reader for CSV; holds its own PeekableReadBuffer and a CSVFormatReader.
class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
    CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);

private:
    /// Mirrors the csv.allow_variable_number_of_columns format setting.
    bool allowVariableNumberOfColumns() const override { return format_settings.csv.allow_variable_number_of_columns; }

    /// NOTE(review): the return types of the two overrides below are incomplete in this copy
    /// (template arguments of std::optional are missing) - restore from the original.
    std::optional readRowAndGetDataTypesImpl() override;
    std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override;

    /// `buf` is declared before `reader`, so it is constructed first.
    /// NOTE(review): presumably `reader` is constructed over `buf` - confirm in the constructor.
    PeekableReadBuffer buf;
    CSVFormatReader reader;
    DataTypes buffered_types;
};

/// Segmentation engine entry point for CSV input; defined out of line.
/// NOTE(review): the element types of the returned std::pair are missing in this copy, and the
/// exact meaning of min_bytes / min_rows / max_rows is not visible here - see the definition.
std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t min_rows, size_t max_rows, const FormatSettings & settings);

}