#pragma once #include #include #ifdef __SSE2__ #include #endif #if defined(__AVX512F__) || defined(__AVX512BW__) || defined(__AVX__) || defined(__AVX2__) #include #endif #if defined(__aarch64__) && defined(__ARM_NEON) # include #endif /// Common helper methods for implementation of different columns. namespace DB { namespace ErrorCodes { extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; extern const int LOGICAL_ERROR; } /// Transform 64-byte mask to 64-bit mask inline UInt64 bytes64MaskToBits64Mask(const UInt8 * bytes64) { #if defined(__AVX512F__) && defined(__AVX512BW__) const __m512i vbytes = _mm512_loadu_si512(reinterpret_cast(bytes64)); UInt64 res = _mm512_testn_epi8_mask(vbytes, vbytes); #elif defined(__AVX__) && defined(__AVX2__) const __m256i zero32 = _mm256_setzero_si256(); UInt64 res = (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi8( _mm256_loadu_si256(reinterpret_cast(bytes64)), zero32))) & 0xffffffff) | (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi8( _mm256_loadu_si256(reinterpret_cast(bytes64+32)), zero32))) << 32); #elif defined(__SSE2__) const __m128i zero16 = _mm_setzero_si128(); UInt64 res = (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(bytes64)), zero16))) & 0xffff) | ((static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(bytes64 + 16)), zero16))) << 16) & 0xffff0000) | ((static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(bytes64 + 32)), zero16))) << 32) & 0xffff00000000) | ((static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(bytes64 + 48)), zero16))) << 48) & 0xffff000000000000); #elif defined(__aarch64__) && defined(__ARM_NEON) const uint8x16_t bitmask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; const auto * src = reinterpret_cast(bytes64); const uint8x16_t p0 = vceqzq_u8(vld1q_u8(src)); const uint8x16_t p1 = vceqzq_u8(vld1q_u8(src + 16)); const uint8x16_t p2 = vceqzq_u8(vld1q_u8(src + 32)); const uint8x16_t p3 = vceqzq_u8(vld1q_u8(src + 48)); uint8x16_t t0 = vandq_u8(p0, bitmask); uint8x16_t t1 = vandq_u8(p1, bitmask); uint8x16_t t2 = vandq_u8(p2, bitmask); uint8x16_t t3 = vandq_u8(p3, bitmask); uint8x16_t sum0 = vpaddq_u8(t0, t1); uint8x16_t sum1 = vpaddq_u8(t2, t3); sum0 = vpaddq_u8(sum0, sum1); sum0 = vpaddq_u8(sum0, sum0); UInt64 res = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); #else UInt64 res = 0; for (size_t i = 0; i < 64; ++i) res |= static_cast(0 == bytes64[i]) << i; #endif return ~res; } /// Counts how many bytes of `filt` are greater than zero. size_t countBytesInFilter(const UInt8 * filt, size_t start, size_t end); size_t countBytesInFilter(const IColumn::Filter & filt); size_t countBytesInFilterWithNull(const IColumn::Filter & filt, const UInt8 * null_map, size_t start, size_t end); /// Returns vector with num_columns elements. vector[i] is the count of i values in selector. /// Selector must contain values from 0 to num_columns - 1. NOTE: this is not checked. std::vector countColumnsSizeInSelector(IColumn::ColumnIndex num_columns, const IColumn::Selector & selector); /// Returns true, if the memory contains only zeros. bool memoryIsZero(const void * data, size_t start, size_t end); bool memoryIsByte(const void * data, size_t start, size_t end, uint8_t byte); /// The general implementation of `filter` function for ColumnArray and ColumnString. template void filterArraysImpl( const PaddedPODArray & src_elems, const IColumn::Offsets & src_offsets, PaddedPODArray & res_elems, IColumn::Offsets & res_offsets, const IColumn::Filter & filt, ssize_t result_size_hint); /// Same as above, but not fills res_offsets. template void filterArraysImplOnlyData( const PaddedPODArray & src_elems, const IColumn::Offsets & src_offsets, PaddedPODArray & res_elems, const IColumn::Filter & filt, ssize_t result_size_hint); namespace detail { template const PaddedPODArray * getIndexesData(const IColumn & indexes); } /// Check limit <= indexes->size() and call column.indexImpl(const PaddedPodArray & indexes, UInt64 limit). template ColumnPtr selectIndexImpl(const Column & column, const IColumn & indexes, size_t limit) { if (limit == 0) limit = indexes.size(); if (indexes.size() < limit) throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of indexes ({}) is less than required ({})", indexes.size(), limit); if (const auto * data_uint8 = detail::getIndexesData(indexes)) return column.template indexImpl(*data_uint8, limit); if (const auto * data_uint16 = detail::getIndexesData(indexes)) return column.template indexImpl(*data_uint16, limit); if (const auto * data_uint32 = detail::getIndexesData(indexes)) return column.template indexImpl(*data_uint32, limit); if (const auto * data_uint64 = detail::getIndexesData(indexes)) return column.template indexImpl(*data_uint64, limit); throw Exception(ErrorCodes::LOGICAL_ERROR, "Indexes column for IColumn::select must be ColumnUInt, got {}", indexes.getName()); } size_t getLimitForPermutation(size_t column_size, size_t perm_size, size_t limit); template ColumnPtr permuteImpl(const Column & column, const IColumn::Permutation & perm, size_t limit) { limit = getLimitForPermutation(column.size(), perm.size(), limit); return column.indexImpl(perm, limit); } /// NOLINTNEXTLINE #define INSTANTIATE_INDEX_IMPL(Column) \ template ColumnPtr Column::indexImpl(const PaddedPODArray & indexes, size_t limit) const; \ template ColumnPtr Column::indexImpl(const PaddedPODArray & indexes, size_t limit) const; \ template ColumnPtr Column::indexImpl(const PaddedPODArray & indexes, size_t limit) const; \ template ColumnPtr Column::indexImpl(const PaddedPODArray & indexes, size_t limit) const; }