#pragma once #include #include #include #include #include namespace DB { /** Different data structures that can be used for aggregation * For efficiency, the aggregation data itself is put into the pool. * Data and pool ownership (states of aggregate functions) * is acquired later - in `convertToBlocks` function, by the ColumnAggregateFunction object. * * Most data structures exist in two versions: normal and two-level (TwoLevel). * A two-level hash table works a little slower with a small number of different keys, * but with a large number of different keys scales better, because it allows * parallelize some operations (merging, post-processing) in a natural way. * * To ensure efficient work over a wide range of conditions, * first single-level hash tables are used, * and when the number of different keys is large enough, * they are converted to two-level ones. * * PS. There are many different approaches to the effective implementation of parallel and distributed aggregation, * best suited for different cases, and this approach is just one of them, chosen for a combination of reasons. */ using AggregatedDataWithoutKey = AggregateDataPtr; using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize; using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap; using AggregatedDataWithUInt32Key = HashMap>; using AggregatedDataWithUInt64Key = HashMap>; using AggregatedDataWithShortStringKey = StringHashMap; using AggregatedDataWithStringKey = HashMapWithSavedHash; using AggregatedDataWithKeys128 = HashMap; using AggregatedDataWithKeys256 = HashMap; using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap; using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap; /** Variants with better hash function, using more than 32 bits for hash. * Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, * but we keep in memory and merge only sub-partition of them simultaneously. * TODO We need to switch for better hash function not only for external aggregation, * but also for huge aggregation results on machines with terabytes of RAM. */ using AggregatedDataWithUInt64KeyHash64 = HashMap>; using AggregatedDataWithStringKeyHash64 = HashMapWithSavedHash; using AggregatedDataWithKeys128Hash64 = HashMap; using AggregatedDataWithKeys256Hash64 = HashMap; template struct AggregationDataWithNullKey : public Base { using Base::Base; bool & hasNullKeyData() { return has_null_key_data; } AggregateDataPtr & getNullKeyData() { return null_key_data; } bool hasNullKeyData() const { return has_null_key_data; } const AggregateDataPtr & getNullKeyData() const { return null_key_data; } size_t size() const { return Base::size() + (has_null_key_data ? 1 : 0); } bool empty() const { return Base::empty() && !has_null_key_data; } void clear() { Base::clear(); has_null_key_data = false; } void clearAndShrink() { Base::clearAndShrink(); has_null_key_data = false; } private: bool has_null_key_data = false; AggregateDataPtr null_key_data = nullptr; }; template struct AggregationDataWithNullKeyTwoLevel : public Base { using Base::Base; using Base::impls; AggregationDataWithNullKeyTwoLevel() = default; template explicit AggregationDataWithNullKeyTwoLevel(const Other & other) : Base(other) { impls[0].hasNullKeyData() = other.hasNullKeyData(); impls[0].getNullKeyData() = other.getNullKeyData(); } bool & hasNullKeyData() { return impls[0].hasNullKeyData(); } AggregateDataPtr & getNullKeyData() { return impls[0].getNullKeyData(); } bool hasNullKeyData() const { return impls[0].hasNullKeyData(); } const AggregateDataPtr & getNullKeyData() const { return impls[0].getNullKeyData(); } }; template using HashTableWithNullKey = AggregationDataWithNullKey>; template using StringHashTableWithNullKey = AggregationDataWithNullKey>; using AggregatedDataWithNullableUInt8Key = AggregationDataWithNullKey; using AggregatedDataWithNullableUInt16Key = AggregationDataWithNullKey; using AggregatedDataWithNullableUInt32Key = AggregationDataWithNullKey; using AggregatedDataWithNullableUInt64Key = AggregationDataWithNullKey; using AggregatedDataWithNullableStringKey = AggregationDataWithNullKey; using AggregatedDataWithNullableShortStringKey = AggregationDataWithNullKey; using AggregatedDataWithNullableUInt32KeyTwoLevel = AggregationDataWithNullKeyTwoLevel< TwoLevelHashMap, TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>; using AggregatedDataWithNullableUInt64KeyTwoLevel = AggregationDataWithNullKeyTwoLevel< TwoLevelHashMap, TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>; using AggregatedDataWithNullableShortStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel< TwoLevelStringHashMap>; using AggregatedDataWithNullableStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel< TwoLevelHashMapWithSavedHash, TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>; }