#pragma once #include #include #include #include #include #include #include #include namespace DB { class DictionaryHierarchicalParentToChildIndex; using DictionaryHierarchyParentToChildIndexPtr = std::shared_ptr; class DictionaryHierarchicalParentToChildIndex { public: struct KeysRange { UInt32 start_index; UInt32 end_index; }; /// By default we use initial_bytes=4096 in PodArray. /// It might lead to really high memory consumption when arrays are almost empty but there are a lot of them. using Array = PODArray, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>; using ParentToChildIndex = HashMap; explicit DictionaryHierarchicalParentToChildIndex(const ParentToChildIndex & parent_to_children_map_) { size_t parent_to_children_map_size = parent_to_children_map_.size(); keys.reserve(parent_to_children_map_size); parent_to_children_keys_range.reserve(parent_to_children_map_size); for (const auto & [parent, children] : parent_to_children_map_) { size_t keys_size = keys.size(); UInt32 start_index = static_cast(keys_size); UInt32 end_index = start_index + static_cast(children.size()); keys.insert(children.begin(), children.end()); parent_to_children_keys_range[parent] = KeysRange{start_index, end_index}; } } size_t getSizeInBytes() const { return parent_to_children_keys_range.getBufferSizeInBytes() + (keys.size() * sizeof(UInt64)); } /// Map parent key to range of children from keys array HashMap parent_to_children_keys_range; /// Array of keys in hierarchy PaddedPODArray keys; }; namespace detail { struct ElementsAndOffsets { PaddedPODArray elements; PaddedPODArray offsets; }; struct IsKeyValidFuncInterface { bool operator()(UInt64 key [[maybe_unused]]) { return false; } }; struct GetParentKeyFuncInterface { std::optional operator()(UInt64 key [[maybe_unused]]) { return {}; } }; /** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client. * Hierarchy iteration is stopped if key equals null value, get_parent_key_func returns null optional, or hierarchy depth * greater or equal than DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH. * IsKeyValidFunc used for each input hierarchy key, if it returns false result hierarchy for that key will have size 0. * Hierarchy result is ElementsAndOffsets structure, for each element there is hierarchy array, * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). * * Example: * id parent_id * 1 0 * 2 1 * 3 1 * 4 2 * * If hierarchy_null_value will be 0. Requested keys [1, 2, 3, 4, 5]. * Result: [1], [2, 1], [3, 1], [4, 2, 1], [] * Elements: [1, 2, 1, 3, 1, 4, 2, 1] * Offsets: [1, 3, 5, 8, 8] */ template ElementsAndOffsets getHierarchy( const PaddedPODArray & keys, IsKeyValidFunc && is_key_valid_func, GetParentKeyFunc && get_parent_key_func) { size_t hierarchy_keys_size = keys.size(); PaddedPODArray elements; elements.reserve(hierarchy_keys_size); PaddedPODArray offsets; offsets.reserve(hierarchy_keys_size); struct OffsetInArray { size_t offset_index; size_t array_element_offset; }; HashMap already_processes_keys_to_offset; already_processes_keys_to_offset.reserve(hierarchy_keys_size); for (size_t i = 0; i < hierarchy_keys_size; ++i) { auto hierarchy_key = keys[i]; size_t current_hierarchy_depth = 0; bool is_key_valid = is_key_valid_func(hierarchy_key); if (!is_key_valid) { offsets.emplace_back(elements.size()); continue; } while (true) { const auto * it = already_processes_keys_to_offset.find(hierarchy_key); if (it) { const auto & index = it->getMapped(); size_t offset = index.offset_index; bool is_loop = (offset == offsets.size()); if (unlikely(is_loop)) break; size_t array_element_offset = index.array_element_offset; size_t previous_offset_size = offset > 0 ? offsets[offset - 1] : 0; size_t start_index = previous_offset_size + array_element_offset; size_t end_index = offsets[offset]; elements.insertFromItself(elements.begin() + start_index, elements.begin() + end_index); break; } if (current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) break; already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth}; elements.emplace_back(hierarchy_key); ++current_hierarchy_depth; std::optional parent_key = get_parent_key_func(hierarchy_key); if (!parent_key.has_value()) break; hierarchy_key = *parent_key; } offsets.emplace_back(elements.size()); } ElementsAndOffsets result = {std::move(elements), std::move(offsets)}; return result; } /** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column. * If value in result array is 1 that means key from in_keys array is in hierarchy of key from * keys array with same index, 0 otherwise. * For getting hierarchy implementation uses getKeysHierarchy function. * * Not: keys size must be equal to in_keys_size. */ template PaddedPODArray getIsInHierarchy( const PaddedPODArray & keys, const PaddedPODArray & in_keys, IsKeyValidFunc && is_key_valid_func, GetParentKeyFunc && get_parent_func) { assert(keys.size() == in_keys.size()); PaddedPODArray result; result.resize_fill(keys.size()); detail::ElementsAndOffsets hierarchy = detail::getHierarchy( keys, std::forward(is_key_valid_func), std::forward(get_parent_func)); auto & offsets = hierarchy.offsets; auto & elements = hierarchy.elements; for (size_t i = 0; i < offsets.size(); ++i) { size_t i_elements_start = i > 0 ? offsets[i - 1] : 0; size_t i_elements_end = offsets[i]; const auto & key_to_find = in_keys[i]; const auto * begin = elements.begin() + i_elements_start; const auto * end = elements.begin() + i_elements_end; const auto * it = std::find(begin, end, key_to_find); bool contains_key = (it != end); result[i] = contains_key; } return result; } struct GetAllDescendantsStrategy { size_t level = 0; }; struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; }; /** Get descendants for keys iterating the hierarchy from parent to child using parent_to_child hash map provided by client. * GetAllDescendantsStrategy get all descendants for key * GetDescendantsAtSpecificLevelStrategy get descendants only for specific hierarchy level. * Hierarchy result is ElementsAndOffsets structure, for each element there is descendants array, * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). * * @param valid_keys - number of keys that are valid in parent_to_child map * * Example: * id parent_id * 1 0 * 2 1 * 3 1 * 4 2 * * Example. Strategy GetAllDescendantsStrategy. * Requested keys [0, 1, 2, 3, 4]. * Result: [1, 2, 3, 4], [2, 2, 4], [4], [], [] * Elements: [1, 2, 3, 4, 2, 3, 4, 4] * Offsets: [4, 7, 8, 8, 8] * * Example. Strategy GetDescendantsAtSpecificLevelStrategy with level 1. * Requested keys [0, 1, 2, 3, 4]. * Result: [1], [2, 3], [4], [], []; * Offsets: [1, 3, 4, 4, 4]; */ template ElementsAndOffsets getDescendants( const PaddedPODArray & keys, const DictionaryHierarchicalParentToChildIndex & parent_to_child_index, Strategy strategy, size_t & valid_keys) { const auto & parent_to_children_keys_range = parent_to_child_index.parent_to_children_keys_range; const auto & children_keys = parent_to_child_index.keys; /// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants. /// If strategy is GetDescendantsAtSpecificLevelStrategy we does not use cache strategy. size_t keys_size = keys.size(); valid_keys = 0; PaddedPODArray descendants; descendants.reserve(keys_size); PaddedPODArray descendants_offsets; descendants_offsets.reserve(keys_size); struct Range { size_t start_index; size_t end_index; }; static constexpr Int64 key_range_requires_update = -1; HashMap already_processed_keys_to_range [[maybe_unused]]; if constexpr (std::is_same_v) already_processed_keys_to_range.reserve(keys_size); struct KeyAndDepth { UInt64 key; Int64 depth; }; HashSet already_processed_keys_during_loop; already_processed_keys_during_loop.reserve(keys_size); PaddedPODArray next_keys_to_process_stack; next_keys_to_process_stack.reserve(keys_size); Int64 level = static_cast(strategy.level); for (size_t i = 0; i < keys_size; ++i) { const UInt64 & requested_key = keys[i]; if (parent_to_children_keys_range.find(requested_key) == nullptr) { descendants_offsets.emplace_back(descendants.size()); continue; } ++valid_keys; next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0}); /** To cache range for key without recursive function calls and custom stack we put special * signaling value on stack key_range_requires_update. * When we pop such value from stack that means processing descendants for key is finished * and we can update range with end_index. */ while (!next_keys_to_process_stack.empty()) { KeyAndDepth key_to_process = next_keys_to_process_stack.back(); UInt64 key = key_to_process.key; Int64 depth = key_to_process.depth; next_keys_to_process_stack.pop_back(); if constexpr (std::is_same_v) { /// Update end_index for key if (depth == key_range_requires_update) { auto * it = already_processed_keys_to_range.find(key); assert(it); auto & range_to_update = it->getMapped(); range_to_update.end_index = descendants.size(); continue; } } if (unlikely(already_processed_keys_during_loop.find(key) != nullptr)) { next_keys_to_process_stack.clear(); break; } if constexpr (std::is_same_v) { const auto * already_processed_it = already_processed_keys_to_range.find(key); if (already_processed_it) { Range range = already_processed_it->getMapped(); if (unlikely(range.start_index > range.end_index)) { /// Broken range because there was loop already_processed_keys_to_range.erase(key); } else { auto insert_start_iterator = descendants.begin() + range.start_index; auto insert_end_iterator = descendants.begin() + range.end_index; descendants.insertFromItself(insert_start_iterator, insert_end_iterator); continue; } } } const auto * it = parent_to_children_keys_range.find(key); if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) continue; if constexpr (std::is_same_v) { if (depth > level) continue; } if constexpr (std::is_same_v) { /// Put special signaling value on stack and update cache with range start size_t range_start_index = descendants.size(); already_processed_keys_to_range[key].start_index = range_start_index; next_keys_to_process_stack.emplace_back(KeyAndDepth{key, key_range_requires_update}); } already_processed_keys_during_loop.insert(key); ++depth; DictionaryHierarchicalParentToChildIndex::KeysRange children_range = it->getMapped(); for (; children_range.start_index < children_range.end_index; ++children_range.start_index) { auto child_key = children_keys[children_range.start_index]; /// In case of GetAllDescendantsStrategy we add any descendant to result array /// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level if constexpr (std::is_same_v) descendants.emplace_back(child_key); if constexpr (std::is_same_v) { if (depth == level) { descendants.emplace_back(child_key); continue; } } next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth}); } } already_processed_keys_during_loop.clear(); descendants_offsets.emplace_back(descendants.size()); } ElementsAndOffsets result = {std::move(descendants), std::move(descendants_offsets)}; return result; } /// Converts ElementAndOffsets structure into ArrayColumn ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets); } /// Returns hierarchy array column for keys template ColumnPtr getKeysHierarchyArray( const PaddedPODArray & keys, IsKeyValidFunc && is_key_valid_func, GetParentKeyFunc && get_parent_func) { auto elements_and_offsets = detail::getHierarchy( keys, std::forward(is_key_valid_func), std::forward(get_parent_func)); return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); } /// Returns is in hierarchy column for keys template ColumnUInt8::Ptr getKeysIsInHierarchyColumn( const PaddedPODArray & hierarchy_keys, const PaddedPODArray & hierarchy_in_keys, IsKeyValidFunc && is_key_valid_func, GetParentKeyFunc && get_parent_func) { auto is_in_hierarchy_data = detail::getIsInHierarchy( hierarchy_keys, hierarchy_in_keys, std::forward(is_key_valid_func), std::forward(get_parent_func)); auto result = ColumnUInt8::create(); result->getData() = std::move(is_in_hierarchy_data); return result; } /// Returns descendants array column for keys /// /// @param valid_keys - number of keys that are valid in parent_to_child map ColumnPtr getKeysDescendantsArray( const PaddedPODArray & requested_keys, const DictionaryHierarchicalParentToChildIndex & parent_to_child_index, size_t level, size_t & valid_keys); /** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation. * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. * * @param valid_keys - number of keys (from @key_column) for which information about parent exists. * @return ColumnArray with hierarchy arrays for keys from key_column. */ ColumnPtr getKeysHierarchyDefaultImplementation( const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type, size_t & valid_keys); /** Default isInHierarchy implementation for dictionaries that does not have structure with child to parent representation. * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. * * @param valid_keys - number of keys (from @key_column) for which information about parent exists. * @return UInt8 column if key from in_key_column is in key hierarchy from key_column. */ ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( const IDictionary * dictionary, ColumnPtr key_column, ColumnPtr in_key_column, const DataTypePtr & key_type, size_t & valid_keys); }