#pragma once #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int ILLEGAL_COLUMN; } using Pos = const char *; template class FunctionCountMatches : public IFunction { public: static constexpr auto name = CountMatchesBase::name; static FunctionPtr create(ContextPtr) { return std::make_shared>(); } String getName() const override { return name; } size_t getNumberOfArguments() const override { return 2; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { FunctionArgumentDescriptors args{ {"haystack", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"}, {"pattern", static_cast(&isString), isColumnConst, "constant String"} }; validateFunctionArguments(*this, arguments, args); return std::make_shared(); } DataTypePtr getReturnTypeForDefaultImplementationForDynamic() const override { return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { const IColumn * col_pattern = arguments[1].column.get(); const ColumnConst * col_pattern_const = checkAndGetColumnConst(col_pattern); if (col_pattern_const == nullptr) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Pattern argument is not const"); const OptimizedRegularExpression re = Regexps::createRegexp(col_pattern_const->getValue()); const IColumn * col_haystack = arguments[0].column.get(); OptimizedRegularExpression::MatchVec matches; if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString(col_haystack)) { std::string_view str = col_haystack_const->getDataColumn().getDataAt(0).toView(); uint64_t matches_count = countMatches(str, re, matches); return result_type->createColumnConst(input_rows_count, matches_count); } if (const ColumnString * col_haystack_string = checkAndGetColumn(col_haystack)) { auto col_res = ColumnUInt64::create(); const ColumnString::Chars & src_chars = col_haystack_string->getChars(); const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets(); ColumnUInt64::Container & vec_res = col_res->getData(); vec_res.resize(input_rows_count); ColumnString::Offset current_src_offset = 0; for (size_t i = 0; i < input_rows_count; ++i) { Pos pos = reinterpret_cast(&src_chars[current_src_offset]); current_src_offset = src_offsets[i]; Pos end = reinterpret_cast(&src_chars[current_src_offset]) - 1; std::string_view str(pos, end - pos); vec_res[i] = countMatches(str, re, matches); } return col_res; } if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn(col_haystack)) { auto col_res = ColumnUInt64::create(); ColumnUInt64::Container & vec_res = col_res->getData(); vec_res.resize(input_rows_count); for (size_t i = 0; i < input_rows_count; ++i) { std::string_view str = col_haystack_fixedstring->getDataAt(i).toView(); vec_res[i] = countMatches(str, re, matches); } return col_res; } throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Could not cast haystack argument to String or FixedString"); } static uint64_t countMatches(std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) { /// Only one match is required, no need to copy more. static const unsigned matches_limit = 1; Pos pos = reinterpret_cast(src.data()); Pos end = reinterpret_cast(src.data() + src.size()); uint64_t match_count = 0; while (true) { if (pos >= end) break; if (!re.match(pos, end - pos, matches, matches_limit)) break; /// Progress should be made, but with empty match the progress will not be done. /// Also note that simply check is pattern empty is not enough, /// since for example "'[f]{0}'" will match zero bytes: if (!matches[0].length) break; pos += matches[0].offset + matches[0].length; ++match_count; } return match_count; } }; }