#pragma once #include #include #include #include /// TLDType namespace DB { struct FirstSignificantSubdomainDefaultLookup { bool operator()(StringRef host) const { return tldLookup::isValid(host.data, host.size); } }; template struct ExtractFirstSignificantSubdomain { static size_t getReserveLengthForElement() { return 10; } static void execute(const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) { FirstSignificantSubdomainDefaultLookup loookup; return execute(loookup, data, size, res_data, res_size, out_domain_end); } template static void execute(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) { res_data = data; res_size = 0; Pos tmp; size_t domain_length; ExtractDomain::execute(data, size, tmp, domain_length); if (domain_length == 0) return; if (out_domain_end) *out_domain_end = tmp + domain_length; /// cut useless dot if (tmp[domain_length - 1] == '.') --domain_length; res_data = tmp; res_size = domain_length; const auto * begin = tmp; const auto * end = begin + domain_length; std::array last_periods{}; const auto * pos = find_first_symbols<'.'>(begin, end); while (pos < end) { last_periods[2] = last_periods[1]; last_periods[1] = last_periods[0]; last_periods[0] = pos; pos = find_first_symbols<'.'>(pos + 1, end); } if (!last_periods[0]) return; if (!last_periods[1]) { res_size = last_periods[0] - begin; return; } if (!last_periods[2]) last_periods[2] = begin - 1; const auto * end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end); if (!end_of_level_domain) { end_of_level_domain = end; } size_t host_len = static_cast(end_of_level_domain - last_periods[1] - 1); StringRef host{last_periods[1] + 1, host_len}; if (lookup(host)) { res_data += last_periods[2] + 1 - begin; res_size = last_periods[1] - last_periods[2] - 1; } else { res_data += last_periods[1] + 1 - begin; res_size = last_periods[0] - last_periods[1] - 1; } } /// The difference with execute() is due to custom TLD list can have records of any level, /// not only 2-nd level (like non-custom variant), so it requires more lookups. template static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) { res_data = data; res_size = 0; Pos tmp; size_t domain_length; ExtractDomain::execute(data, size, tmp, domain_length); if (domain_length == 0) return; if (out_domain_end) *out_domain_end = tmp + domain_length; /// cut useless dot if (tmp[domain_length - 1] == '.') --domain_length; res_data = tmp; res_size = domain_length; const auto * begin = tmp; const auto * end = begin + domain_length; std::array last_periods{}; last_periods[0] = begin - 1; StringRef excluded_host{}; const auto * pos = find_first_symbols<'.'>(begin, end); while (pos < end) { size_t host_len = static_cast(end - pos - 1); StringRef host{pos + 1, host_len}; TLDType tld_type = lookup(host); switch (tld_type) { case TLDType::TLD_NONE: break; case TLDType::TLD_REGULAR: res_data += last_periods[0] + 1 - begin; res_size = end - 1 - last_periods[0]; return; case TLDType::TLD_ANY: { StringRef regular_host{last_periods[0] + 1, static_cast(end - 1 - last_periods[0])}; if (last_periods[1] && excluded_host != regular_host) { /// Return TLD_REGULAR + 1 res_data += last_periods[1] + 1 - begin; res_size = end - 1 - last_periods[1]; } else { /// Same as TLD_REGULAR res_data += last_periods[0] + 1 - begin; res_size = end - 1 - last_periods[0]; } return; } case TLDType::TLD_EXCLUDE: excluded_host = host; break; } last_periods[1] = last_periods[0]; last_periods[0] = pos; pos = find_first_symbols<'.'>(pos + 1, end); } /// - if there is domain of the first level (i.e. no dots in the hostname) -> /// return nothing if (last_periods[0] == begin - 1) return; /// - if there is domain of the second level -> /// always return itself /// /// - if there is domain of the 3+ level, and zero records in TLD list -> /// fallback to domain of the second level res_data += last_periods[1] + 1 - begin; res_size = last_periods[0] - last_periods[1] - 1; } }; }