#pragma once #include #include #include #include #include #include #include #include #ifdef __SSE4_2__ # include #endif namespace DB { // used by FunctionsStringSimilarity and FunctionsStringHash // includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word struct ExtractStringImpl { static const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end) { // jump separators while (pos < end && isUTF8Sep(*pos)) ++pos; // word start from here const UInt8 * word_start = pos; while (pos < end && !isUTF8Sep(*pos)) ++pos; return word_start; } // we use ASCII non-alphanum character as UTF8 separator static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } // read one UTF8 character static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end) { size_t length = UTF8::seqLength(*pos); if (pos + length > end) length = end - pos; pos += length; } }; }