From f714a1da00dfd9f1f6ef9a8350099318ddd4734e Mon Sep 17 00:00:00 2001 From: Jessica James Date: Mon, 29 Nov 2021 00:42:59 -0600 Subject: [PATCH] Add find, findi --- src/include/jessilib/unicode.hpp | 184 ++++++++++++++++++++++++++++--- src/test/unicode.cpp | 113 +++++++++++++++++-- 2 files changed, 275 insertions(+), 22 deletions(-) diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index 9c11957..8f8cc33 100644 --- a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -217,17 +217,17 @@ inline bool equalsi(char32_t lhs, char32_t rhs) { auto method(const std::basic_string& lhs, std::basic_string_view rhs) { \ return method(static_cast>(lhs), rhs); } \ template \ - bool method(std::basic_string_view lhs, const std::basic_string& rhs) { \ + auto method(std::basic_string_view lhs, const std::basic_string& rhs) { \ return method(lhs, static_cast>(rhs)); } \ template \ - bool method(const std::basic_string& lhs, const std::basic_string& rhs) { \ + auto method(const std::basic_string& lhs, const std::basic_string& rhs) { \ return method(static_cast>(lhs), static_cast>(rhs)); } /** * Checks if two strings are equal * - * @tparam LhsCharT Unicode codepoint container type for left-hand parameter - * @tparam RhsCharT Unicode codepoint container type for right-hand parameter + * @tparam LhsCharT Character type for left-hand parameter + * @tparam RhsCharT Character type for right-hand parameter * @param lhs First string to compare * @param rhs Second string to compare against * @return True if the strings are equal, false otherwise @@ -267,8 +267,8 @@ ADAPT_BASIC_STRING(equals) /** * Checks if two strings are equal (case insensitive) * - * @tparam LhsCharT Unicode codepoint container type for left-hand parameter - * @tparam RhsCharT Unicode codepoint container type for right-hand parameter + * @tparam LhsCharT Character type for left-hand parameter + * @tparam RhsCharT Character type for right-hand parameter * @param lhs 
First string to compare * @param rhs Second string to compare against * @return True if the strings are equal, false otherwise @@ -310,8 +310,8 @@ ADAPT_BASIC_STRING(equalsi) /** * Checks if a string starts with a substring * - * @tparam LhsCharT Unicode codepoint container type for underlying string - * @tparam RhsCharT Unicode codepoint container type for prefix string + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT Character type for prefix string * @param in_string String to check for prefix * @param in_prefix Substring prefix to check for * @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise @@ -360,15 +360,15 @@ ADAPT_BASIC_STRING(starts_with_length) /** * Checks if a string starts with a substring (case insensitive) * - * @tparam LhsCharT Unicode codepoint container type for underlying string - * @tparam RhsCharT Unicode codepoint container type for prefix string + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT Character type for prefix string * @param in_string String to check for prefix * @param in_prefix Substring prefix to check for * @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise */ template size_t starts_with_lengthi(std::basic_string_view in_string, std::basic_string_view in_prefix) { - // If in_string and in_prefix are the same type, compare their sizes and quickly return if in_string is too small + // If in_string and in_prefix are the same type, skip decoding each point if constexpr (std::is_same_v) { if (in_string.size() < in_prefix.size()) { return 0; @@ -410,8 +410,8 @@ ADAPT_BASIC_STRING(starts_with_lengthi) /** * Checks if a string starts with a substring * - * @tparam LhsCharT Unicode codepoint container type for underlying string - * @tparam RhsCharT Unicode codepoint container type for prefix string + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT 
Character type for prefix string * @param in_string String to check for prefix * @param in_prefix Prefix to check for * @return True if both strings are valid and in_string starts with in_prefix, false otherwise @@ -426,8 +426,8 @@ ADAPT_BASIC_STRING(starts_with) /** * Checks if a string starts with a substring (case insensitive) * - * @tparam LhsCharT Unicode codepoint container type for underlying string - * @tparam RhsCharT Unicode codepoint container type for prefix string + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT Character type for prefix string * @param in_string String to check for prefix * @param in_prefix Prefix to check for * @return True if both strings are valid and in_string starts with in_prefix, false otherwise @@ -439,6 +439,160 @@ bool starts_withi(std::basic_string_view in_string, std::basic_string_ ADAPT_BASIC_STRING(starts_withi) +/** + * Searches a string for a specified codepoint + * + * @tparam LhsCharT Character type of the string being searched + * @tparam CaseSensitive When false, in_codepoint and every decoded codepoint are passed through fold() before comparing + * @param in_string String to search + * @param in_codepoint Codepoint to search for + * @return Index of the first match, counted in data units of in_string; npos if nothing matches or a codepoint fails to decode + */ +template +size_t find(std::basic_string_view in_string, char32_t in_codepoint) { + // If we don't have anything to search through, there's nothing to be found + if (in_string.empty()) { + return decltype(in_string)::npos; + } + + if constexpr (!CaseSensitive) { + in_codepoint = fold(in_codepoint); + } + + size_t codepoints_removed{}; + while (!in_string.empty()) { + std::basic_string_view string = in_string; + get_endpoint_result string_front = decode_codepoint(string); + + if (string_front.units == 0) { + // Failed to decode front codepoint; bad unicode sequence + return decltype(in_string)::npos; + } + + if constexpr (CaseSensitive) { + if (string_front.codepoint == in_codepoint) { + // Match found! 
+ return codepoints_removed; + } + } + else { + if (fold(string_front.codepoint) == in_codepoint) { + // Match found! + return codepoints_removed; + } + } + + // Didn't find a match here; remove the front codepoint and try the next position (the running offset advances by string_front.units) + in_string.remove_prefix(string_front.units); + codepoints_removed += string_front.units; + } + + // We reached the end of in_string before finding the codepoint :( + return decltype(in_string)::npos; +} + +/** + * Searches a string for a specified substring + * + * @tparam LhsCharT Character type of the string being searched + * @tparam RhsCharT Character type of the substring being searched for + * @param in_string String to search + * @param in_substring Substring to search for + * @return Index of the first match, counted in data units of in_string (0 for an empty in_substring); npos if nothing matches or a codepoint fails to decode + */ +template +size_t find(std::basic_string_view in_string, std::basic_string_view in_substring) { + // If we're searching for nothing, then we've found it at the front + if (in_substring.empty()) { + return 0; + } + + // If we don't have anything to search through, there's nothing to be found + if (in_string.empty()) { + return decltype(in_string)::npos; + } + + // TODO: expand this to cover any instance where in_substring is a single codepoint, rather than a single data unit. NOTE(review): in_substring[0] is handed over as a char32_t without being decoded; confirm a lone data unit that is not a complete codepoint cannot reach this branch + if (in_substring.size() == 1) { + return find(in_string, in_substring[0]); + } + + // TODO: optimize for when in_substring is small and of different type, by only decoding it once + + size_t codepoints_removed{}; + while (!in_string.empty()) { + // If in_string and in_substring share a character type, compare their sizes and quickly return if in_string is too small + if constexpr (std::is_same_v) { + if (in_string.size() < in_substring.size()) { + return decltype(in_string)::npos; + } + } + + std::basic_string_view string = in_string; + std::basic_string_view substring = in_substring; + get_endpoint_result string_front; + do { + // TODO: optimize this for when in_string and in_substring are same type, by only decoding 
in_string, solely + // to determine number of data units to compare + string_front = decode_codepoint(string); + get_endpoint_result prefix_front = decode_codepoint(substring); + + if (string_front.units == 0 + || prefix_front.units == 0) { + // Failed to decode the front codepoint of either string; bad unicode sequence + return decltype(in_string)::npos; + } + + if constexpr (CaseSensitive) { + if (string_front.codepoint != prefix_front.codepoint) { + // Codepoints aren't the same; break & try next position + break; + } + } + else { + if (!equalsi(string_front.codepoint, prefix_front.codepoint)) { + // Codepoints don't fold the same; break & try next position + break; + } + } + + // Codepoints are equal; trim off the fronts and continue + string.remove_prefix(string_front.units); + substring.remove_prefix(prefix_front.units); + + if (substring.empty()) { + // We found the substring! We can return our current position + return codepoints_removed; + } + } while (!string.empty()); + + // Didn't find a match here; remove the front codepoint and try the next position. NOTE(review): string_front is whatever the inner loop decoded last, which is the front codepoint only when the very first comparison failed; after a partial match its units may differ from the front codepoint's in a multi-unit encoding, so confirm this cannot skip into the middle of a codepoint + in_string.remove_prefix(string_front.units); + codepoints_removed += string_front.units; + } + + // We reached the end of in_string before finding the substring :( + return decltype(in_string)::npos; +} + +ADAPT_BASIC_STRING(find) + +/** + * Searches a string for a specified substring (case insensitive) + * + * @tparam LhsCharT Character type of the string being searched + * @tparam RhsCharT Character type of the substring being searched for + * @param in_string String to search + * @param in_substring Substring to search for + * @return Character data index on success, npos otherwise + */ +template +size_t findi(std::basic_string_view in_string, std::basic_string_view in_substring) { + return find(in_string, in_substring); +} + +ADAPT_BASIC_STRING(findi) + /** to_lower / to_upper */ //char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement diff --git a/src/test/unicode.cpp 
b/src/test/unicode.cpp index b08745a..aa1796a 100644 --- a/src/test/unicode.cpp +++ b/src/test/unicode.cpp @@ -130,10 +130,10 @@ using char_type_combos = ::testing::Types< std::pair, std::pair, std::pair, std::pair>; template -class UnicodeAbcdTest : public ::testing::Test { +class UnicodeFullTest : public ::testing::Test { public: }; -TYPED_TEST_SUITE(UnicodeAbcdTest, char_type_combos); +TYPED_TEST_SUITE(UnicodeFullTest, char_type_combos); template std::basic_string make_str(const char32_t (&in_str)[InLength]) { @@ -147,7 +147,7 @@ std::basic_string make_str(const char32_t (&in_str)[InLength]) { /** string_cast */ -TYPED_TEST(UnicodeAbcdTest, string_cast) { +TYPED_TEST(UnicodeFullTest, string_cast) { auto abcd_str = make_str(U"ABCD"); std::basic_string_view abcd_string_view = abcd_str; @@ -168,7 +168,7 @@ TEST(UTF8Test, string_view_cast) { /** equals */ -TYPED_TEST(UnicodeAbcdTest, equals) { +TYPED_TEST(UnicodeFullTest, equals) { // TypeParam::first_type == TypeParam::second_type EXPECT_TRUE(equals(make_str(U"ABCD"), make_str(U"ABCD"))); @@ -184,7 +184,7 @@ TYPED_TEST(UnicodeAbcdTest, equals) { /** equalsi */ -TYPED_TEST(UnicodeAbcdTest, equalsi) { +TYPED_TEST(UnicodeFullTest, equalsi) { // TypeParam::first_type == TypeParam::second_type EXPECT_TRUE(equalsi(make_str(U"ABCD"), make_str(U"ABCD"))); @@ -200,7 +200,7 @@ TYPED_TEST(UnicodeAbcdTest, equalsi) { /** starts_with */ -TYPED_TEST(UnicodeAbcdTest, starts_with) { +TYPED_TEST(UnicodeFullTest, starts_with) { // TypeParam::first_type == TypeParam::second_type EXPECT_TRUE(starts_with(make_str(U"ABCD"), make_str(U"ABCD"))); @@ -240,7 +240,7 @@ TYPED_TEST(UnicodeAbcdTest, starts_with) { /** starts_withi */ -TYPED_TEST(UnicodeAbcdTest, starts_withi) { +TYPED_TEST(UnicodeFullTest, starts_withi) { // TypeParam::first_type == TypeParam::second_type EXPECT_TRUE(starts_withi(make_str(U"ABCD"), make_str(U"ABCD"))); @@ -285,6 +285,105 @@ TYPED_TEST(UnicodeAbcdTest, starts_withi) { make_str(U"del"))); } 
+TYPED_TEST(UnicodeFullTest, find) { + auto abcd_str = make_str(U"ABCD"); + + // An empty substring is found at index 0 + EXPECT_EQ(find(abcd_str, make_str(U"")), 0); + + // Single-character substrings; the last one does not occur in the string + EXPECT_EQ(find(abcd_str, make_str(U"A")), 0); + EXPECT_EQ(find(abcd_str, make_str(U"B")), 1); + EXPECT_EQ(find(abcd_str, make_str(U"C")), 2); + EXPECT_EQ(find(abcd_str, make_str(U"D")), 3); + EXPECT_EQ(find(abcd_str, make_str(U"E")), decltype(abcd_str)::npos); + + // Two-character substrings; the last pair never appears adjacently + EXPECT_EQ(find(abcd_str, make_str(U"AB")), 0); + EXPECT_EQ(find(abcd_str, make_str(U"BC")), 1); + EXPECT_EQ(find(abcd_str, make_str(U"CD")), 2); + EXPECT_EQ(find(abcd_str, make_str(U"DA")), decltype(abcd_str)::npos); + + auto double_abcd_str = make_str(U"AABBCCDD"); + + // Single-character substrings; the index of the first occurrence is expected + EXPECT_EQ(find(double_abcd_str, make_str(U"A")), 0); + EXPECT_EQ(find(double_abcd_str, make_str(U"B")), 2); + EXPECT_EQ(find(double_abcd_str, make_str(U"C")), 4); + EXPECT_EQ(find(double_abcd_str, make_str(U"D")), 6); + EXPECT_EQ(find(double_abcd_str, make_str(U"E")), decltype(double_abcd_str)::npos); + + // Two-character substrings, including matches that start on the second of a repeated pair + EXPECT_EQ(find(double_abcd_str, make_str(U"AA")), 0); + EXPECT_EQ(find(double_abcd_str, make_str(U"AB")), 1); + EXPECT_EQ(find(double_abcd_str, make_str(U"BB")), 2); + EXPECT_EQ(find(double_abcd_str, make_str(U"BC")), 3); + EXPECT_EQ(find(double_abcd_str, make_str(U"CC")), 4); + EXPECT_EQ(find(double_abcd_str, make_str(U"CD")), 5); + EXPECT_EQ(find(double_abcd_str, make_str(U"DD")), 6); + EXPECT_EQ(find(double_abcd_str, make_str(U"DA")), decltype(double_abcd_str)::npos); +} + +TYPED_TEST(UnicodeFullTest, findi) { + auto abcd_str = make_str(U"ABCD"); + + // An empty substring is found at index 0 + EXPECT_EQ(findi(abcd_str, make_str(U"")), 0); + + // Single-character substrings, first in upper case and then in lower case + EXPECT_EQ(findi(abcd_str, make_str(U"A")), 0); + EXPECT_EQ(findi(abcd_str, make_str(U"B")), 1); + EXPECT_EQ(findi(abcd_str, make_str(U"C")), 2); + EXPECT_EQ(findi(abcd_str, make_str(U"D")), 3); + EXPECT_EQ(findi(abcd_str, make_str(U"E")), decltype(abcd_str)::npos); + 
EXPECT_EQ(findi(abcd_str, make_str(U"a")), 0); + EXPECT_EQ(findi(abcd_str, make_str(U"b")), 1); + EXPECT_EQ(findi(abcd_str, make_str(U"c")), 2); + EXPECT_EQ(findi(abcd_str, make_str(U"d")), 3); + EXPECT_EQ(findi(abcd_str, make_str(U"e")), decltype(abcd_str)::npos); + + // Two-character substrings, first in upper case and then in lower case + EXPECT_EQ(findi(abcd_str, make_str(U"AB")), 0); + EXPECT_EQ(findi(abcd_str, make_str(U"BC")), 1); + EXPECT_EQ(findi(abcd_str, make_str(U"CD")), 2); + EXPECT_EQ(findi(abcd_str, make_str(U"DA")), decltype(abcd_str)::npos); + EXPECT_EQ(findi(abcd_str, make_str(U"ab")), 0); + EXPECT_EQ(findi(abcd_str, make_str(U"bc")), 1); + EXPECT_EQ(findi(abcd_str, make_str(U"cd")), 2); + EXPECT_EQ(findi(abcd_str, make_str(U"da")), decltype(abcd_str)::npos); + + auto double_abcd_str = make_str(U"AABBCCDD"); + + // Single-character substrings in both cases; the index of the first occurrence is expected + EXPECT_EQ(findi(double_abcd_str, make_str(U"A")), 0); + EXPECT_EQ(findi(double_abcd_str, make_str(U"B")), 2); + EXPECT_EQ(findi(double_abcd_str, make_str(U"C")), 4); + EXPECT_EQ(findi(double_abcd_str, make_str(U"D")), 6); + EXPECT_EQ(findi(double_abcd_str, make_str(U"E")), decltype(double_abcd_str)::npos); + EXPECT_EQ(findi(double_abcd_str, make_str(U"a")), 0); + EXPECT_EQ(findi(double_abcd_str, make_str(U"b")), 2); + EXPECT_EQ(findi(double_abcd_str, make_str(U"c")), 4); + EXPECT_EQ(findi(double_abcd_str, make_str(U"d")), 6); + EXPECT_EQ(findi(double_abcd_str, make_str(U"e")), decltype(double_abcd_str)::npos); + + // Two-character substrings in both cases, including matches that start on the second of a repeated pair + EXPECT_EQ(findi(double_abcd_str, make_str(U"AA")), 0); + EXPECT_EQ(findi(double_abcd_str, make_str(U"AB")), 1); + EXPECT_EQ(findi(double_abcd_str, make_str(U"BB")), 2); + EXPECT_EQ(findi(double_abcd_str, make_str(U"BC")), 3); + EXPECT_EQ(findi(double_abcd_str, make_str(U"CC")), 4); + EXPECT_EQ(findi(double_abcd_str, make_str(U"CD")), 5); + EXPECT_EQ(findi(double_abcd_str, make_str(U"DD")), 6); + EXPECT_EQ(findi(double_abcd_str, make_str(U"DA")), decltype(double_abcd_str)::npos); + EXPECT_EQ(findi(double_abcd_str, make_str(U"aa")), 0); + 
EXPECT_EQ(findi(double_abcd_str, make_str(U"ab")), 1); + EXPECT_EQ(findi(double_abcd_str, make_str(U"bb")), 2); + EXPECT_EQ(findi(double_abcd_str, make_str(U"bc")), 3); + EXPECT_EQ(findi(double_abcd_str, make_str(U"cc")), 4); + EXPECT_EQ(findi(double_abcd_str, make_str(U"cd")), 5); + EXPECT_EQ(findi(double_abcd_str, make_str(U"dd")), 6); + EXPECT_EQ(findi(double_abcd_str, make_str(U"da")), decltype(double_abcd_str)::npos); +} /** * Folding test