From 8d3efe083522c94a0dcfd31f34a07130b824f215 Mon Sep 17 00:00:00 2001 From: Jessica James Date: Fri, 3 Dec 2021 23:02:53 -0600 Subject: [PATCH] Add text_hash, text_hashi, text_equal, and text_equali structs to help with unordered_maps, particularly case-insensitive lookups --- src/include/jessilib/unicode.hpp | 172 +++++++++++++++++++++- src/include/jessilib/unicode_sequence.hpp | 70 ++++++--- 2 files changed, 219 insertions(+), 23 deletions(-) diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index de812f8..f7e976c 100644 --- a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -710,7 +710,7 @@ constexpr void join_append(T&){}; // noop template constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args) { using InCharT = typename std::remove_cvref_t::value_type; - if constexpr (std::is_same_v::value_type, typename std::remove_cvref_t::value_type>) { + if constexpr (std::is_same_v::value_type, InCharT>) { // Join these straight together out_string += std::forward(in_string); } @@ -738,6 +738,176 @@ OutT join(ArgsT&&... 
args) { return result; } +/** + * Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash + * regardless of underlying encoding + * + * This is not intended for generating hashes of arbitrary data; it's specifically intended for strings of text + */ +struct text_hash { + using is_transparent = std::true_type; + + template + static uint64_t hash(const CharT* data, const CharT* end) { + uint64_t hash = 14695981039346656037ULL; + + get_endpoint_result decode; + while (data != end) { + decode = decode_codepoint({data, static_cast(end - data)}); + if (decode.units == 0) { + return hash; + } + + hash = hash ^ decode.codepoint; + hash = hash * 1099511628211ULL; + data += decode.units; + } + + return hash; + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } +}; + +struct text_equal { + using is_transparent = std::true_type; + + template + 
bool operator()(std::basic_string_view in_lhs, std::basic_string_view in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } + + template + bool operator()(std::basic_string_view in_lhs, const std::basic_string& in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, std::basic_string_view in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, const std::basic_string& in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } +}; + +/** + * Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the + * same hash regardless of underlying encoding or the casing of its values. + * + * This is not intended for generating hashes of arbitrary data; it's specifically intended for strings of text + */ +struct text_hashi { + using is_transparent = std::true_type; + + template + static uint64_t hash(const CharT* data, const CharT* end) { + uint64_t hash = 14695981039346656037ULL; + + get_endpoint_result decode; + while (data != end) { + decode = decode_codepoint({data, static_cast(end - data)}); + if (decode.units == 0) { + return hash; + } + + hash = hash ^ fold(decode.codepoint); + hash = hash * 1099511628211ULL; + data += decode.units; + } + + return hash; + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& 
in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } +}; + +struct text_equali { + using is_transparent = std::true_type; + + template + bool operator()(std::basic_string_view in_lhs, std::basic_string_view in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } + + template + bool operator()(std::basic_string_view in_lhs, const std::basic_string& in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, std::basic_string_view in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, const std::basic_string& in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } +}; + /** to_lower / to_upper */ //char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement diff --git a/src/include/jessilib/unicode_sequence.hpp b/src/include/jessilib/unicode_sequence.hpp index b8c0678..e0a2ce9 100644 --- a/src/include/jessilib/unicode_sequence.hpp +++ b/src/include/jessilib/unicode_sequence.hpp @@ -45,6 +45,41 @@ bool shrink_tree_member_compare(const shrink_sequence_tree_member& in_lhs return in_lhs.first < in_rhs; } +// Lessers on left +template TreeBegin, size_t TreeSize> +constexpr bool is_sorted() { + auto head = TreeBegin; + constexpr auto end = TreeBegin + TreeSize; + + if (head == end) { + return true; + } + + while (head + 1 != end) { + const auto next = head + 1; + if (head->first > next->first) { + return 
false; + } + + ++head; + } + + return true; +} + +// Checks whether or not an escape tree consists solely of ASCII / Basic Latin; NOTE: DOES NOT RECURSE +template TreeBegin, size_t TreeSize> +constexpr bool is_simple() { + // This was going to be used to slightly optimize the searching, until it was realized: + // 1) How small the trees are, making the searches require at most maybe 4 checks in most cases + static_assert(is_sorted(), "Tree must be pre-sorted"); + if constexpr (TreeSize == 0) { + return true; + } + + return TreeBegin[TreeSize - 1].first <= 0x7F; +} + // Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed template SequenceTreeBegin, size_t SequenceTreeSize> bool apply_shrink_sequence_tree(std::basic_string& inout_string) { @@ -274,7 +309,7 @@ constexpr shrink_sequence_tree_member make_tree_sequence_pair() { return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view& read_view) { auto decode = decode_codepoint(read_view); // TODO: make constexpr - constexpr auto SubTreeEnd = SubTreeBegin + SubTreeSize; + constexpr shrink_sequence_tree_member* SubTreeEnd = SubTreeBegin + SubTreeSize; auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare); if (parser == SubTreeEnd || parser->first != decode.codepoint) { if constexpr (FailNotFound) { @@ -299,27 +334,9 @@ constexpr shrink_sequence_tree_member make_tree_sequence_pair() { } }; } -// Lessers on left -template SubTreeBegin, size_t SubTreeSize> -constexpr bool is_sorted() { - auto head = SubTreeBegin; - constexpr auto end = SubTreeBegin + SubTreeSize; - - if (head == end) { - return true; - } - - while (head + 1 != end) { - const auto next = head + 1; - if (head->first > next->first) { - return false; - } - - ++head; - } - - return true; -} +/** + * C++ escape sequence parser + */ template static constexpr shrink_sequence_tree cpp_escapes_main_tree{ @@ -378,4 +395,13 @@ bool 
apply_cpp_escape_sequences(std::basic_string& inout_string) { return apply_shrink_sequence_tree, std::size(cpp_escapes_root_tree)>(inout_string); } +/** + * Query string escape sequence parser + */ + +static constexpr shrink_sequence_tree http_query_escapes_root_tree{ + make_hex_sequence_pair(), + make_simple_sequence_pair() +}; + } // namespace jessilib