From 8d3efe083522c94a0dcfd31f34a07130b824f215 Mon Sep 17 00:00:00 2001
From: Jessica James <jessica.aj@outlook.com>
Date: Fri, 3 Dec 2021 23:02:53 -0600
Subject: [PATCH] Add text_hash, text_hashi, text_equal, and text_equali
 structs to help with unordered_maps, particularly case-insensitive lookups

---
 src/include/jessilib/unicode.hpp          | 172 +++++++++++++++++++++-
 src/include/jessilib/unicode_sequence.hpp |  70 ++++++---
 2 files changed, 219 insertions(+), 23 deletions(-)
diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp
index de812f8..f7e976c 100644
--- a/src/include/jessilib/unicode.hpp
+++ b/src/include/jessilib/unicode.hpp
@@ -710,7 +710,7 @@ constexpr void join_append(T&){}; // noop
 template<typename OutT, typename InT, typename... ArgsT>
 constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args) {
 	using InCharT = typename std::remove_cvref_t<InT>::value_type;
-	if constexpr (std::is_same_v<typename std::remove_cvref_t<OutT>::value_type, typename std::remove_cvref_t<InT>::value_type>) {
+	if constexpr (std::is_same_v<typename std::remove_cvref_t<OutT>::value_type, InCharT>) {
 		// Join these straight together
 		out_string += std::forward<InT>(in_string);
 	}
@@ -738,6 +738,176 @@ OutT join(ArgsT&&... args) {
 	return result;
 }
 
+/**
+ * Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash
+ * regardless of underlying encoding
+ *
+ * This is not intended for generating hashes of arbitrary data; it's specifically intended for strings of text
+ */
+struct text_hash {
+	using is_transparent = std::true_type; // marker type that opts this hasher into heterogeneous (transparent) lookup
+
+	template<typename CharT>
+	static uint64_t hash(const CharT* data, const CharT* end) { // hashes the code units in [data, end), one codepoint at a time
+		uint64_t hash = 14695981039346656037ULL; // 64-bit FNV offset basis
+
+		get_endpoint_result decode;
+		while (data != end) {
+			decode = decode_codepoint({data, static_cast<size_t>(end - data)}); // decode from the units that remain
+			if (decode.units == 0) { // zero units decoded (presumably invalid input - confirm against decode_codepoint); return the hash so far
+				return hash;
+			}
+
+			hash = hash ^ decode.codepoint; // FNV-1a order: XOR the whole codepoint in first...
+			hash = hash * 1099511628211ULL; // ...then multiply by the 64-bit FNV prime
+			data += decode.units; // advance past the units just decoded
+		}
+
+		return hash;
+	}
+
+	auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char> in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char8_t> in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-16
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char16_t> in_key) const noexcept { // ASSUMES UTF-16
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-32
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char32_t> in_key) const noexcept { // ASSUMES UTF-32
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+};
+
+struct text_equal { // equality predicate to pair with text_hash; every overload forwards to equals<>()
+	using is_transparent = std::true_type; // marker type that opts this predicate into heterogeneous (transparent) lookup
+
+	template<typename LhsCharT, typename RhsCharT> // view vs view; the two character types may differ
+	bool operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
+		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+
+	template<typename LhsCharT, typename RhsCharT> // view vs string
+	bool operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
+		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+
+	template<typename LhsCharT, typename RhsCharT> // string vs view
+	bool operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
+		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+
+	template<typename LhsCharT, typename RhsCharT> // string vs string
+	bool operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
+		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+};
+
+/**
+ * Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the
+ * same hash regardless of underlying encoding or the casing of its values.
+ *
+ * This is not intended for generating hashes of arbitrary data; it's specifically intended for strings of text
+ */
+struct text_hashi {
+	using is_transparent = std::true_type; // marker type that opts this hasher into heterogeneous (transparent) lookup
+
+	template<typename CharT>
+	static uint64_t hash(const CharT* data, const CharT* end) { // hashes the code units in [data, end), one folded codepoint at a time
+		uint64_t hash = 14695981039346656037ULL; // 64-bit FNV offset basis
+
+		get_endpoint_result decode;
+		while (data != end) {
+			decode = decode_codepoint({data, static_cast<size_t>(end - data)}); // decode from the units that remain
+			if (decode.units == 0) { // zero units decoded (presumably invalid input - confirm against decode_codepoint); return the hash so far
+				return hash;
+			}
+
+			hash = hash ^ fold(decode.codepoint); // FNV-1a order: XOR the folded codepoint in first, so casing does not affect the hash...
+			hash = hash * 1099511628211ULL; // ...then multiply by the 64-bit FNV prime
+			data += decode.units; // advance past the units just decoded
+		}
+
+		return hash;
+	}
+
+	auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char> in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char8_t> in_key) const noexcept { // ASSUMES UTF-8
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-16
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char16_t> in_key) const noexcept { // ASSUMES UTF-16
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-32
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+
+	auto operator()(std::basic_string_view<char32_t> in_key) const noexcept { // ASSUMES UTF-32
+		return hash(in_key.data(), in_key.data() + in_key.size());
+	}
+};
+
+struct text_equali { // equality predicate to pair with text_hashi; every overload forwards to equalsi<>()
+	using is_transparent = std::true_type; // marker type that opts this predicate into heterogeneous (transparent) lookup
+
+	template<typename LhsCharT, typename RhsCharT> // view vs view; the two character types may differ
+	bool operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
+		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+
+	template<typename LhsCharT, typename RhsCharT> // view vs string
+	bool operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
+		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+
+	template<typename LhsCharT, typename RhsCharT> // string vs view
+	bool operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
+		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+
+	template<typename LhsCharT, typename RhsCharT> // string vs string
+	bool operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
+		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
+	}
+};
+
 /** to_lower / to_upper */
 //char32_t to_lower(char32_t in_chr); // TODO: implement
 //char32_t to_upper(char32_t in_chr); // TODO: implement
diff --git a/src/include/jessilib/unicode_sequence.hpp b/src/include/jessilib/unicode_sequence.hpp
index b8c0678..e0a2ce9 100644
--- a/src/include/jessilib/unicode_sequence.hpp
+++ b/src/include/jessilib/unicode_sequence.hpp
@@ -45,6 +45,41 @@ bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs
 	return in_lhs.first < in_rhs;
 }
 
+// Checks that the tree members' keys (`first`) are in non-descending order, i.e. lessers on left
+template<typename CharT, const shrink_sequence_tree<CharT> TreeBegin, size_t TreeSize>
+constexpr bool is_sorted() {
+	auto head = TreeBegin;
+	constexpr auto end = TreeBegin + TreeSize; // one past the last member
+
+	if (head == end) { // an empty tree counts as sorted
+		return true;
+	}
+
+	while (head + 1 != end) { // walk each adjacent pair of members
+		const auto next = head + 1;
+		if (head->first > next->first) { // only a strictly greater predecessor fails; equal neighbouring keys pass
+			return false;
+		}
+
+		++head;
+	}
+
+	return true;
+}
+
+// Checks whether or not an escape tree consists solely of ASCII / Basic Latin; NOTE: DOES NOT RECURSE
+template<typename CharT, const shrink_sequence_tree<CharT> TreeBegin, size_t TreeSize>
+constexpr bool is_simple() {
+	// This was going to be used to slightly optimize the searching, until it was realized:
+	// 1) How small the trees are, making the searches require at most maybe 4 checks in most cases
+	static_assert(is_sorted<CharT, TreeBegin, TreeSize>(), "Tree must be pre-sorted"); // the last-member check below relies on sort order
+	if constexpr (TreeSize == 0) {
+		return true; // an empty tree is treated as simple
+	}
+
+	return TreeBegin[TreeSize - 1].first <= 0x7F; // tree is sorted, so the last key is the greatest; 0x7F is the highest ASCII value
+}
+
 // Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed
 template<typename CharT, const shrink_sequence_tree<CharT> SequenceTreeBegin, size_t SequenceTreeSize>
 bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
@@ -274,7 +309,7 @@ constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
 	return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) {
 		auto decode = decode_codepoint(read_view); // TODO: make constexpr
 
-		constexpr auto SubTreeEnd = SubTreeBegin + SubTreeSize;
+		constexpr shrink_sequence_tree_member<CharT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
 		auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
 		if (parser == SubTreeEnd || parser->first != decode.codepoint) {
 			if constexpr (FailNotFound) {
@@ -299,27 +334,9 @@ constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
 	} };
 }
 
-// Lessers on left
-template<typename CharT, const shrink_sequence_tree<CharT> SubTreeBegin, size_t SubTreeSize>
-constexpr bool is_sorted() {
-	auto head = SubTreeBegin;
-	constexpr auto end = SubTreeBegin + SubTreeSize;
-
-	if (head == end) {
-		return true;
-	}
-
-	while (head + 1 != end) {
-		const auto next = head + 1;
-		if (head->first > next->first) {
-			return false;
-		}
-
-		++head;
-	}
-
-	return true;
-}
+/**
+ * C++ escape sequence parser
+ */
 
 template<typename CharT>
 static constexpr shrink_sequence_tree<CharT> cpp_escapes_main_tree{
@@ -378,4 +395,13 @@ bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
 	return apply_shrink_sequence_tree<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(inout_string);
 }
 
+/**
+ * Query string escape sequence parser
+ */
+
+static constexpr shrink_sequence_tree<char8_t> http_query_escapes_root_tree{
+	make_hex_sequence_pair<char8_t, U'%', 2, false, false>(), // '%' entry; presumably consumes 2 hex digits - confirm against make_hex_sequence_pair
+	make_simple_sequence_pair<char8_t, U'+', ' '>() // '+' entry; presumably rewritten to a space - confirm. Listed after '%' because '%' (0x25) < '+' (0x2B)
+};
+
 } // namespace jessilib