Browse Source

Add text_hash, text_hashi, text_equal, and text_equali structs to help with unordered_maps, particularly case-insensitive lookups

master
Jessica James 3 years ago
parent
commit
8d3efe0835
  1. 172
      src/include/jessilib/unicode.hpp
  2. 70
      src/include/jessilib/unicode_sequence.hpp

172
src/include/jessilib/unicode.hpp

@ -710,7 +710,7 @@ constexpr void join_append(T&){}; // noop
template<typename OutT, typename InT, typename... ArgsT>
constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args) {
using InCharT = typename std::remove_cvref_t<InT>::value_type;
if constexpr (std::is_same_v<typename std::remove_cvref_t<OutT>::value_type, typename std::remove_cvref_t<InT>::value_type>) {
if constexpr (std::is_same_v<typename std::remove_cvref_t<OutT>::value_type, InCharT>) {
// Join these straight together
out_string += std::forward<InT>(in_string);
}
@ -738,6 +738,176 @@ OutT join(ArgsT&&... args) {
return result;
}
/**
* Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash
* regardless of underlying encoding
*
* This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text
*/
struct text_hash {
using is_transparent = std::true_type;
template<typename CharT>
static uint64_t hash(const CharT* data, const CharT* end) {
uint64_t hash = 14695981039346656037ULL;
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint({data, static_cast<size_t>(end - data)});
if (decode.units == 0) {
return hash;
}
hash = hash ^ decode.codepoint;
hash = hash * 1099511628211ULL;
data += decode.units;
}
return hash;
}
auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char8_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char16_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char32_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
};
/**
 * Transparent equality functor for text keys.
 *
 * Every overload forwards to equals<LhsCharT, RhsCharT>; together they accept any pairing of std::basic_string and
 * std::basic_string_view, and the two sides may use different character types.
 */
struct text_equal {
	// Opts in to heterogeneous lookup for containers which honor is_transparent
	using is_transparent = std::true_type;

	// string == string
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept -> bool {
		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// string == string_view
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept -> bool {
		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// string_view == string
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept -> bool {
		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// string_view == string_view
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept -> bool {
		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}
};
/**
* Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the
* same hash regardless of underlying encoding or the casing of its values.
*
* This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text
*/
struct text_hashi {
using is_transparent = std::true_type;
template<typename CharT>
static uint64_t hash(const CharT* data, const CharT* end) {
uint64_t hash = 14695981039346656037ULL;
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint({data, static_cast<size_t>(end - data)});
if (decode.units == 0) {
return hash;
}
hash = hash ^ fold(decode.codepoint);
hash = hash * 1099511628211ULL;
data += decode.units;
}
return hash;
}
auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char8_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char16_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char32_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
};
/**
 * Transparent equality functor for text keys, built on equalsi.
 *
 * Every overload forwards to equalsi<LhsCharT, RhsCharT>; together they accept any pairing of std::basic_string and
 * std::basic_string_view, and the two sides may use different character types.
 */
struct text_equali {
	// Opts in to heterogeneous lookup for containers which honor is_transparent
	using is_transparent = std::true_type;

	// string == string
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept -> bool {
		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// string == string_view
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept -> bool {
		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// string_view == string
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept -> bool {
		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// string_view == string_view
	template<typename LhsCharT, typename RhsCharT>
	auto operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept -> bool {
		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}
};
/** to_lower / to_upper */
//char32_t to_lower(char32_t in_chr); // TODO: implement
//char32_t to_upper(char32_t in_chr); // TODO: implement

70
src/include/jessilib/unicode_sequence.hpp

@ -45,6 +45,41 @@ bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs
return in_lhs.first < in_rhs;
}
/**
 * Verifies that a tree's entries are ordered with lessers on the left: no entry's `first` may be greater than the
 * `first` of the entry that follows it. Empty and single-entry trees are considered sorted.
 *
 * @tparam TreeBegin Start of the tree to check
 * @tparam TreeSize Number of entries in the tree
 * @return true if the tree is in non-descending order, false otherwise
 */
template<typename CharT, const shrink_sequence_tree<CharT> TreeBegin, size_t TreeSize>
constexpr bool is_sorted() {
	constexpr auto tree_end = TreeBegin + TreeSize;

	// Nothing to compare in an empty tree
	if (TreeBegin == tree_end) {
		return true;
	}

	// Compare every entry against its right-hand neighbor
	for (auto current = TreeBegin; current + 1 != tree_end; ++current) {
		const auto neighbor = current + 1;
		if (current->first > neighbor->first) {
			return false;
		}
	}

	return true;
}
/**
 * Checks whether or not an escape tree consists solely of ASCII / Basic Latin; NOTE: DOES NOT RECURSE
 *
 * Because the tree is required to be pre-sorted, only the final (greatest) entry needs to be inspected. An empty
 * tree is considered simple.
 *
 * This was originally going to be used to slightly optimize searching, until it was realized how small the trees
 * are: searches require at most maybe 4 checks in most cases.
 */
template<typename CharT, const shrink_sequence_tree<CharT> TreeBegin, size_t TreeSize>
constexpr bool is_simple() {
	static_assert(is_sorted<CharT, TreeBegin, TreeSize>(), "Tree must be pre-sorted");

	if constexpr (TreeSize != 0) {
		// Sorted tree: if the last entry is within 0x00-0x7F, so is every entry before it
		return TreeBegin[TreeSize - 1].first <= 0x7F;
	}
	else {
		return true;
	}
}
// Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed
template<typename CharT, const shrink_sequence_tree<CharT> SequenceTreeBegin, size_t SequenceTreeSize>
bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
@ -274,7 +309,7 @@ constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) {
auto decode = decode_codepoint(read_view); // TODO: make constexpr
constexpr auto SubTreeEnd = SubTreeBegin + SubTreeSize;
constexpr shrink_sequence_tree_member<CharT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
if (parser == SubTreeEnd || parser->first != decode.codepoint) {
if constexpr (FailNotFound) {
@ -299,27 +334,9 @@ constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
} };
}
/**
 * Verifies that a subtree's entries are ordered with lessers on the left: no entry's `first` may be greater than the
 * `first` of the entry that follows it. Empty and single-entry subtrees are considered sorted.
 *
 * @tparam SubTreeBegin Start of the subtree to check
 * @tparam SubTreeSize Number of entries in the subtree
 * @return true if the subtree is in non-descending order, false otherwise
 */
template<typename CharT, const shrink_sequence_tree<CharT> SubTreeBegin, size_t SubTreeSize>
constexpr bool is_sorted() {
	constexpr auto subtree_end = SubTreeBegin + SubTreeSize;

	// Nothing to compare in an empty subtree
	if (SubTreeBegin == subtree_end) {
		return true;
	}

	// Compare every entry against its right-hand neighbor
	for (auto current = SubTreeBegin; current + 1 != subtree_end; ++current) {
		const auto neighbor = current + 1;
		if (current->first > neighbor->first) {
			return false;
		}
	}

	return true;
}
/**
* C++ escape sequence parser
*/
template<typename CharT>
static constexpr shrink_sequence_tree<CharT> cpp_escapes_main_tree{
@ -378,4 +395,13 @@ bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
return apply_shrink_sequence_tree<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(inout_string);
}
/**
* Query string escape sequence parser
*
* Root escape tree for query strings, holding two entries keyed on U'%' and U'+':
*  - U'%' is built by make_hex_sequence_pair with a count of 2 (presumably "%XX", i.e. two hex digits; the meaning
*    of the two trailing `false` arguments is not visible here -- TODO confirm against make_hex_sequence_pair)
*  - U'+' is built by make_simple_sequence_pair with ' ' (presumably replacing '+' with a space -- TODO confirm)
*
* NOTE(review): keep the entries ordered by codepoint (U'%' is U+0025, U'+' is U+002B); trees in this file are
* expected to be pre-sorted and are searched with std::lower_bound.
*/
static constexpr shrink_sequence_tree<char8_t> http_query_escapes_root_tree{
make_hex_sequence_pair<char8_t, U'%', 2, false, false>(),
make_simple_sequence_pair<char8_t, U'+', ' '>()
};
} // namespace jessilib

Loading…
Cancel
Save