From e3142da3298c735a4d339de671d230f9263c5dcd Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sun, 5 Dec 2021 16:56:45 -0600 Subject: [PATCH] Initial implementation for syntax_tree, with deserialize_html_form as a prototype; add container::push; split query_string stuff to http_query.hpp --- src/include/jessilib/http_query.hpp | 190 ++++++++++ src/include/jessilib/object.hpp | 29 +- src/include/jessilib/type_traits.hpp | 143 ++++++- src/include/jessilib/unicode.hpp | 415 +-------------------- src/include/jessilib/unicode_base.hpp | 56 +++ src/include/jessilib/unicode_compare.hpp | 430 ++++++++++++++++++++++ src/include/jessilib/unicode_sequence.hpp | 147 +++----- src/include/jessilib/unicode_syntax.hpp | 139 +++++++ src/test/CMakeLists.txt | 2 +- src/test/http_query.cpp | 238 ++++++++++++ src/test/unicode.cpp | 6 + src/test/unicode_sequence.cpp | 137 +------ 12 files changed, 1281 insertions(+), 651 deletions(-) create mode 100644 src/include/jessilib/http_query.hpp create mode 100644 src/include/jessilib/unicode_compare.hpp create mode 100644 src/include/jessilib/unicode_syntax.hpp create mode 100644 src/test/http_query.cpp diff --git a/src/include/jessilib/http_query.hpp b/src/include/jessilib/http_query.hpp new file mode 100644 index 0000000..b60f5c3 --- /dev/null +++ b/src/include/jessilib/http_query.hpp @@ -0,0 +1,190 @@ +/** + * Copyright (C) 2021 Jessica James. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Written by Jessica James + */ + +/** + * @file http_query.hpp + * @author Jessica James + * + * HTTP query string and HTML form deserialization utilities + */ + +#pragma once + +#include "unicode_syntax.hpp" +#include "unicode_sequence.hpp" +#include "type_traits.hpp" + +namespace jessilib { + +/** + * Query string escape sequence parser + */ + +template* = nullptr> // make_hex_sequence_pair isn't going to play well with other types +static constexpr shrink_sequence_tree http_query_escapes_root_tree{ + make_hex_sequence_pair(), + make_simple_sequence_pair() +}; +static_assert(is_sorted, std::size(http_query_escapes_root_tree)>(), "Tree must be pre-sorted"); +static_assert(is_sorted, std::size(http_query_escapes_root_tree)>(), "Tree must be pre-sorted"); + +template* = nullptr> +constexpr bool deserialize_http_query(std::basic_string& inout_string) { + return apply_shrink_sequence_tree, std::size(http_query_escapes_root_tree)>(inout_string); +} + +// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement +// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in +// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence) +/*template* = nullptr> +bool deserialize_http_query(std::basic_string& inout_string) { + //TODO: optimize this? 
+ std::basic_string u8query_string = string_cast(inout_string); + bool result = deserialize_http_query(u8query_string); + inout_string = string_cast(u8query_string); + return result; +}*/ + +/** + * HTML form parser + */ + +template +struct HTMLFormContext { + using container_type = ContainerT; + container_type& out_container; + CharT* write_head; + const CharT* key_start = write_head; + const CharT* value_start{}; // value_start is also key_end +}; + +template +constexpr syntax_tree_member make_value_start_pair() { + // '=' + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view&) constexpr { + if (inout_context.value_start != nullptr) { + // There's already a value pending; this must just be part of the value. + inout_context.write_head += encode_codepoint(inout_context.write_head, InCodepointV); + return true; + } + + // Start pending_value + inout_context.value_start = inout_context.write_head; + return true; + } }; +} + +template +constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view&) { + const CharT* value_end = inout_context.write_head; + const CharT* key_start = inout_context.key_start; + const CharT* value_start = inout_context.value_start; + if (value_start != nullptr) { + // Terminate key & value, push them to table + std::basic_string_view key{ key_start, static_cast(value_start - key_start) }; + std::basic_string_view value{ value_start, static_cast(value_end - value_start) }; + jessilib::container::push(inout_context.out_container, key, value); + + // Start reading next key + inout_context.key_start = value_end; + inout_context.value_start = nullptr; + return true; + } + + // This is a valueless key; terminate the key and push it + std::basic_string_view key{ key_start, static_cast(value_end - key_start) }; + jessilib::container::push(inout_context.out_container, key, std::basic_string_view{}); + + // Start reading next key + inout_context.key_start = value_end; + return true; +} + +template +constexpr 
syntax_tree_member make_value_end_pair() { + // '&' + return { InCodepointV, value_end_action }; +} + +template +constexpr syntax_tree_member make_hex_syntax_shrink_pair() { + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr { + return hex_shrink_sequence_action(inout_context.write_head, inout_read_view); + } }; +} + +template +constexpr syntax_tree_member make_simple_shrink_pair() { + return { + InCodepointV, + [](ContextT& inout_context, std::basic_string_view&) constexpr { + *inout_context.write_head = static_cast(OutCodepointV); + ++inout_context.write_head; + return true; + } + }; +} + +template +bool html_form_default_action(get_endpoint_result decode, ContextT& inout_context, std::basic_string_view& inout_read_view) { + // A regular character; copy it and advance the read/write heads + CharT*& write_head = inout_context.write_head; + CharT* write_end = write_head + decode.units; + while (write_head != write_end) { + *write_head = inout_read_view.front(); + ++write_head; + inout_read_view.remove_prefix(1); + } + + return true; +} + +template* = nullptr> // make_hex_sequence_pair isn't going to play well with other types +static constexpr syntax_tree> html_form_root_tree{ + make_hex_syntax_shrink_pair, U'%', 2, true, false>(), + make_value_end_pair, U'&'>(), + make_simple_shrink_pair, U'+', ' '>(), + make_value_start_pair, U'='>() +}; + +template* = nullptr> +constexpr bool deserialize_html_form(ContainerT& out_container, std::basic_string& inout_string) { + if (inout_string.empty()) { + return true; // even though apply_syntax_tree checks for this, check it here anyways so we don't call value_end_action + } + + HTMLFormContext context{ out_container, inout_string.data() }; + constexpr auto& html_form_tree = html_form_root_tree; + static_assert(is_sorted(), "Tree must be pre-sorted"); + + std::basic_string_view read_view{ inout_string }; + if (apply_syntax_tree + (context, read_view)) { + 
value_end_action(context, read_view); + return true; + } + + return false; +} + +} // namespace jessilib diff --git a/src/include/jessilib/object.hpp b/src/include/jessilib/object.hpp index 3cf625f..32cce7c 100644 --- a/src/include/jessilib/object.hpp +++ b/src/include/jessilib/object.hpp @@ -101,7 +101,7 @@ public: template::type>::value && !is_sequence_container::type>::value - && (!is_associative_container::type>::value || std::is_same::type, map_type>::value)>::type* = nullptr> + && (!is_associative_container::type>::value || std::is_same::type, map_type>::value)>::type* = nullptr> object(T&& in_value) : m_value{ typename is_backing::type>::type{ std::forward(in_value) } } { // Empty ctor body @@ -140,10 +140,10 @@ public: // Non-map_type associative containers (container) template::type>::value - && (std::is_convertible::type>::key_type, string_type>::value - || std::is_convertible::type>::key_type, string_view_type>::value) - && !std::is_same::type>::value_type, object>::value>::type* = nullptr> + typename std::enable_if::type>::value + && (std::is_convertible::type>::key_type, string_type>::value + || std::is_convertible::type>::key_type, string_view_type>::value) + && !std::is_same::type>::value_type, object>::value>::type* = nullptr> object(T&& in_value) : m_value{ map_type{} } { auto& map = std::get(m_value); @@ -474,6 +474,25 @@ private: // represent as a map, whereas an actual xml document is sequenced }; // object +namespace container { + +template>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + auto object_type = inout_container.type(); + if (object_type == object::type::null || object_type == object::type::map) { + // Push to map if null or map type + inout_container[in_key] = in_value; + } + else if (object_type == object::type::array) { + // Push to back of array if array type + inout_container[inout_container.size()][in_key] = in_value; + } + // else // do nothing; pushing a key/value pair 
isn't valid here +} + +} // namespace container + } // namespace jessilib diff --git a/src/include/jessilib/type_traits.hpp b/src/include/jessilib/type_traits.hpp index 85fd80d..223deea 100644 --- a/src/include/jessilib/type_traits.hpp +++ b/src/include/jessilib/type_traits.hpp @@ -20,9 +20,14 @@ #include #include + +// Container types we're using, more or less purely because we can't forward declare these at all #include #include #include +#include +#include +#include #include #include #include @@ -32,13 +37,6 @@ namespace jessilib { -/** remove_cvref (can be replaced with C++20) */ - -template -struct remove_cvref { - typedef std::remove_cv_t> type; -}; - /** is_basic_string */ template @@ -65,6 +63,20 @@ struct is_basic_string_view> { constexpr bool operator()() const noexcept { return true; } }; +/** is_pair */ + +template +struct is_pair : std::false_type {}; + +template +struct is_pair> { + using first_type = LeftT; + using second_type = RightT; + static constexpr bool value{ true }; + constexpr operator bool() const noexcept { return true; } + constexpr bool operator()() const noexcept { return true; } +}; + /** is_vector */ template @@ -104,6 +116,45 @@ struct is_forward_list> { constexpr bool operator()() const noexcept { return true; } }; +/** is_stack */ + +template +struct is_stack : std::false_type {}; + +template +struct is_stack> { + using type = T; + static constexpr bool value{ true }; + constexpr operator bool() const noexcept { return true; } + constexpr bool operator()() const noexcept { return true; } +}; + +/** is_queue */ + +template +struct is_queue : std::false_type {}; + +template +struct is_queue> { + using type = T; + static constexpr bool value{ true }; + constexpr operator bool() const noexcept { return true; } + constexpr bool operator()() const noexcept { return true; } +}; + +/** is_deque */ + +template +struct is_deque : std::false_type {}; + +template +struct is_deque> { + using type = T; + static constexpr bool value{ true }; + 
constexpr operator bool() const noexcept { return true; } + constexpr bool operator()() const noexcept { return true; } +}; + /** is_set */ template @@ -170,6 +221,20 @@ struct is_map> { constexpr bool operator()() const noexcept { return true; } }; +/** is_multimap */ + +template +struct is_multimap : std::false_type {}; + +template +struct is_multimap> { + using key_type = KeyT; + using value_type = ValueT; + static constexpr bool value{ true }; + constexpr operator bool() const noexcept { return true; } + constexpr bool operator()() const noexcept { return true; } +}; + /** is_unordered_map */ template @@ -184,6 +249,18 @@ struct is_unordered_map> { constexpr bool operator()() const noexcept { return true; } }; +template +struct is_unordered_multimap : std::false_type {}; + +template +struct is_unordered_multimap> { + using key_type = KeyT; + using value_type = ValueT; + static constexpr bool value{ true }; + constexpr operator bool() const noexcept { return true; } + constexpr bool operator()() const noexcept { return true; } +}; + /** is_associative_container */ template @@ -236,6 +313,7 @@ struct is_sequence_container> { constexpr bool operator()() const noexcept { return true; } }; +// Sets are really associative containers, not sequence... 
template struct is_sequence_container> { using type = T; @@ -268,4 +346,55 @@ struct is_sequence_container> { constexpr bool operator()() const noexcept { return true; } }; +/** + * Push helper for pushing key/value pairs to arbitrary container types + * + * If ContainerT is associative: set key/value + * If ContainerT is multi-associative: add key/value + * If ContainerT is sequential: push key/value pair to back + */ +namespace container { +/** Pushing to associative containers */ +template::value || is_unordered_map::value>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + inout_container[in_key] = in_value; +} + +template::value || is_unordered_set::value>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + auto insert_result = inout_container.insert({in_key, in_value}); + if (!insert_result) { + *insert_result.first = { in_key, in_value }; + } +} + +template::value || is_unordered_multimap::value + || is_multiset::value || is_unordered_multiset::value>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + inout_container.insert({in_key, in_value}); +} + +/** Pushing to sequential containers */ +template::value || is_list::value || is_deque::value>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + inout_container.push_back({in_key, in_value}); +} + +template::value || is_stack::value>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + inout_container.push({in_key, in_value}); +} + +template::value>* = nullptr> +constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) { + inout_container.push_front({in_key, in_value}); +} + +} // namespace container } // namespace jessilib diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index 3ab4818..25aadab 100644 --- 
a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -18,10 +18,7 @@ #pragma once -#include -#include -#include -#include "unicode_base.hpp" +#include "unicode_compare.hpp" namespace jessilib { @@ -179,246 +176,6 @@ std::basic_string string_cast(const InT& in_string) { } } -/** - * Checks if two codepoints are equal to each-other (case insensitive) - * - * @param lhs First codepoint to compare - * @param rhs Second codepoint to compare - * @return True if the characters are equal, false otherwise - */ -inline bool equalsi(char32_t lhs, char32_t rhs) { - return lhs == rhs - || fold(lhs) == fold(rhs); -} - -// Should just make these methods container-type agnostic rather than this mess... -#define ADAPT_BASIC_STRING(method) \ - template \ - auto method(const std::basic_string& lhs, std::basic_string_view rhs) { \ - return method(static_cast>(lhs), rhs); } \ - template \ - auto method(std::basic_string_view lhs, const std::basic_string& rhs) { \ - return method(lhs, static_cast>(rhs)); } \ - template \ - auto method(const std::basic_string& lhs, const std::basic_string& rhs) { \ - return method(static_cast>(lhs), static_cast>(rhs)); } - -/** - * Checks if two strings are equal - * - * @tparam LhsCharT Character type for left-hand parameter - * @tparam RhsCharT Character type for right-hand parameter - * @param lhs First string to compare - * @param rhs Second string to compare against - * @return True if the strings are equal, false otherwise - */ -template -bool equals(std::basic_string_view lhs, std::basic_string_view rhs) { - // If lhs and rhs are the same type, compare their sizes and quickly return if not same - if constexpr (std::is_same_v) { - return lhs == rhs; - } - - while (!lhs.empty() && !rhs.empty()) { - auto lhs_front = decode_codepoint(lhs); - auto rhs_front = decode_codepoint(rhs); - - if (lhs_front.units == 0 - || rhs_front.units == 0) { - // Failed to decode front codepoint; bad unicode sequence - return false; - } - - if 
(lhs_front.codepoint != rhs_front.codepoint) { - // Codepoints aren't the same - return false; - } - - // Codepoints are equal; trim off the fronts and continue - lhs.remove_prefix(lhs_front.units); - rhs.remove_prefix(rhs_front.units); - } - - return lhs.empty() && rhs.empty(); -} - -ADAPT_BASIC_STRING(equals) - -/** - * Checks if two strings are equal (case insensitive) - * - * @tparam LhsCharT Character type for left-hand parameter - * @tparam RhsCharT Character type for right-hand parameter - * @param lhs First string to compare - * @param rhs Second string to compare against - * @return True if the strings are equal, false otherwise - */ -template -bool equalsi(std::basic_string_view lhs, std::basic_string_view rhs) { - // If lhs and rhs are the same type, compare their sizes and quickly return if not same - if constexpr (std::is_same_v) { - if (lhs.size() != rhs.size()) { - return false; - } - } - - while (!lhs.empty() && !rhs.empty()) { - auto lhs_front = decode_codepoint(lhs); - auto rhs_front = decode_codepoint(rhs); - - if (lhs_front.units == 0 - || rhs_front.units == 0) { - // Failed to decode front codepoint; bad unicode sequence - return false; - } - - if (!equalsi(lhs_front.codepoint, rhs_front.codepoint)) { - // Codepoints don't fold to same value - return false; - } - - // Codepoints are equal; trim off the fronts and continue - lhs.remove_prefix(lhs_front.units); - rhs.remove_prefix(rhs_front.units); - } - - return lhs.empty() && rhs.empty(); -} - -ADAPT_BASIC_STRING(equalsi) - -/** - * Checks if a string starts with a substring - * - * @tparam LhsCharT Character type for underlying string - * @tparam RhsCharT Character type for prefix string - * @param in_string String to check for prefix - * @param in_prefix Substring prefix to check for - * @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise - */ -template -size_t starts_with_length(std::basic_string_view in_string, std::basic_string_view 
in_prefix) { - // If in_string and in_prefix are the same type, compare their sizes and quickly return if in_string is too small - if constexpr (std::is_same_v) { - if (in_string.size() < in_prefix.size()) { - return 0; - } - } - - size_t codepoints_removed{}; - while (!in_string.empty() && !in_prefix.empty()) { - get_endpoint_result string_front = decode_codepoint(in_string); - get_endpoint_result prefix_front = decode_codepoint(in_prefix); - - if (string_front.units == 0 - || prefix_front.units == 0) { - // Failed to decode front codepoint; bad unicode sequence - return 0; - } - - if (string_front.codepoint != prefix_front.codepoint) { - // Codepoints aren't the same - return 0; - } - - // Codepoints are equal; trim off the fronts and continue - in_string.remove_prefix(string_front.units); - in_prefix.remove_prefix(prefix_front.units); - codepoints_removed += string_front.units; - } - - if (!in_prefix.empty()) { - // We reached end of in_string before end of prefix - return 0; - } - - return codepoints_removed; -} - -ADAPT_BASIC_STRING(starts_with_length) - -/** - * Checks if a string starts with a substring (case insensitive) - * - * @tparam LhsCharT Character type for underlying string - * @tparam RhsCharT Character type for prefix string - * @param in_string String to check for prefix - * @param in_prefix Substring prefix to check for - * @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise - */ -template -size_t starts_with_lengthi(std::basic_string_view in_string, std::basic_string_view in_prefix) { - // If in_string and in_prefix are the same type, skip decoding each point - if constexpr (std::is_same_v) { - if (in_string.size() < in_prefix.size()) { - return 0; - } - } - - size_t codepoints_removed{}; - while (!in_string.empty() && !in_prefix.empty()) { - get_endpoint_result string_front = decode_codepoint(in_string); - get_endpoint_result prefix_front = decode_codepoint(in_prefix); - - if 
(string_front.units == 0 - || prefix_front.units == 0) { - // Failed to decode front codepoint; bad unicode sequence - return 0; - } - - if (!equalsi(string_front.codepoint, prefix_front.codepoint)) { - // Codepoints don't fold to same value - return 0; - } - - // Codepoints are equal; trim off the fronts and continue - in_string.remove_prefix(string_front.units); - in_prefix.remove_prefix(prefix_front.units); - codepoints_removed += string_front.units; - } - - if (!in_prefix.empty()) { - // We reached end of in_string before end of prefix - return 0; - } - - return codepoints_removed; -} - -ADAPT_BASIC_STRING(starts_with_lengthi) - -/** - * Checks if a string starts with a substring - * - * @tparam LhsCharT Character type for underlying string - * @tparam RhsCharT Character type for prefix string - * @param in_string String to check for prefix - * @param in_prefix Prefix to check for - * @return True if both strings are valid and in_string starts with in_prefix, false otherwise - */ -template -bool starts_with(std::basic_string_view in_string, std::basic_string_view in_prefix) { - return starts_with_length(in_string, in_prefix) != 0; -} - -ADAPT_BASIC_STRING(starts_with) - -/** - * Checks if a string starts with a substring (case insensitive) - * - * @tparam LhsCharT Character type for underlying string - * @tparam RhsCharT Character type for prefix string - * @param in_string String to check for prefix - * @param in_prefix Prefix to check for - * @return True if both strings are valid and in_string starts with in_prefix, false otherwise - */ -template -bool starts_withi(std::basic_string_view in_string, std::basic_string_view in_prefix) { - return starts_with_lengthi(in_string, in_prefix) != 0; -} - -ADAPT_BASIC_STRING(starts_withi) - /** * Searches a string for a specified substring * @@ -651,176 +408,6 @@ OutT join(ArgsT&&... 
args) { return result; } -/** - * Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash - * regardless of underlying encoding - * - * This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text - */ -struct text_hash { - using is_transparent = std::true_type; - - template - static uint64_t hash(const CharT* data, const CharT* end) { - uint64_t hash = 14695981039346656037ULL; - - get_endpoint_result decode; - while (data != end) { - decode = decode_codepoint(data, end); - if (decode.units == 0) { - return hash; - } - - hash = hash ^ decode.codepoint; - hash = hash * 1099511628211ULL; - data += decode.units; - } - - return hash; - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } -}; - -struct text_equal { - using is_transparent = std::true_type; - - template - bool 
operator()(std::basic_string_view in_lhs, std::basic_string_view in_rhs) const noexcept { - return equals(in_lhs, in_rhs); - } - - template - bool operator()(std::basic_string_view in_lhs, const std::basic_string& in_rhs) const noexcept { - return equals(in_lhs, in_rhs); - } - - template - bool operator()(const std::basic_string& in_lhs, std::basic_string_view in_rhs) const noexcept { - return equals(in_lhs, in_rhs); - } - - template - bool operator()(const std::basic_string& in_lhs, const std::basic_string& in_rhs) const noexcept { - return equals(in_lhs, in_rhs); - } -}; - -/** - * Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the - * same hash regardless of underlying encoding or the casing of its values. - * - * This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text - */ -struct text_hashi { - using is_transparent = std::true_type; - - template - static uint64_t hash(const CharT* data, const CharT* end) { - uint64_t hash = 14695981039346656037ULL; - - get_endpoint_result decode; - while (data != end) { - decode = decode_codepoint(data, end - data); - if (decode.units == 0) { - return hash; - } - - hash = hash ^ fold(decode.codepoint); - hash = hash * 1099511628211ULL; - data += decode.units; - } - - return hash; - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(const std::basic_string& in_key) const noexcept { 
// ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 - return hash(in_key.data(), in_key.data() + in_key.size()); - } - - auto operator()(std::basic_string_view in_key) const noexcept { - return hash(in_key.data(), in_key.data() + in_key.size()); - } -}; - -struct text_equali { - using is_transparent = std::true_type; - - template - bool operator()(std::basic_string_view in_lhs, std::basic_string_view in_rhs) const noexcept { - return equalsi(in_lhs, in_rhs); - } - - template - bool operator()(std::basic_string_view in_lhs, const std::basic_string& in_rhs) const noexcept { - return equalsi(in_lhs, in_rhs); - } - - template - bool operator()(const std::basic_string& in_lhs, std::basic_string_view in_rhs) const noexcept { - return equalsi(in_lhs, in_rhs); - } - - template - bool operator()(const std::basic_string& in_lhs, const std::basic_string& in_rhs) const noexcept { - return equalsi(in_lhs, in_rhs); - } -}; - /** to_lower / to_upper */ //char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement diff --git a/src/include/jessilib/unicode_base.hpp b/src/include/jessilib/unicode_base.hpp index 30c1e3b..034c7a0 100644 --- a/src/include/jessilib/unicode_base.hpp +++ b/src/include/jessilib/unicode_base.hpp @@ -167,6 +167,62 @@ using encode_buffer_type = CharT[unicode_traits::max_units_per_codepoint] char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output) constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise +/** + * Checks if two codepoints are equal to each-other (case insensitive) + * + * @param lhs First codepoint to compare + 
* @param rhs Second codepoint to compare + * @return True if the characters are equal, false otherwise + */ +inline bool equalsi(char32_t lhs, char32_t rhs) { + return lhs == rhs + || fold(lhs) == fold(rhs); +} + +template +struct codepoint_info { +private: + template + static constexpr size_t encode_codepoint_length(char32_t in_codepoint) { + encode_buffer_type buffer{}; + return encode_codepoint(buffer, in_codepoint); + } + +public: + static constexpr char32_t value = InCodepointV; + template + static constexpr size_t encode_length = encode_codepoint_length(InCodepointV); + + template + using encode_buffer = CharT[encode_length]; + + static constexpr size_t utf8_length = encode_length; + static constexpr size_t utf16_length = encode_length; + static constexpr size_t utf32_length = encode_length; + static constexpr size_t wchar_length = encode_length; + + using utf8_buffer = char8_t[utf8_length]; + using utf16_buffer = char16_t[utf16_length]; + using utf32_buffer = char32_t[utf32_length]; + using wchar_buffer = wchar_t[wchar_length]; + + static constexpr void encode(utf8_buffer& buffer) { + encode_codepoint(buffer, InCodepointV); + } + + static constexpr void encode(utf16_buffer& buffer) { + encode_codepoint(buffer, InCodepointV); + } + + static constexpr void encode(utf32_buffer& buffer) { + encode_codepoint(buffer, InCodepointV); + } + + static constexpr void encode(wchar_buffer& buffer) { + encode_codepoint(buffer, InCodepointV); + } +}; + /** * Inline constexpr encode implementation */ diff --git a/src/include/jessilib/unicode_compare.hpp b/src/include/jessilib/unicode_compare.hpp new file mode 100644 index 0000000..7a68e30 --- /dev/null +++ b/src/include/jessilib/unicode_compare.hpp @@ -0,0 +1,430 @@ +/** + * Copyright (C) 2021 Jessica James. 
+ * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Written by Jessica James + */ + +/** + * @file unicode_compare.hpp + * @author Jessica James + * + * Unicode-aware encoding-agnostic string comparison & hashing utilities + */ + +#pragma once + +#include "unicode_base.hpp" + +namespace jessilib { + +// Should just make these methods container-type agnostic rather than this mess... 
+#define ADAPT_BASIC_STRING(method) \ + template \ + auto method(const std::basic_string& lhs, std::basic_string_view rhs) { \ + return method(static_cast>(lhs), rhs); } \ + template \ + auto method(std::basic_string_view lhs, const std::basic_string& rhs) { \ + return method(lhs, static_cast>(rhs)); } \ + template \ + auto method(const std::basic_string& lhs, const std::basic_string& rhs) { \ + return method(static_cast>(lhs), static_cast>(rhs)); } + +/** + * Checks if two strings are equal + * + * @tparam LhsCharT Character type for left-hand parameter + * @tparam RhsCharT Character type for right-hand parameter + * @param lhs First string to compare + * @param rhs Second string to compare against + * @return True if the strings are equal, false otherwise + */ +template +bool equals(std::basic_string_view lhs, std::basic_string_view rhs) { + // If lhs and rhs are the same type, compare the views directly; no codepoint decoding is needed + if constexpr (std::is_same_v) { + return lhs == rhs; + } + + while (!lhs.empty() && !rhs.empty()) { + auto lhs_front = decode_codepoint(lhs); + auto rhs_front = decode_codepoint(rhs); + + if (lhs_front.units == 0 + || rhs_front.units == 0) { + // Failed to decode front codepoint; bad unicode sequence + return false; + } + + if (lhs_front.codepoint != rhs_front.codepoint) { + // Codepoints aren't the same + return false; + } + + // Codepoints are equal; trim off the fronts and continue + lhs.remove_prefix(lhs_front.units); + rhs.remove_prefix(rhs_front.units); + } + + return lhs.empty() && rhs.empty(); +} + +ADAPT_BASIC_STRING(equals) + +/** + * Checks if two strings are equal (case insensitive) + * + * @tparam LhsCharT Character type for left-hand parameter + * @tparam RhsCharT Character type for right-hand parameter + * @param lhs First string to compare + * @param rhs Second string to compare against + * @return True if the strings are equal, false otherwise + */ +template +bool equalsi(std::basic_string_view lhs, 
std::basic_string_view rhs) { + // If lhs and rhs are the same type, compare their sizes and quickly return if not same + if constexpr (std::is_same_v) { + if (lhs.size() != rhs.size()) { + return false; + } + } + + while (!lhs.empty() && !rhs.empty()) { + auto lhs_front = decode_codepoint(lhs); + auto rhs_front = decode_codepoint(rhs); + + if (lhs_front.units == 0 + || rhs_front.units == 0) { + // Failed to decode front codepoint; bad unicode sequence + return false; + } + + if (!equalsi(lhs_front.codepoint, rhs_front.codepoint)) { + // Codepoints don't fold to same value + return false; + } + + // Codepoints are equal; trim off the fronts and continue + lhs.remove_prefix(lhs_front.units); + rhs.remove_prefix(rhs_front.units); + } + + return lhs.empty() && rhs.empty(); +} + +ADAPT_BASIC_STRING(equalsi) + +/** + * Checks if a string starts with a substring + * + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT Character type for prefix string + * @param in_string String to check for prefix + * @param in_prefix Substring prefix to check for + * @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise + */ +template +size_t starts_with_length(std::basic_string_view in_string, std::basic_string_view in_prefix) { + // If in_string and in_prefix are the same type, compare their sizes and quickly return if in_string is too small + if constexpr (std::is_same_v) { + if (in_string.size() < in_prefix.size()) { + return 0; + } + } + + size_t codepoints_removed{}; + while (!in_string.empty() && !in_prefix.empty()) { + get_endpoint_result string_front = decode_codepoint(in_string); + get_endpoint_result prefix_front = decode_codepoint(in_prefix); + + if (string_front.units == 0 + || prefix_front.units == 0) { + // Failed to decode front codepoint; bad unicode sequence + return 0; + } + + if (string_front.codepoint != prefix_front.codepoint) { + // Codepoints aren't the same + return 0; + } + + // 
Codepoints are equal; trim off the fronts and continue + in_string.remove_prefix(string_front.units); + in_prefix.remove_prefix(prefix_front.units); + codepoints_removed += string_front.units; + } + + if (!in_prefix.empty()) { + // We reached end of in_string before end of prefix + return 0; + } + + return codepoints_removed; +} + +ADAPT_BASIC_STRING(starts_with_length) + +/** + * Checks if a string starts with a substring (case insensitive) + * + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT Character type for prefix string + * @param in_string String to check for prefix + * @param in_prefix Substring prefix to check for + * @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise + */ +template +size_t starts_with_lengthi(std::basic_string_view in_string, std::basic_string_view in_prefix) { + // If in_string and in_prefix are the same type, skip decoding each point + if constexpr (std::is_same_v) { + if (in_string.size() < in_prefix.size()) { + return 0; + } + } + + size_t codepoints_removed{}; + while (!in_string.empty() && !in_prefix.empty()) { + get_endpoint_result string_front = decode_codepoint(in_string); + get_endpoint_result prefix_front = decode_codepoint(in_prefix); + + if (string_front.units == 0 + || prefix_front.units == 0) { + // Failed to decode front codepoint; bad unicode sequence + return 0; + } + + if (!equalsi(string_front.codepoint, prefix_front.codepoint)) { + // Codepoints don't fold to same value + return 0; + } + + // Codepoints are equal; trim off the fronts and continue + in_string.remove_prefix(string_front.units); + in_prefix.remove_prefix(prefix_front.units); + codepoints_removed += string_front.units; + } + + if (!in_prefix.empty()) { + // We reached end of in_string before end of prefix + return 0; + } + + return codepoints_removed; +} + +ADAPT_BASIC_STRING(starts_with_lengthi) + +/** + * Checks if a string starts with a substring + * + * @tparam LhsCharT 
Character type for underlying string + * @tparam RhsCharT Character type for prefix string + * @param in_string String to check for prefix + * @param in_prefix Prefix to check for + * @return True if both strings are valid and in_string starts with in_prefix, false otherwise + */ +template +bool starts_with(std::basic_string_view in_string, std::basic_string_view in_prefix) { + return starts_with_length(in_string, in_prefix) != 0; +} + +ADAPT_BASIC_STRING(starts_with) + +/** + * Checks if a string starts with a substring (case insensitive) + * + * @tparam LhsCharT Character type for underlying string + * @tparam RhsCharT Character type for prefix string + * @param in_string String to check for prefix + * @param in_prefix Prefix to check for + * @return True if both strings are valid and in_string starts with in_prefix, false otherwise (an empty in_prefix also yields false, as it does for starts_with, because the matched length is then 0) + */ +template +bool starts_withi(std::basic_string_view in_string, std::basic_string_view in_prefix) { + return starts_with_lengthi(in_string, in_prefix) != 0; +} + +ADAPT_BASIC_STRING(starts_withi) + +/** + * Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash + * regardless of underlying encoding + * + * This is not intended for generating hashes of arbitrary data; it's specifically intended for strings of text. Each decoded codepoint is mixed in FNV-1a style (xor, then multiply by the 64-bit FNV prime, starting from the 64-bit FNV offset basis), and hashing stops early, returning the value so far, at the first sequence that fails to decode. NOTE(review): hash() here calls decode_codepoint(data, end) while text_hashi::hash calls decode_codepoint(data, end - data); confirm both overloads exist and agree + */ +struct text_hash { + using is_transparent = std::true_type; + + template + static uint64_t hash(const CharT* data, const CharT* end) { + uint64_t hash = 14695981039346656037ULL; + + get_endpoint_result decode; + while (data != end) { + decode = decode_codepoint(data, end); + if (decode.units == 0) { + return hash; + } + + hash = hash ^ decode.codepoint; + hash = hash * 1099511628211ULL; + data += decode.units; + } + + return hash; + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + 
return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } +}; + +struct text_equal { + using is_transparent = std::true_type; + + template + bool operator()(std::basic_string_view in_lhs, std::basic_string_view in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } + + template + bool operator()(std::basic_string_view in_lhs, const std::basic_string& in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, std::basic_string_view in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, const std::basic_string& in_rhs) const noexcept { + return equals(in_lhs, in_rhs); + } +}; + +/** + * Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the + * same hash regardless of underlying encoding or the casing of its values. 
+ * + * This is not intended for generating hashes of arbitrary data; it's specifically intended for strings of text + */ +struct text_hashi { + using is_transparent = std::true_type; + + template + static uint64_t hash(const CharT* data, const CharT* end) { + uint64_t hash = 14695981039346656037ULL; + + get_endpoint_result decode; + while (data != end) { + decode = decode_codepoint(data, end - data); + if (decode.units == 0) { + return hash; + } + + hash = hash ^ fold(decode.codepoint); + hash = hash * 1099511628211ULL; + data += decode.units; + } + + return hash; + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(const std::basic_string& in_key) const noexcept { // ASSUMES UTF-8 + return hash(in_key.data(), in_key.data() + in_key.size()); + } + + auto operator()(std::basic_string_view in_key) const noexcept { + return hash(in_key.data(), in_key.data() + in_key.size()); + } +}; + +struct text_equali { + using is_transparent = std::true_type; + + template + bool operator()(std::basic_string_view in_lhs, std::basic_string_view in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } + + template + bool operator()(std::basic_string_view in_lhs, 
const std::basic_string& in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, std::basic_string_view in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } + + template + bool operator()(const std::basic_string& in_lhs, const std::basic_string& in_rhs) const noexcept { + return equalsi(in_lhs, in_rhs); + } +}; + +} // namespace jessilib diff --git a/src/include/jessilib/unicode_sequence.hpp b/src/include/jessilib/unicode_sequence.hpp index a912e66..34bc600 100644 --- a/src/include/jessilib/unicode_sequence.hpp +++ b/src/include/jessilib/unicode_sequence.hpp @@ -237,68 +237,71 @@ constexpr shrink_sequence_tree_member make_octal_sequence_pair() { }; } -template -constexpr shrink_sequence_tree_member make_hex_sequence_pair() { - static_assert(MaxDigitsV > 0); - - return { - InCodepointV, - [](CharT*& in_write_head, std::basic_string_view& read_view) constexpr { - // Does not modify - auto read_hex = [](uint32_t& out_value, std::basic_string_view in_view, size_t max_digits) { - size_t result{}; - int hex_value; - out_value = 0; - while (result != max_digits - && !in_view.empty()) { - hex_value = as_base(in_view.front(), 16); // hexadecimal characters are always 1 unit - if (hex_value < 0) { - // Not a hexadecimal character; push what we have and handle this - return result; - } - - out_value <<= 4; - out_value |= hex_value; - - in_view.remove_prefix(1); - ++result; - } - - // Number of elements that are hexadecimal digits +template +constexpr bool hex_shrink_sequence_action(CharT*& in_write_head, std::basic_string_view& read_view) { + // Does not modify + auto read_hex = [](uint32_t& out_value, std::basic_string_view in_view, size_t max_digits) constexpr { + size_t result{}; + int hex_value; + out_value = 0; + while (result != max_digits + && !in_view.empty()) { + hex_value = as_base(in_view.front(), 16); // hexadecimal characters are always 1 unit + if (hex_value < 0) { + // Not a 
hexadecimal character; push what we have and handle this return result; - }; + } - // Read in hex value - uint32_t hex_value; - size_t units_read = read_hex(hex_value, read_view, MaxDigitsV); + out_value <<= 4; + out_value |= hex_value; - // Sanity check digits read - if constexpr(ExactDigitsV) { - if (units_read != MaxDigitsV) { - // We expected example MaxDigitsV digits; fail - return false; - } - } - else { - if (units_read == 0) { - // We didn't read any digits; fail - return false; - } - } + in_view.remove_prefix(1); + ++result; + } - // We read an acceptable number of digits; write the unit and call it a day - read_view.remove_prefix(units_read); - if constexpr (IsUnicode) { - in_write_head += encode_codepoint(in_write_head, hex_value); - } - else { - static_assert(MaxDigitsV <= sizeof(CharT) * 2); - *in_write_head = static_cast(hex_value); - ++in_write_head; - } + // Number of elements that are hexadecimal digits + return result; + }; - return true; + // Read in hex value + uint32_t hex_value; + size_t units_read = read_hex(hex_value, read_view, MaxDigitsV); + + // Sanity check digits read + if constexpr(ExactDigitsV) { + if (units_read != MaxDigitsV) { + // We expected example MaxDigitsV digits; fail + return false; } + } + else { + if (units_read == 0) { + // We didn't read any digits; fail + return false; + } + } + + // We read an acceptable number of digits; write the unit and call it a day + read_view.remove_prefix(units_read); + if constexpr (IsUnicode) { + in_write_head += encode_codepoint(in_write_head, hex_value); + } + else { + static_assert(MaxDigitsV <= sizeof(CharT) * 2); + *in_write_head = static_cast(hex_value); + ++in_write_head; + } + + return true; +} + +template +constexpr shrink_sequence_tree_member make_hex_sequence_pair() { + static_assert(MaxDigitsV > 0); + + return { + InCodepointV, + hex_shrink_sequence_action }; } @@ -394,36 +397,4 @@ constexpr bool apply_cpp_escape_sequences(std::basic_string& inout_string return 
apply_shrink_sequence_tree, std::size(cpp_escapes_root_tree)>(inout_string); } -/** - * Query string escape sequence parser - */ - -template* = nullptr> // make_hex_sequence_pair isn't going to play well with other types -static constexpr shrink_sequence_tree http_query_escapes_root_tree{ - make_hex_sequence_pair(), - make_simple_sequence_pair() -}; -static_assert(is_sorted, std::size(http_query_escapes_root_tree)>(), "Tree must be pre-sorted"); -static_assert(is_sorted, std::size(http_query_escapes_root_tree)>(), "Tree must be pre-sorted"); - -template* = nullptr> -constexpr bool deserialize_http_query(std::basic_string& inout_string) { - return apply_shrink_sequence_tree, std::size(http_query_escapes_root_tree)>(inout_string); -} - -// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement -// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in -// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence) -/*template* = nullptr> -bool deserialize_http_query(std::basic_string& inout_string) { - //TODO: optimize this? - std::basic_string u8query_string = string_cast(inout_string); - bool result = deserialize_http_query(u8query_string); - inout_string = string_cast(u8query_string); - return result; -}*/ - } // namespace jessilib diff --git a/src/include/jessilib/unicode_syntax.hpp b/src/include/jessilib/unicode_syntax.hpp new file mode 100644 index 0000000..a37a7cd --- /dev/null +++ b/src/include/jessilib/unicode_syntax.hpp @@ -0,0 +1,139 @@ +/** + * Copyright (C) 2021 Jessica James. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Written by Jessica James + */ + +/** + * @file unicode_syntax.hpp + * @author Jessica James + * + * Unicode-aware syntax tree parsing utilities + */ + +#pragma once + +#include "unicode_base.hpp" + +namespace jessilib { + +/** + * Syntax tree; move this to another file later + */ + +template +using syntax_tree_action = bool(*)(ContextT& inout_context, std::basic_string_view& inout_read_view); + +template +using default_syntax_tree_action = bool(*)(get_endpoint_result in_codepoint, ContextT& inout_context, std::basic_string_view& inout_read_view); + +template +using syntax_tree = const std::pair>[]; + +template +using syntax_tree_member = const std::pair>; + +template +constexpr bool syntax_tree_member_compare(const syntax_tree_member& in_lhs, const char32_t in_rhs) { + return in_lhs.first < in_rhs; +} + +// Lessers on left: returns true when no member's codepoint (first) is greater than its successor's, so equal neighbours pass; an empty tree counts as sorted +template TreeBegin, size_t TreeSize> +constexpr bool is_sorted() { + auto head = TreeBegin; + constexpr auto end = TreeBegin + TreeSize; + + if (head == end) { + return true; + } + + while (head + 1 != end) { + const auto next = head + 1; + if (head->first > next->first) { + return false; + } + + ++head; + } + + return true; +} + +template +bool fail_action(get_endpoint_result, ContextT&, std::basic_string_view&) { + return false; +} + +template +bool noop_action(get_endpoint_result decode, ContextT&, std::basic_string_view& inout_read_view) { + inout_read_view.remove_prefix(decode.units); + return true; +} + +template 
SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action DefaultActionF = fail_action> +constexpr syntax_tree_member make_tree_pair() { + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr { + auto decode = decode_codepoint(inout_read_view); + if (decode.units == 0) { + return false; + } + + constexpr syntax_tree_member* SubTreeEnd = SubTreeBegin + SubTreeSize; + auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare); + if (parser == SubTreeEnd || parser->first != decode.codepoint) { + return DefaultActionF(decode, inout_context, inout_read_view); + } + + // This is a parsed sequence; pass it to the parser + inout_read_view.remove_prefix(decode.units); + return (parser->second)(inout_context, inout_read_view); + } }; +} + +template SequenceTreeBegin, size_t SequenceTreeSize, + default_syntax_tree_action DefaultActionF = noop_action> +constexpr bool apply_syntax_tree(ContextT& inout_context, std::basic_string_view& inout_read_view) { + if (inout_read_view.empty()) { + // Nothing to parse + return true; + } + + get_endpoint_result decode; + constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize; + while ((decode = decode_codepoint(inout_read_view)).units != 0) { + auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare); + if (parser == SubTreeEnd || parser->first != decode.codepoint) { + // Just a normal character; pass it to the default handler + if (!DefaultActionF(decode, inout_context, inout_read_view)) { + return false; + } + + continue; + } + + // This is a parsed sequence; pass it to the parser instead + inout_read_view.remove_prefix(decode.units); + if (!(parser->second)(inout_context, inout_read_view)) { + // Bad input received; give up + return false; + } + } + + // We've finished parsing successfully. NOTE(review): the loop also stops whenever decode_codepoint reports 0 units, so input that stops decoding part-way reaches this return too, whereas make_tree_pair returns false in that case; confirm that is intended + return true; +} + +} // namespace jessilib diff --git a/src/test/CMakeLists.txt 
b/src/test/CMakeLists.txt index f6175e3..9aad6ee 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -1,6 +1,6 @@ # Setup source files set(SOURCE_FILES - timer.cpp thread_pool.cpp util.cpp object.cpp parser.cpp config.cpp parsers/json.cpp unicode.cpp app_parameters.cpp io/color.cpp duration.cpp split.cpp split_compilation.cpp word_split.cpp unicode_sequence.cpp) + timer.cpp thread_pool.cpp util.cpp object.cpp parser.cpp config.cpp parsers/json.cpp unicode.cpp app_parameters.cpp io/color.cpp duration.cpp split.cpp split_compilation.cpp word_split.cpp unicode_sequence.cpp http_query.cpp) # Setup gtest set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) diff --git a/src/test/http_query.cpp b/src/test/http_query.cpp new file mode 100644 index 0000000..fd51b67 --- /dev/null +++ b/src/test/http_query.cpp @@ -0,0 +1,238 @@ +/** + * Copyright (C) 2021 Jessica James. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * Written by Jessica James + */ + +#include "jessilib/http_query.hpp" +#include +#include "test.hpp" + +using namespace std::literals; + +// Compile-time tests for constexpr on compilers which support C++20 constexpr std::string +#ifdef __cpp_lib_constexpr_string +constexpr std::string query_constexpr(std::string_view in_expression) { + std::string result{ in_expression }; + jessilib::deserialize_http_query(result); + return result; +} +static_assert(query_constexpr("test"s) == "test"s); +static_assert(query_constexpr("first+second"s) == "first second"s); +static_assert(query_constexpr("first%20second"s) == "first second"s); +#endif // __cpp_lib_constexpr_string + +using char_types = ::testing::Types; +using utf8_char_types = ::testing::Types; + +template +class QuerySequenceTest : public ::testing::Test { +public: +}; +TYPED_TEST_SUITE(QuerySequenceTest, utf8_char_types); + +constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing + +TYPED_TEST(QuerySequenceTest, single_chars) { + // [U+0000, U+100FF) + for (char32_t codepoint = 0; codepoint < MAX_LOOP_CODEPOINT; ++codepoint) { + std::basic_string expected; + size_t units = jessilib::encode_codepoint(expected, codepoint); + EXPECT_NE(units, 0); + EXPECT_EQ(units, expected.size()); + + // Construct the query string + std::basic_string query_string; + for (auto& unit : expected) { + char encoded[3] { '%', 0, 0 }; + char* encoded_end = encoded + sizeof(encoded); + auto to_chars_result = std::to_chars(encoded + 1, encoded_end, static_cast(unit), 16); + ASSERT_EQ(to_chars_result.ec, std::errc{}) // assertion will fail when `unit` is signed type + << "For unit " << static_cast(unit) << " in codepoint " << static_cast(codepoint) << std::endl; + + if (to_chars_result.ptr != encoded_end) { + // Only wrote one hex; shift it + encoded[2] = encoded[1]; + encoded[1] = '0'; + } + + EXPECT_EQ(encoded[0], '%'); + EXPECT_NE(encoded[1], 0); + EXPECT_NE(encoded[2], 0); + 
query_string.insert(query_string.end(), encoded, encoded_end); + } + EXPECT_EQ(query_string.size(), expected.size() * 3); + + // Decode & check the query string + jessilib::deserialize_http_query(query_string); + EXPECT_EQ(query_string, expected); + } +} + +TYPED_TEST(QuerySequenceTest, invalids) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFF; ++unit) { + TypeParam encoded[2] { '%', static_cast(unit) }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TYPED_TEST(QuerySequenceTest, invalids_2len) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFFFF; ++unit) { + TypeParam first = static_cast(unit >> 8); // order of these two doesn't matter + TypeParam second = static_cast(unit & 0xFF); + if (jessilib::as_base(first, 16) >= 0 + && jessilib::as_base(second, 16) >= 0) { + continue; + } + TypeParam encoded[3] { '%', static_cast(first), static_cast(second) }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TYPED_TEST(QuerySequenceTest, invalids_trailing) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFF; ++unit) { + TypeParam encoded[3] { '%', static_cast(unit), '%' }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, 
encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TYPED_TEST(QuerySequenceTest, invalids_2len_trailing) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFFFF; ++unit) { + TypeParam first = static_cast(unit >> 8); // order of these two doesn't matter + TypeParam second = static_cast(unit & 0xFF); + if (jessilib::as_base(first, 16) >= 0 + && jessilib::as_base(second, 16) >= 0) { + continue; + } + TypeParam encoded[4] { '%', static_cast(first), static_cast(second), '%' }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TEST(HtmlFormParser, empty) { + std::vector> parsed_result; + std::string query_text; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_TRUE(query_text.empty()); + EXPECT_TRUE(parsed_result.empty()); +} + +TEST(HtmlFormParser, one_key) { + std::vector> parsed_result; + std::string query_text = "key"; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_EQ(query_text, "key"); + EXPECT_EQ(parsed_result.size(), 1); + EXPECT_EQ(parsed_result[0].first, query_text); + EXPECT_TRUE(parsed_result[0].second.empty()); +} + +TEST(HtmlFormParser, one_key_and_value) { + std::vector> parsed_result; + std::string query_text = "key=value"; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_TRUE(query_text.starts_with("keyvalue")); + EXPECT_EQ(parsed_result.size(), 
1); + EXPECT_EQ(parsed_result[0].first, "key"); + EXPECT_EQ(parsed_result[0].second, "value"); +} + +TEST(HtmlFormParser, one_key_and_value_trailing) { + std::vector> parsed_result; + std::string query_text = "key=value&"; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_TRUE(query_text.starts_with("keyvalue")); + EXPECT_EQ(parsed_result.size(), 2); + EXPECT_EQ(parsed_result[0].first, "key"); + EXPECT_EQ(parsed_result[0].second, "value"); + EXPECT_TRUE(parsed_result[1].first.empty()); + EXPECT_TRUE(parsed_result[1].second.empty()); +} + +TEST(HtmlFormParser, two_key_one_value) { + std::vector> parsed_result; + std::string query_text = "key=value&second_key"; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_TRUE(query_text.starts_with("keyvaluesecond_key")); + EXPECT_EQ(parsed_result.size(), 2); + EXPECT_EQ(parsed_result[0].first, "key"); + EXPECT_EQ(parsed_result[0].second, "value"); + EXPECT_EQ(parsed_result[1].first, "second_key"); + EXPECT_TRUE(parsed_result[1].second.empty()); +} + +TEST(HtmlFormParser, two_key_two_value) { + std::vector> parsed_result; + std::string query_text = "key=value&second_key=second=value"; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_TRUE(query_text.starts_with("keyvaluesecond_keysecond=value")); + EXPECT_EQ(parsed_result.size(), 2); + EXPECT_EQ(parsed_result[0].first, "key"); + EXPECT_EQ(parsed_result[0].second, "value"); + EXPECT_EQ(parsed_result[1].first, "second_key"); + EXPECT_EQ(parsed_result[1].second, "second=value"); +} + +TEST(HtmlFormParser, some_sequences) { + std::vector> parsed_result; + std::string query_text = "k+y=va+u%20&%73econd%5Fke%79=second_valu%65"; + EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text)); + EXPECT_TRUE(query_text.starts_with("k yva u second_keysecond_value")); + EXPECT_EQ(parsed_result.size(), 2); + EXPECT_EQ(parsed_result[0].first, "k y"); + 
EXPECT_EQ(parsed_result[0].second, "va u "); + EXPECT_EQ(parsed_result[1].first, "second_key"); + EXPECT_EQ(parsed_result[1].second, "second_value"); +} diff --git a/src/test/unicode.cpp b/src/test/unicode.cpp index 859374d..077f729 100644 --- a/src/test/unicode.cpp +++ b/src/test/unicode.cpp @@ -25,6 +25,12 @@ using namespace jessilib; using namespace std::literals; +static_assert(codepoint_info::utf8_length == 1); +static_assert(codepoint_info::utf16_length == 1); +static_assert(codepoint_info::utf32_length == 1); +static_assert(codepoint_info::wchar_length == 1); +static_assert(codepoint_info::encode_length == 1); + /** encode_codepoint */ TEST(UTF8Test, encode_codepoint) { diff --git a/src/test/unicode_sequence.cpp b/src/test/unicode_sequence.cpp index 7ab26c5..64fcedc 100644 --- a/src/test/unicode_sequence.cpp +++ b/src/test/unicode_sequence.cpp @@ -21,7 +21,7 @@ #include "jessilib/unicode.hpp" // string_cast #include "test.hpp" -using namespace std; +using namespace std::literals; // Compile-time tests for constexpr on compilers which support C++20 constexpr std::string #ifdef __cpp_lib_constexpr_string @@ -30,17 +30,8 @@ constexpr std::string cpp_constexpr(std::string_view in_expression) { jessilib::apply_cpp_escape_sequences(result); return result; } - -constexpr std::string query_constexpr(std::string_view in_expression) { - std::string result{ in_expression }; - jessilib::deserialize_http_query(result); - return result; -} static_assert(cpp_constexpr("test"s) == "test"s); static_assert(cpp_constexpr("\\r\\n"s) == "\r\n"s); -static_assert(query_constexpr("test"s) == "test"s); -static_assert(query_constexpr("first+second"s) == "first second"s); -static_assert(query_constexpr("first%20second"s) == "first second"s); #endif // __cpp_lib_constexpr_string using char_types = ::testing::Types; @@ -57,12 +48,6 @@ public: }; TYPED_TEST_SUITE(UnicodeSequenceTest, char_types); -template -class UnicodeUTF8SequenceTest : public ::testing::Test { -public: -}; 
-TYPED_TEST_SUITE(UnicodeUTF8SequenceTest, utf8_char_types); - constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing #define TEST_CPP_SEQUENCE(expr) \ @@ -212,123 +197,3 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) { EXPECT_EQ(decode.codepoint, static_cast(codepoint)); } } - -/** - * Query strings - */ - -TYPED_TEST(UnicodeUTF8SequenceTest, single_chars) { - // [U+0000, U+100FF) - for (char32_t codepoint = 0; codepoint < MAX_LOOP_CODEPOINT; ++codepoint) { - std::basic_string expected; - size_t units = jessilib::encode_codepoint(expected, codepoint); - EXPECT_NE(units, 0); - EXPECT_EQ(units, expected.size()); - - // Construct the query string - std::basic_string query_string; - for (auto& unit : expected) { - char encoded[3] { '%', 0, 0 }; - char* encoded_end = encoded + sizeof(encoded); - auto to_chars_result = std::to_chars(encoded + 1, encoded_end, static_cast(unit), 16); - ASSERT_EQ(to_chars_result.ec, std::errc{}) // assertion will fail when `unit` is signed type - << "For unit " << static_cast(unit) << " in codepoint " << static_cast(codepoint) << std::endl; - - if (to_chars_result.ptr != encoded_end) { - // Only wrote one hex; shift it - encoded[2] = encoded[1]; - encoded[1] = '0'; - } - - EXPECT_EQ(encoded[0], '%'); - EXPECT_NE(encoded[1], 0); - EXPECT_NE(encoded[2], 0); - query_string.insert(query_string.end(), encoded, encoded_end); - } - EXPECT_EQ(query_string.size(), expected.size() * 3); - - // Decode & check the query string - jessilib::deserialize_http_query(query_string); - EXPECT_EQ(query_string, expected); - } -} - -TYPED_TEST(UnicodeUTF8SequenceTest, invalids) { - std::basic_string query_string, long_query_string; - for (size_t unit = 0; unit <= 0xFF; ++unit) { - TypeParam encoded[2] { '%', static_cast(unit) }; - TypeParam* encoded_end = encoded + sizeof(encoded); - query_string.insert(query_string.end(), encoded, encoded_end); - - long_query_string += query_string; - jessilib::deserialize_http_query(query_string); - 
EXPECT_TRUE(query_string.empty()) - << "in unit: " << unit << std::endl; - } - - jessilib::deserialize_http_query(long_query_string); - EXPECT_TRUE(long_query_string.empty()); -} - -TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len) { - std::basic_string query_string, long_query_string; - for (size_t unit = 0; unit <= 0xFFFF; ++unit) { - TypeParam first = static_cast(unit >> 8); // order of these two doesn't matter - TypeParam second = static_cast(unit & 0xFF); - if (jessilib::as_base(first, 16) >= 0 - && jessilib::as_base(second, 16) >= 0) { - continue; - } - TypeParam encoded[3] { '%', static_cast(first), static_cast(second) }; - TypeParam* encoded_end = encoded + sizeof(encoded); - query_string.insert(query_string.end(), encoded, encoded_end); - - long_query_string += query_string; - jessilib::deserialize_http_query(query_string); - EXPECT_TRUE(query_string.empty()) - << "in unit: " << unit << std::endl; - } - - jessilib::deserialize_http_query(long_query_string); - EXPECT_TRUE(long_query_string.empty()); -} - -TYPED_TEST(UnicodeUTF8SequenceTest, invalids_trailing) { - std::basic_string query_string, long_query_string; - for (size_t unit = 0; unit <= 0xFF; ++unit) { - TypeParam encoded[3] { '%', static_cast(unit), '%' }; - TypeParam* encoded_end = encoded + sizeof(encoded); - query_string.insert(query_string.end(), encoded, encoded_end); - - long_query_string += query_string; - jessilib::deserialize_http_query(query_string); - EXPECT_TRUE(query_string.empty()) - << "in unit: " << unit << std::endl; - } - - jessilib::deserialize_http_query(long_query_string); - EXPECT_TRUE(long_query_string.empty()); -} - -TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len_trailing) { - std::basic_string query_string, long_query_string; - for (size_t unit = 0; unit <= 0xFFFF; ++unit) { - TypeParam first = static_cast(unit >> 8); // order of these two doesn't matter - TypeParam second = static_cast(unit & 0xFF); - if (jessilib::as_base(first, 16) >= 0 - && jessilib::as_base(second, 
16) >= 0) { - continue; - } - TypeParam encoded[4] { '%', static_cast(first), static_cast(second), '%' }; - TypeParam* encoded_end = encoded + sizeof(encoded); - query_string.insert(query_string.end(), encoded, encoded_end); - - long_query_string += query_string; - jessilib::deserialize_http_query(query_string); - EXPECT_TRUE(query_string.empty()) - << "in unit: " << unit << std::endl; - } - - jessilib::deserialize_http_query(long_query_string); - EXPECT_TRUE(long_query_string.empty()); -}