Browse Source

Initial implementation for syntax_tree, with deserialize_html_form as a prototype; add container::push; split query_string stuff to http_query.hpp

master
Jessica James 3 years ago
parent
commit
e3142da329
  1. 190
      src/include/jessilib/http_query.hpp
  2. 29
      src/include/jessilib/object.hpp
  3. 143
      src/include/jessilib/type_traits.hpp
  4. 415
      src/include/jessilib/unicode.hpp
  5. 56
      src/include/jessilib/unicode_base.hpp
  6. 430
      src/include/jessilib/unicode_compare.hpp
  7. 51
      src/include/jessilib/unicode_sequence.hpp
  8. 139
      src/include/jessilib/unicode_syntax.hpp
  9. 2
      src/test/CMakeLists.txt
  10. 238
      src/test/http_query.cpp
  11. 6
      src/test/unicode.cpp
  12. 137
      src/test/unicode_sequence.cpp

190
src/include/jessilib/http_query.hpp

@ -0,0 +1,190 @@
/**
* Copyright (C) 2021 Jessica James.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Written by Jessica James <jessica.aj@outlook.com>
*/
/**
* @file http_query.hpp
* @author Jessica James
*
* HTTP query string and HTML form deserialization utilities
*/
#pragma once
#include "unicode_syntax.hpp"
#include "unicode_sequence.hpp"
#include "type_traits.hpp"
namespace jessilib {
/**
* Query string escape sequence parser
*
* Table of the escape sequences recognised in an HTTP query string; consumed by deserialize_http_query:
*   '%' - entry built by make_hex_sequence_pair with a digit count of 2
*   '+' - entry built by make_simple_sequence_pair, producing a space (' ')
*
* The entries must remain in sorted order; the static_asserts following the table enforce this for char and char8_t.
* NOTE(review): the meaning of the two trailing bool arguments to make_hex_sequence_pair is defined elsewhere
* (presumably "exact digit count" and "is unicode") -- confirm against its declaration.
*/
template<typename CharT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr> // make_hex_sequence_pair isn't going to play well with other types
static constexpr shrink_sequence_tree<CharT> http_query_escapes_root_tree{
make_hex_sequence_pair<CharT, U'%', 2, true, false>(),
make_simple_sequence_pair<CharT, U'+', ' '>()
};
// Compile-time guard: the table above must be pre-sorted for each supported single-byte character type
static_assert(is_sorted<char, http_query_escapes_root_tree<char>, std::size(http_query_escapes_root_tree<char>)>(), "Tree must be pre-sorted");
static_assert(is_sorted<char8_t, http_query_escapes_root_tree<char8_t>, std::size(http_query_escapes_root_tree<char8_t>)>(), "Tree must be pre-sorted");
template<typename CharT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
constexpr bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, std::size(http_query_escapes_root_tree<CharT>)>(inout_string);
}
// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement
// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in
// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence)
/*template<typename CharT,
std::enable_if_t<sizeof(CharT) != 1>* = nullptr>
bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
//TODO: optimize this?
std::basic_string<char8_t> u8query_string = string_cast<char8_t>(inout_string);
bool result = deserialize_http_query<char8_t>(u8query_string);
inout_string = string_cast<CharT>(u8query_string);
return result;
}*/
/**
* HTML form parser
*
* Parsing state shared by the syntax tree actions while an HTML form string is decoded in-place.
* Member order matters: the context is brace-initialized with { out_container, write_head }.
*/
template<typename CharT, typename ContainerT>
struct HTMLFormContext {
using container_type = ContainerT;
container_type& out_container; // destination that key/value pairs get pushed to
CharT* write_head; // next position decoded output is written to
const CharT* key_start = write_head; // start of the key currently being read; defaults to the initial write_head
const CharT* value_start{}; // value_start is also key_end; stays nullptr until a value has been started for the current key
};
/**
 * Builds the syntax tree entry for the key/value separator (i.e: '=')
 *
 * The first separator seen for a key marks where its value starts; any further separator before the pair is
 * terminated is part of the value and is written to the output verbatim.
 *
 * @tparam InCodepointV Codepoint acting as the separator
 * @return Tree member pairing InCodepointV with the separator action
 */
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_value_start_pair() {
	return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>&) constexpr {
		if (inout_context.value_start == nullptr) {
			// No value pending yet; the value begins at the current write position
			inout_context.value_start = inout_context.write_head;
		}
		else {
			// Already inside a value; emit the separator as ordinary value text
			inout_context.write_head += encode_codepoint(inout_context.write_head, InCodepointV);
		}

		return true;
	} };
}
/**
 * Terminates the key (and value, when one was started) currently being read, pushes the pair to the context's
 * container through jessilib::container::push, and begins reading the next key at the current write position
 *
 * A key with no value is pushed alongside an empty (default-constructed) value view.
 *
 * @param inout_context Parsing context to read the pending key/value from and to reset
 * @return Always true
 */
template<typename CharT, typename ContextT>
constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view<CharT>&) {
	using view_type = std::basic_string_view<CharT>;
	const CharT* segment_end = inout_context.write_head;
	const CharT* key_begin = inout_context.key_start;
	const CharT* value_begin = inout_context.value_start;

	if (value_begin == nullptr) {
		// Valueless key; everything written since key_begin is the key
		view_type key{ key_begin, static_cast<size_t>(segment_end - key_begin) };
		jessilib::container::push(inout_context.out_container, key, view_type{});
	}
	else {
		// The key ends where the value begins; the value runs up to the write head
		view_type key{ key_begin, static_cast<size_t>(value_begin - key_begin) };
		view_type value{ value_begin, static_cast<size_t>(segment_end - value_begin) };
		jessilib::container::push(inout_context.out_container, key, value);
		inout_context.value_start = nullptr;
	}

	// The next key starts where this pair ended
	inout_context.key_start = segment_end;
	return true;
}
/**
 * Builds the syntax tree entry for the pair terminator (i.e: '&'), bound to value_end_action
 *
 * @tparam InCodepointV Codepoint acting as the pair terminator
 */
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_value_end_pair() {
	return syntax_tree_member<CharT, ContextT>{ InCodepointV, value_end_action<CharT, ContextT> };
}
/**
 * Builds a syntax tree entry whose action forwards to hex_shrink_sequence_action, writing at the context's
 * write_head and reading from the remaining input view
 *
 * @tparam InCodepointV Codepoint which introduces the hex sequence
 * @tparam MaxDigitsV, ExactDigitsV, IsUnicode Passed through unchanged to hex_shrink_sequence_action
 */
template<typename CharT, typename ContextT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode>
constexpr syntax_tree_member<CharT, ContextT> make_hex_syntax_shrink_pair() {
	return {
		InCodepointV,
		[](ContextT& context, std::basic_string_view<CharT>& remaining_input) constexpr {
			return hex_shrink_sequence_action<CharT, MaxDigitsV, ExactDigitsV, IsUnicode>(
				context.write_head, remaining_input);
		}
	};
}
/**
 * Builds a syntax tree entry which replaces one codepoint with a single output code unit
 *
 * @tparam InCodepointV Codepoint to match in the input
 * @tparam OutCodepointV Unit written to the context's write_head (cast to CharT) in its place
 */
template<typename CharT, typename ContextT, char32_t InCodepointV, char8_t OutCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_simple_shrink_pair() {
	return { InCodepointV, [](ContextT& context, std::basic_string_view<CharT>&) constexpr {
		// Write the replacement unit and step the write head past it
		*context.write_head++ = static_cast<CharT>(OutCodepointV);
		return true;
	} };
}
/**
 * Default action for characters with no entry in the syntax tree: copies the decoded codepoint's units from the
 * front of the read view to the write head, advancing both
 *
 * @param decode Decode result for the codepoint at the front of inout_read_view; decode.units units are copied
 * @param inout_context Parsing context whose write_head is written to and advanced
 * @param inout_read_view Remaining input; the copied units are removed from its front
 * @return Always true
 */
template<typename CharT, typename ContextT>
bool html_form_default_action(get_endpoint_result decode, ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
	for (auto units_left = decode.units; units_left != 0; --units_left) {
		*inout_context.write_head = inout_read_view.front();
		++inout_context.write_head;
		inout_read_view.remove_prefix(1);
	}

	return true;
}
// Syntax tree for HTML form data, operating on an HTMLFormContext:
//   '%' - hex escape (2 digits), '&' - terminates a key/value pair, '+' - becomes a space, '=' - starts a value
// Entries are listed in ascending codepoint order ('%' < '&' < '+' < '='); deserialize_html_form static_asserts
// that the tree is pre-sorted, so keep this ordering when adding entries.
template<typename CharT, typename ContainerT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr> // make_hex_sequence_pair isn't going to play well with other types
static constexpr syntax_tree<CharT, HTMLFormContext<CharT, ContainerT>> html_form_root_tree{
make_hex_syntax_shrink_pair<CharT, HTMLFormContext<CharT, ContainerT>, U'%', 2, true, false>(),
make_value_end_pair<CharT, HTMLFormContext<CharT, ContainerT>, U'&'>(),
make_simple_shrink_pair<CharT, HTMLFormContext<CharT, ContainerT>, U'+', ' '>(),
make_value_start_pair<CharT, HTMLFormContext<CharT, ContainerT>, U'='>()
};
template<typename CharT, typename ContainerT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
constexpr bool deserialize_html_form(ContainerT& out_container, std::basic_string<CharT>& inout_string) {
if (inout_string.empty()) {
return true; // even though apply_syntax_tree checks for this, check it here anyways so we don't call value_end_action
}
HTMLFormContext<CharT, ContainerT> context{ out_container, inout_string.data() };
constexpr auto& html_form_tree = html_form_root_tree<CharT, ContainerT>;
static_assert(is_sorted<char, decltype(context), html_form_tree, std::size(html_form_tree)>(), "Tree must be pre-sorted");
std::basic_string_view<CharT> read_view{ inout_string };
if (apply_syntax_tree<CharT, decltype(context), html_form_tree, std::size(html_form_tree), html_form_default_action>
(context, read_view)) {
value_end_action<CharT, decltype(context)>(context, read_view);
return true;
}
return false;
}
} // namespace jessilib

29
src/include/jessilib/object.hpp

@ -101,7 +101,7 @@ public:
template<typename T, template<typename T,
typename std::enable_if<is_backing<typename std::decay<T>::type>::value typename std::enable_if<is_backing<typename std::decay<T>::type>::value
&& !is_sequence_container<typename std::decay<T>::type>::value && !is_sequence_container<typename std::decay<T>::type>::value
&& (!is_associative_container<typename std::decay<T>::type>::value || std::is_same<typename remove_cvref<T>::type, map_type>::value)>::type* = nullptr> && (!is_associative_container<typename std::decay<T>::type>::value || std::is_same<typename std::remove_cvref<T>::type, map_type>::value)>::type* = nullptr>
object(T&& in_value) object(T&& in_value)
: m_value{ typename is_backing<typename std::decay<T>::type>::type{ std::forward<T>(in_value) } } { : m_value{ typename is_backing<typename std::decay<T>::type>::type{ std::forward<T>(in_value) } } {
// Empty ctor body // Empty ctor body
@ -140,10 +140,10 @@ public:
// Non-map_type associative containers (container<string_type, T>) // Non-map_type associative containers (container<string_type, T>)
template<typename T, template<typename T,
typename std::enable_if<is_associative_container<typename remove_cvref<T>::type>::value typename std::enable_if<is_associative_container<typename std::remove_cvref<T>::type>::value
&& (std::is_convertible<typename is_associative_container<typename remove_cvref<T>::type>::key_type, string_type>::value && (std::is_convertible<typename is_associative_container<typename std::remove_cvref<T>::type>::key_type, string_type>::value
|| std::is_convertible<typename is_associative_container<typename remove_cvref<T>::type>::key_type, string_view_type>::value) || std::is_convertible<typename is_associative_container<typename std::remove_cvref<T>::type>::key_type, string_view_type>::value)
&& !std::is_same<typename is_associative_container<typename remove_cvref<T>::type>::value_type, object>::value>::type* = nullptr> && !std::is_same<typename is_associative_container<typename std::remove_cvref<T>::type>::value_type, object>::value>::type* = nullptr>
object(T&& in_value) object(T&& in_value)
: m_value{ map_type{} } { : m_value{ map_type{} } {
auto& map = std::get<map_type>(m_value); auto& map = std::get<map_type>(m_value);
@ -474,6 +474,25 @@ private:
// represent as a map, whereas an actual xml document is sequenced // represent as a map, whereas an actual xml document is sequenced
}; // object }; // object
namespace container {

/**
 * Pushes a key/value pair to a jessilib::object
 *
 * - null or map objects: the value is assigned to in_key on the object itself
 * - array objects: the value is assigned to in_key on the element indexed at the array's current size
 * - any other type: nothing happens; pushing a key/value pair isn't valid there
 */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<std::is_same_v<ContainerT, object>>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	auto kind = inout_container.type();

	if (kind == object::type::array) {
		// Array type; push to the back of the array
		inout_container[inout_container.size()][in_key] = in_value;
		return;
	}

	if (kind == object::type::null || kind == object::type::map) {
		// Null or map type; push straight to the map
		inout_container[in_key] = in_value;
	}
}

} // namespace container
} // namespace jessilib } // namespace jessilib

143
src/include/jessilib/type_traits.hpp

@ -20,9 +20,14 @@
#include <cstddef> #include <cstddef>
#include <type_traits> #include <type_traits>
// Container types we're using, more or less purely because we can't forward declare these at all
#include <vector> #include <vector>
#include <list> #include <list>
#include <forward_list> #include <forward_list>
#include <stack>
#include <queue>
#include <deque>
#include <set> #include <set>
#include <unordered_set> #include <unordered_set>
#include <map> #include <map>
@ -32,13 +37,6 @@
namespace jessilib { namespace jessilib {
/** remove_cvref: strips the reference and then top-level cv-qualifiers from T (can be replaced with C++20) */
template<class T>
struct remove_cvref {
	using type = std::remove_cv_t<std::remove_reference_t<T>>;
};
/** is_basic_string */ /** is_basic_string */
template<typename T> template<typename T>
@ -65,6 +63,20 @@ struct is_basic_string_view<std::basic_string_view<T>> {
constexpr bool operator()() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; }
}; };
/** is_pair: true for std::pair specializations, exposing the pair's element types */
template<typename T>
struct is_pair : std::false_type {};

template<typename LeftT, typename RightT>
struct is_pair<std::pair<LeftT, RightT>> {
	static constexpr bool value{ true };
	using first_type = LeftT;
	using second_type = RightT;

	constexpr bool operator()() const noexcept { return true; }
	constexpr operator bool() const noexcept { return true; }
};
/** is_vector */ /** is_vector */
template<typename T> template<typename T>
@ -104,6 +116,45 @@ struct is_forward_list<std::forward_list<T>> {
constexpr bool operator()() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; }
}; };
/** is_stack: true for any std::stack, regardless of its underlying container; `type` is the element type */
template<typename T>
struct is_stack : std::false_type {};

// Deducing the underlying container too means std::stack<T, std::vector<T>> etc. are recognised,
// not only the default std::stack<T> (which is std::stack<T, std::deque<T>>)
template<typename T, typename ContainerT>
struct is_stack<std::stack<T, ContainerT>> {
	using type = T;
	static constexpr bool value{ true };
	constexpr operator bool() const noexcept { return true; }
	constexpr bool operator()() const noexcept { return true; }
};
/** is_queue: true for any std::queue, regardless of its underlying container; `type` is the element type */
template<typename T>
struct is_queue : std::false_type {};

// Deducing the underlying container too means std::queue<T, std::list<T>> etc. are recognised,
// not only the default std::queue<T> (which is std::queue<T, std::deque<T>>)
template<typename T, typename ContainerT>
struct is_queue<std::queue<T, ContainerT>> {
	using type = T;
	static constexpr bool value{ true };
	constexpr operator bool() const noexcept { return true; }
	constexpr bool operator()() const noexcept { return true; }
};
/** is_deque: true for any std::deque, regardless of its allocator; `type` is the element type */
template<typename T>
struct is_deque : std::false_type {};

// Deducing the allocator too means deques using a non-default allocator are recognised as well
template<typename T, typename AllocatorT>
struct is_deque<std::deque<T, AllocatorT>> {
	using type = T;
	static constexpr bool value{ true };
	constexpr operator bool() const noexcept { return true; }
	constexpr bool operator()() const noexcept { return true; }
};
/** is_set */ /** is_set */
template<typename T> template<typename T>
@ -170,6 +221,20 @@ struct is_map<std::map<KeyT, ValueT>> {
constexpr bool operator()() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; }
}; };
/** is_multimap: true for any std::multimap, regardless of its comparator or allocator */
template<typename T>
struct is_multimap : std::false_type {};

// Deducing the comparator and allocator too means multimaps with custom ordering are recognised as well
template<typename KeyT, typename ValueT, typename CompareT, typename AllocatorT>
struct is_multimap<std::multimap<KeyT, ValueT, CompareT, AllocatorT>> {
	using key_type = KeyT;
	using value_type = ValueT;
	static constexpr bool value{ true };
	constexpr operator bool() const noexcept { return true; }
	constexpr bool operator()() const noexcept { return true; }
};
/** is_unordered_map */ /** is_unordered_map */
template<typename T> template<typename T>
@ -184,6 +249,18 @@ struct is_unordered_map<std::unordered_map<KeyT, ValueT>> {
constexpr bool operator()() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; }
}; };
/** is_unordered_multimap: true for any std::unordered_multimap, regardless of its hash, equality or allocator */
template<typename T>
struct is_unordered_multimap : std::false_type {};

// Deducing the hash, equality and allocator too means unordered_multimaps using custom hashing are recognised as well
template<typename KeyT, typename ValueT, typename HashT, typename KeyEqualT, typename AllocatorT>
struct is_unordered_multimap<std::unordered_multimap<KeyT, ValueT, HashT, KeyEqualT, AllocatorT>> {
	using key_type = KeyT;
	using value_type = ValueT;
	static constexpr bool value{ true };
	constexpr operator bool() const noexcept { return true; }
	constexpr bool operator()() const noexcept { return true; }
};
/** is_associative_container */ /** is_associative_container */
template<typename T> template<typename T>
@ -236,6 +313,7 @@ struct is_sequence_container<std::forward_list<T>> {
constexpr bool operator()() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; }
}; };
// Sets are really associative containers, not sequence...
template<typename T> template<typename T>
struct is_sequence_container<std::set<T>> { struct is_sequence_container<std::set<T>> {
using type = T; using type = T;
@ -268,4 +346,55 @@ struct is_sequence_container<std::unordered_multiset<T>> {
constexpr bool operator()() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; }
}; };
/**
 * Push helper for pushing key/value pairs to arbitrary container types
 *
 * If ContainerT is associative: set key/value
 * If ContainerT is multi-associative: add key/value
 * If ContainerT is sequential: push key/value pair to back
 */
namespace container {

/** Pushing to associative containers */

/** Maps: assigns in_value to in_key, overwriting any value already stored for that key */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<is_map<ContainerT>::value || is_unordered_map<ContainerT>::value>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	inout_container[in_key] = in_value;
}

/** Sets: inserts the {key, value} element, replacing any element the set already considers equivalent */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<is_set<ContainerT>::value || is_unordered_set<ContainerT>::value>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	// insert() returns pair<iterator, bool>; .second is false when an equivalent element blocked the insertion
	auto insert_result = inout_container.insert({in_key, in_value});
	if (!insert_result.second) {
		// Set elements are immutable through their iterators, so swap the element out rather than assigning to it
		auto hint = inout_container.erase(insert_result.first);
		inout_container.insert(hint, {in_key, in_value});
	}
}

/** Multi-associative containers: always adds a new {key, value} element */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<is_multimap<ContainerT>::value || is_unordered_multimap<ContainerT>::value
		|| is_multiset<ContainerT>::value || is_unordered_multiset<ContainerT>::value>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	inout_container.insert({in_key, in_value});
}

/** Pushing to sequential containers */

/** vector / list / deque: appends the {key, value} element to the back */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<is_vector<ContainerT>::value || is_list<ContainerT>::value || is_deque<ContainerT>::value>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	inout_container.push_back({in_key, in_value});
}

/** queue / stack: pushes the {key, value} element through the adaptor's own push() */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<is_queue<ContainerT>::value || is_stack<ContainerT>::value>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	inout_container.push({in_key, in_value});
}

/** forward_list: pushes the {key, value} element to the FRONT, so elements end up in reverse push order */
template<typename ContainerT, typename LeftT, typename RightT,
	typename std::enable_if_t<is_forward_list<ContainerT>::value>* = nullptr>
constexpr void push(ContainerT& inout_container, LeftT&& in_key, RightT&& in_value) {
	inout_container.push_front({in_key, in_value});
}

} // namespace container
} // namespace jessilib } // namespace jessilib

415
src/include/jessilib/unicode.hpp

@ -18,10 +18,7 @@
#pragma once #pragma once
#include <string> #include "unicode_compare.hpp"
#include <string_view>
#include <ostream>
#include "unicode_base.hpp"
namespace jessilib { namespace jessilib {
@ -179,246 +176,6 @@ std::basic_string<OutCharT> string_cast(const InT& in_string) {
} }
} }
/**
 * Case-insensitive comparison of two codepoints
 *
 * @param lhs First codepoint to compare
 * @param rhs Second codepoint to compare
 * @return True if the codepoints are identical or fold to the same value, false otherwise
 */
inline bool equalsi(char32_t lhs, char32_t rhs) {
	if (lhs == rhs) {
		// Identical codepoints; no need to fold
		return true;
	}

	return fold(lhs) == fold(rhs);
}
// Should just make these methods container-type agnostic rather than this mess...
// ADAPT_BASIC_STRING(method) generates three forwarding overloads of `method`, so that either or both arguments
// may be a std::basic_string: each overload converts its std::basic_string argument(s) to std::basic_string_view
// and calls `method` again with the converted arguments.
#define ADAPT_BASIC_STRING(method) \
template<typename LhsCharT, typename RhsCharT> \
auto method(const std::basic_string<LhsCharT>& lhs, std::basic_string_view<RhsCharT> rhs) { \
return method(static_cast<std::basic_string_view<LhsCharT>>(lhs), rhs); } \
template<typename LhsCharT, typename RhsCharT> \
auto method(std::basic_string_view<LhsCharT> lhs, const std::basic_string<RhsCharT>& rhs) { \
return method(lhs, static_cast<std::basic_string_view<RhsCharT>>(rhs)); } \
template<typename LhsCharT, typename RhsCharT> \
auto method(const std::basic_string<LhsCharT>& lhs, const std::basic_string<RhsCharT>& rhs) { \
return method(static_cast<std::basic_string_view<LhsCharT>>(lhs), static_cast<std::basic_string_view<RhsCharT>>(rhs)); }
/**
 * Compares two strings for equality
 *
 * Strings sharing a character type are compared directly; otherwise both are decoded and compared one codepoint
 * at a time.
 *
 * @tparam LhsCharT Character type for left-hand parameter
 * @tparam RhsCharT Character type for right-hand parameter
 * @param lhs First string to compare
 * @param rhs Second string to compare against
 * @return True if the strings are equal, false otherwise (including when a codepoint fails to decode)
 */
template<typename LhsCharT, typename RhsCharT>
bool equals(std::basic_string_view<LhsCharT> lhs, std::basic_string_view<RhsCharT> rhs) {
	// Same character type; the views can be compared as-is
	if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
		return lhs == rhs;
	}

	while (!(lhs.empty() || rhs.empty())) {
		const auto lhs_decode = decode_codepoint(lhs);
		const auto rhs_decode = decode_codepoint(rhs);

		// A zero-unit decode means the front of that string is a bad unicode sequence
		const bool both_decoded = lhs_decode.units != 0 && rhs_decode.units != 0;
		if (!both_decoded || lhs_decode.codepoint != rhs_decode.codepoint) {
			return false;
		}

		// Front codepoints match; drop them and move on
		lhs.remove_prefix(lhs_decode.units);
		rhs.remove_prefix(rhs_decode.units);
	}

	// Equal only if both strings ran out together
	return lhs.empty() && rhs.empty();
}
ADAPT_BASIC_STRING(equals)
/**
 * Compares two strings for equality, ignoring case
 *
 * @tparam LhsCharT Character type for left-hand parameter
 * @tparam RhsCharT Character type for right-hand parameter
 * @param lhs First string to compare
 * @param rhs Second string to compare against
 * @return True if the strings are equal, false otherwise (including when a codepoint fails to decode)
 */
template<typename LhsCharT, typename RhsCharT>
bool equalsi(std::basic_string_view<LhsCharT> lhs, std::basic_string_view<RhsCharT> rhs) {
	// Same character type; strings of differing size are rejected up-front
	if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
		if (lhs.size() != rhs.size()) {
			return false;
		}
	}

	while (!(lhs.empty() || rhs.empty())) {
		const auto lhs_decode = decode_codepoint(lhs);
		const auto rhs_decode = decode_codepoint(rhs);

		// A zero-unit decode means the front of that string is a bad unicode sequence
		if (lhs_decode.units == 0 || rhs_decode.units == 0) {
			return false;
		}

		// Front codepoints must fold to the same value
		if (!equalsi(lhs_decode.codepoint, rhs_decode.codepoint)) {
			return false;
		}

		lhs.remove_prefix(lhs_decode.units);
		rhs.remove_prefix(rhs_decode.units);
	}

	// Equal only if both strings ran out together
	return lhs.empty() && rhs.empty();
}
ADAPT_BASIC_STRING(equalsi)
/**
* Checks if a string starts with a substring
*
* @tparam LhsCharT Character type for underlying string
* @tparam RhsCharT Character type for prefix string
* @param in_string String to check for prefix
* @param in_prefix Substring prefix to check for
* @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise
*/
template<typename LhsCharT, typename RhsCharT>
size_t starts_with_length(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
// If in_string and in_prefix are the same type, compare their sizes and quickly return if in_string is too small
if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
if (in_string.size() < in_prefix.size()) {
return 0;
}
}
size_t codepoints_removed{};
while (!in_string.empty() && !in_prefix.empty()) {
get_endpoint_result string_front = decode_codepoint(in_string);
get_endpoint_result prefix_front = decode_codepoint(in_prefix);
if (string_front.units == 0
|| prefix_front.units == 0) {
// Failed to decode front codepoint; bad unicode sequence
return 0;
}
if (string_front.codepoint != prefix_front.codepoint) {
// Codepoints aren't the same
return 0;
}
// Codepoints are equal; trim off the fronts and continue
in_string.remove_prefix(string_front.units);
in_prefix.remove_prefix(prefix_front.units);
codepoints_removed += string_front.units;
}
if (!in_prefix.empty()) {
// We reached end of in_string before end of prefix
return 0;
}
return codepoints_removed;
}
ADAPT_BASIC_STRING(starts_with_length)
/**
* Checks if a string starts with a substring (case insensitive)
*
* @tparam LhsCharT Character type for underlying string
* @tparam RhsCharT Character type for prefix string
* @param in_string String to check for prefix
* @param in_prefix Substring prefix to check for
* @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise
*/
template<typename LhsCharT, typename RhsCharT>
size_t starts_with_lengthi(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
// If in_string and in_prefix are the same type, skip decoding each point
if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
if (in_string.size() < in_prefix.size()) {
return 0;
}
}
size_t codepoints_removed{};
while (!in_string.empty() && !in_prefix.empty()) {
get_endpoint_result string_front = decode_codepoint(in_string);
get_endpoint_result prefix_front = decode_codepoint(in_prefix);
if (string_front.units == 0
|| prefix_front.units == 0) {
// Failed to decode front codepoint; bad unicode sequence
return 0;
}
if (!equalsi(string_front.codepoint, prefix_front.codepoint)) {
// Codepoints don't fold to same value
return 0;
}
// Codepoints are equal; trim off the fronts and continue
in_string.remove_prefix(string_front.units);
in_prefix.remove_prefix(prefix_front.units);
codepoints_removed += string_front.units;
}
if (!in_prefix.empty()) {
// We reached end of in_string before end of prefix
return 0;
}
return codepoints_removed;
}
ADAPT_BASIC_STRING(starts_with_lengthi)
/**
 * Tests whether a string begins with a given prefix
 *
 * @tparam LhsCharT Character type for underlying string
 * @tparam RhsCharT Character type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Prefix to check for
 * @return True if starts_with_length reports a non-zero length for the pair, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool starts_with(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	const size_t prefix_length = starts_with_length<LhsCharT, RhsCharT>(in_string, in_prefix);
	return prefix_length != 0;
}
ADAPT_BASIC_STRING(starts_with)
/**
 * Tests whether a string begins with a given prefix, ignoring case
 *
 * @tparam LhsCharT Character type for underlying string
 * @tparam RhsCharT Character type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Prefix to check for
 * @return True if starts_with_lengthi reports a non-zero length for the pair, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool starts_withi(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	const size_t prefix_length = starts_with_lengthi<LhsCharT, RhsCharT>(in_string, in_prefix);
	return prefix_length != 0;
}
ADAPT_BASIC_STRING(starts_withi)
/** /**
* Searches a string for a specified substring * Searches a string for a specified substring
* *
@ -651,176 +408,6 @@ OutT join(ArgsT&&... args) {
return result; return result;
} }
/**
* Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash
* regardless of underlying encoding
*
* This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text
*/
struct text_hash {
using is_transparent = std::true_type;
template<typename CharT>
static uint64_t hash(const CharT* data, const CharT* end) {
uint64_t hash = 14695981039346656037ULL;
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint(data, end);
if (decode.units == 0) {
return hash;
}
hash = hash ^ decode.codepoint;
hash = hash * 1099511628211ULL;
data += decode.units;
}
return hash;
}
auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char8_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char16_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char32_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
};
/**
* Equality functor which compares two strings through equals(), accepting any mix of std::basic_string and
* std::basic_string_view operands and any pairing of character types
*/
struct text_equal {
using is_transparent = std::true_type; // marker type; conventionally checked by containers to enable heterogeneous lookup
// view vs view
template<typename LhsCharT, typename RhsCharT>
bool operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
// view vs string
template<typename LhsCharT, typename RhsCharT>
bool operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
// string vs view
template<typename LhsCharT, typename RhsCharT>
bool operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
// string vs string
template<typename LhsCharT, typename RhsCharT>
bool operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
};
/**
* Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the
* same hash regardless of underlying encoding or the casing of its values.
*
* This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text
*/
struct text_hashi {
using is_transparent = std::true_type;
template<typename CharT>
static uint64_t hash(const CharT* data, const CharT* end) {
uint64_t hash = 14695981039346656037ULL;
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint(data, end - data);
if (decode.units == 0) {
return hash;
}
hash = hash ^ fold(decode.codepoint);
hash = hash * 1099511628211ULL;
data += decode.units;
}
return hash;
}
auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char8_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char16_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char32_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
};
/**
* Case-insensitive equality functor which compares two strings through equalsi(), accepting any mix of
* std::basic_string and std::basic_string_view operands and any pairing of character types
*/
struct text_equali {
using is_transparent = std::true_type; // marker type; conventionally checked by containers to enable heterogeneous lookup
// view vs view
template<typename LhsCharT, typename RhsCharT>
bool operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
// view vs string
template<typename LhsCharT, typename RhsCharT>
bool operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
// string vs view
template<typename LhsCharT, typename RhsCharT>
bool operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
// string vs string
template<typename LhsCharT, typename RhsCharT>
bool operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
}
};
/** to_lower / to_upper */ /** to_lower / to_upper */
//char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_lower(char32_t in_chr); // TODO: implement
//char32_t to_upper(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement

56
src/include/jessilib/unicode_base.hpp

@ -167,6 +167,62 @@ using encode_buffer_type = CharT[unicode_traits<CharT>::max_units_per_codepoint]
char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output) char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output)
constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise
/**
 * Case-insensitive comparison of two codepoints
 *
 * @param lhs First codepoint to compare
 * @param rhs Second codepoint to compare
 * @return True if the codepoints are identical or fold to the same value, false otherwise
 */
inline bool equalsi(char32_t lhs, char32_t rhs) {
	if (lhs == rhs) {
		// Identical codepoints; no need to fold
		return true;
	}

	return fold(lhs) == fold(rhs);
}
/**
 * Compile-time information about a single codepoint: its value, the length reported by encode_codepoint for each
 * supported character type, and exactly-sized buffer types to encode it into.
 *
 * @tparam InCodepointV Codepoint to describe
 */
template<char32_t InCodepointV>
struct codepoint_info {
private:
// Encodes in_codepoint into a scratch buffer of type encode_buffer_type<CharT> and returns whatever
// encode_codepoint reports (presumably the number of code units written -- confirm against encode_codepoint)
template<typename CharT>
static constexpr size_t encode_codepoint_length(char32_t in_codepoint) {
encode_buffer_type<CharT> buffer{};
return encode_codepoint(buffer, in_codepoint);
}
public:
// The codepoint being described
static constexpr char32_t value = InCodepointV;
// Length reported by encode_codepoint when encoding the codepoint as CharT
template<typename CharT>
static constexpr size_t encode_length = encode_codepoint_length<CharT>(InCodepointV);
// Array type sized to exactly encode_length<CharT> elements
template<typename CharT>
using encode_buffer = CharT[encode_length<CharT>];
// Per-character-type shorthands for encode_length
static constexpr size_t utf8_length = encode_length<char8_t>;
static constexpr size_t utf16_length = encode_length<char16_t>;
static constexpr size_t utf32_length = encode_length<char32_t>;
static constexpr size_t wchar_length = encode_length<wchar_t>;
// Per-character-type exactly-sized buffers
using utf8_buffer = char8_t[utf8_length];
using utf16_buffer = char16_t[utf16_length];
using utf32_buffer = char32_t[utf32_length];
using wchar_buffer = wchar_t[wchar_length];
// Encodes the codepoint into an exactly-sized buffer; the overload is selected by the buffer's element type
static constexpr void encode(utf8_buffer& buffer) {
encode_codepoint(buffer, InCodepointV);
}
static constexpr void encode(utf16_buffer& buffer) {
encode_codepoint(buffer, InCodepointV);
}
static constexpr void encode(utf32_buffer& buffer) {
encode_codepoint(buffer, InCodepointV);
}
static constexpr void encode(wchar_buffer& buffer) {
encode_codepoint(buffer, InCodepointV);
}
};
/** /**
* Inline constexpr encode implementation * Inline constexpr encode implementation
*/ */

430
src/include/jessilib/unicode_compare.hpp

@ -0,0 +1,430 @@
/**
* Copyright (C) 2021 Jessica James.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Written by Jessica James <jessica.aj@outlook.com>
*/
/**
* @file unicode_compare.hpp
* @author Jessica James
*
* Unicode-aware encoding-agnostic string comparison & hashing utilities
*/
#pragma once
#include "unicode_base.hpp"
namespace jessilib {
// Should just make these methods container-type agnostic rather than this mess...
//
// ADAPT_BASIC_STRING(method) declares three forwarding overloads of `method` taking (string, view),
// (view, string), and (string, string). Each converts its std::basic_string argument(s) to
// std::basic_string_view and calls `method` again, so only the (view, view) form needs a real implementation.
// `method` must already be declared with a (view, view) signature before the macro is invoked.
#define ADAPT_BASIC_STRING(method) \
template<typename LhsCharT, typename RhsCharT> \
auto method(const std::basic_string<LhsCharT>& lhs, std::basic_string_view<RhsCharT> rhs) { \
return method(static_cast<std::basic_string_view<LhsCharT>>(lhs), rhs); } \
template<typename LhsCharT, typename RhsCharT> \
auto method(std::basic_string_view<LhsCharT> lhs, const std::basic_string<RhsCharT>& rhs) { \
return method(lhs, static_cast<std::basic_string_view<RhsCharT>>(rhs)); } \
template<typename LhsCharT, typename RhsCharT> \
auto method(const std::basic_string<LhsCharT>& lhs, const std::basic_string<RhsCharT>& rhs) { \
return method(static_cast<std::basic_string_view<LhsCharT>>(lhs), static_cast<std::basic_string_view<RhsCharT>>(rhs)); }
/**
 * Compares two strings codepoint-by-codepoint, independent of their encodings
 *
 * @tparam LhsCharT Character type for left-hand parameter
 * @tparam RhsCharT Character type for right-hand parameter
 * @param lhs First string to compare
 * @param rhs Second string to compare against
 * @return True if both strings decode to the same codepoints, false if they differ or either fails to decode
 */
template<typename LhsCharT, typename RhsCharT>
bool equals(std::basic_string_view<LhsCharT> lhs, std::basic_string_view<RhsCharT> rhs) {
	if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
		// Same character type on both sides: compare the code units directly, no decoding required
		return lhs == rhs;
	}
	else {
		while (!(lhs.empty() || rhs.empty())) {
			const auto lhs_front = decode_codepoint(lhs);
			const auto rhs_front = decode_codepoint(rhs);

			// A zero-unit decode means a bad unicode sequence; treat it the same as a mismatch
			const bool decoded = lhs_front.units != 0 && rhs_front.units != 0;
			if (!decoded || lhs_front.codepoint != rhs_front.codepoint) {
				return false;
			}

			// Fronts match; drop them from both views and move on
			lhs.remove_prefix(lhs_front.units);
			rhs.remove_prefix(rhs_front.units);
		}

		// Equal only if both strings were consumed in full
		return lhs.empty() && rhs.empty();
	}
}
ADAPT_BASIC_STRING(equals)
/**
 * Checks if two strings are equal (case insensitive)
 *
 * Both strings are decoded one codepoint at a time and compared via equalsi(char32_t, char32_t), so the
 * operands may use different encodings.
 *
 * @tparam LhsCharT Character type for left-hand parameter
 * @tparam RhsCharT Character type for right-hand parameter
 * @param lhs First string to compare
 * @param rhs Second string to compare against
 * @return True if the strings are equal ignoring case, false if they differ or either fails to decode
 */
template<typename LhsCharT, typename RhsCharT>
bool equalsi(std::basic_string_view<LhsCharT> lhs, std::basic_string_view<RhsCharT> rhs) {
	// A differing unit count only proves inequality when every codepoint occupies exactly one unit (UTF-32).
	// In variable-width encodings, codepoints which fold to the same value may have different encoded lengths
	// (e.g. U+212A KELVIN SIGN is 3 UTF-8 units whereas 'k' is 1), so the shortcut must not be applied there.
	if constexpr (std::is_same_v<LhsCharT, RhsCharT> && std::is_same_v<LhsCharT, char32_t>) {
		if (lhs.size() != rhs.size()) {
			return false;
		}
	}

	while (!lhs.empty() && !rhs.empty()) {
		auto lhs_front = decode_codepoint(lhs);
		auto rhs_front = decode_codepoint(rhs);

		if (lhs_front.units == 0
			|| rhs_front.units == 0) {
			// Failed to decode front codepoint; bad unicode sequence
			return false;
		}

		if (!equalsi(lhs_front.codepoint, rhs_front.codepoint)) {
			// Codepoints don't fold to same value
			return false;
		}

		// Codepoints are equal; trim off the fronts and continue
		lhs.remove_prefix(lhs_front.units);
		rhs.remove_prefix(rhs_front.units);
	}

	// Equal only if both strings were consumed in full
	return lhs.empty() && rhs.empty();
}
ADAPT_BASIC_STRING(equalsi)
/**
* Checks if a string starts with a substring
*
* @tparam LhsCharT Character type for underlying string
* @tparam RhsCharT Character type for prefix string
* @param in_string String to check for prefix
* @param in_prefix Substring prefix to check for
* @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise
*/
template<typename LhsCharT, typename RhsCharT>
size_t starts_with_length(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
// If in_string and in_prefix are the same type, compare their sizes and quickly return if in_string is too small
if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
if (in_string.size() < in_prefix.size()) {
return 0;
}
}
size_t codepoints_removed{};
while (!in_string.empty() && !in_prefix.empty()) {
get_endpoint_result string_front = decode_codepoint(in_string);
get_endpoint_result prefix_front = decode_codepoint(in_prefix);
if (string_front.units == 0
|| prefix_front.units == 0) {
// Failed to decode front codepoint; bad unicode sequence
return 0;
}
if (string_front.codepoint != prefix_front.codepoint) {
// Codepoints aren't the same
return 0;
}
// Codepoints are equal; trim off the fronts and continue
in_string.remove_prefix(string_front.units);
in_prefix.remove_prefix(prefix_front.units);
codepoints_removed += string_front.units;
}
if (!in_prefix.empty()) {
// We reached end of in_string before end of prefix
return 0;
}
return codepoints_removed;
}
ADAPT_BASIC_STRING(starts_with_length)
/**
 * Checks if a string starts with a substring (case insensitive)
 *
 * @tparam LhsCharT Character type for underlying string
 * @tparam RhsCharT Character type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Substring prefix to check for
 * @return Number of LhsCharT units of in_string covered by in_prefix if in_string starts with in_prefix
 * ignoring case, 0 otherwise (including when in_prefix is empty or either string fails to decode)
 */
template<typename LhsCharT, typename RhsCharT>
size_t starts_with_lengthi(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	// A prefix with more units than the string can only be rejected up front when every codepoint occupies
	// exactly one unit (UTF-32). In variable-width encodings a prefix codepoint may be encoded with more units
	// than the codepoint it folds equal to (e.g. U+212A KELVIN SIGN vs 'k' in UTF-8), so a longer prefix can
	// still match.
	if constexpr (std::is_same_v<LhsCharT, RhsCharT> && std::is_same_v<LhsCharT, char32_t>) {
		if (in_string.size() < in_prefix.size()) {
			return 0;
		}
	}

	size_t units_removed{}; // units of in_string consumed so far
	while (!in_string.empty() && !in_prefix.empty()) {
		get_endpoint_result string_front = decode_codepoint(in_string);
		get_endpoint_result prefix_front = decode_codepoint(in_prefix);

		if (string_front.units == 0
			|| prefix_front.units == 0) {
			// Failed to decode front codepoint; bad unicode sequence
			return 0;
		}

		if (!equalsi(string_front.codepoint, prefix_front.codepoint)) {
			// Codepoints don't fold to same value
			return 0;
		}

		// Codepoints are equal; trim off the fronts and continue
		in_string.remove_prefix(string_front.units);
		in_prefix.remove_prefix(prefix_front.units);
		units_removed += string_front.units;
	}

	if (!in_prefix.empty()) {
		// We reached end of in_string before end of prefix
		return 0;
	}

	return units_removed;
}
ADAPT_BASIC_STRING(starts_with_lengthi)
/**
 * Tests whether a string begins with a given prefix
 *
 * Implemented in terms of starts_with_length, so an empty prefix (matched length of 0) reports false.
 *
 * @tparam LhsCharT Character type for underlying string
 * @tparam RhsCharT Character type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Prefix to check for
 * @return True if both strings are valid and in_string starts with in_prefix, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool starts_with(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	const size_t matched_length = starts_with_length<LhsCharT, RhsCharT>(in_string, in_prefix);
	return matched_length > 0;
}
ADAPT_BASIC_STRING(starts_with)
/**
 * Tests whether a string begins with a given prefix, ignoring case
 *
 * Implemented in terms of starts_with_lengthi, so an empty prefix (matched length of 0) reports false.
 *
 * @tparam LhsCharT Character type for underlying string
 * @tparam RhsCharT Character type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Prefix to check for
 * @return True if both strings are valid and in_string starts with in_prefix, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool starts_withi(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	const size_t matched_length = starts_with_lengthi<LhsCharT, RhsCharT>(in_string, in_prefix);
	return matched_length > 0;
}
ADAPT_BASIC_STRING(starts_withi)
/**
* Calculates the hash of a string based on its codepoints, such that a unicode string will always produce the same hash
* regardless of underlying encoding
*
* This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text
*/
struct text_hash {
using is_transparent = std::true_type;
template<typename CharT>
static uint64_t hash(const CharT* data, const CharT* end) {
uint64_t hash = 14695981039346656037ULL;
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint(data, end);
if (decode.units == 0) {
return hash;
}
hash = hash ^ decode.codepoint;
hash = hash * 1099511628211ULL;
data += decode.units;
}
return hash;
}
auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char8_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char16_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char32_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
};
/**
 * Transparent equality functor for text keys; accepts any mix of std::basic_string and std::basic_string_view
 * operands, which may use different character types.
 */
struct text_equal {
	using is_transparent = std::true_type;

	// view / view: the comparison every other overload funnels into
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
		return equals<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// view / string
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
		return (*this)(in_lhs, static_cast<std::basic_string_view<RhsCharT>>(in_rhs));
	}

	// string / view
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
		return (*this)(static_cast<std::basic_string_view<LhsCharT>>(in_lhs), in_rhs);
	}

	// string / string
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
		return (*this)(static_cast<std::basic_string_view<LhsCharT>>(in_lhs),
			static_cast<std::basic_string_view<RhsCharT>>(in_rhs));
	}
};
/**
* Calculates the hash of a string based on its folded codepoints, such that a unicode string will always produce the
* same hash regardless of underlying encoding or the casing of its values.
*
* This is not intended for generating hashses of arbitrary data; it's specifically intended for strings of text
*/
struct text_hashi {
using is_transparent = std::true_type;
template<typename CharT>
static uint64_t hash(const CharT* data, const CharT* end) {
uint64_t hash = 14695981039346656037ULL;
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint(data, end - data);
if (decode.units == 0) {
return hash;
}
hash = hash ^ fold(decode.codepoint);
hash = hash * 1099511628211ULL;
data += decode.units;
}
return hash;
}
auto operator()(const std::basic_string<char>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char8_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char8_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char16_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char16_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(const std::basic_string<char32_t>& in_key) const noexcept { // ASSUMES UTF-8
return hash(in_key.data(), in_key.data() + in_key.size());
}
auto operator()(std::basic_string_view<char32_t> in_key) const noexcept {
return hash(in_key.data(), in_key.data() + in_key.size());
}
};
/**
 * Transparent case-insensitive equality functor for text keys; accepts any mix of std::basic_string and
 * std::basic_string_view operands, which may use different character types.
 */
struct text_equali {
	using is_transparent = std::true_type;

	// view / view: the comparison every other overload funnels into
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(std::basic_string_view<LhsCharT> in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
		return equalsi<LhsCharT, RhsCharT>(in_lhs, in_rhs);
	}

	// view / string
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(std::basic_string_view<LhsCharT> in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
		return (*this)(in_lhs, static_cast<std::basic_string_view<RhsCharT>>(in_rhs));
	}

	// string / view
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(const std::basic_string<LhsCharT>& in_lhs, std::basic_string_view<RhsCharT> in_rhs) const noexcept {
		return (*this)(static_cast<std::basic_string_view<LhsCharT>>(in_lhs), in_rhs);
	}

	// string / string
	template<typename LhsCharT, typename RhsCharT>
	bool operator()(const std::basic_string<LhsCharT>& in_lhs, const std::basic_string<RhsCharT>& in_rhs) const noexcept {
		return (*this)(static_cast<std::basic_string_view<LhsCharT>>(in_lhs),
			static_cast<std::basic_string_view<RhsCharT>>(in_rhs));
	}
};
} // namespace jessilib

51
src/include/jessilib/unicode_sequence.hpp

@ -237,15 +237,10 @@ constexpr shrink_sequence_tree_member<CharT> make_octal_sequence_pair() {
}; };
} }
template<typename CharT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode> template<typename CharT, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode>
constexpr shrink_sequence_tree_member<CharT> make_hex_sequence_pair() { constexpr bool hex_shrink_sequence_action(CharT*& in_write_head, std::basic_string_view<CharT>& read_view) {
static_assert(MaxDigitsV > 0);
return {
InCodepointV,
[](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) constexpr {
// Does not modify // Does not modify
auto read_hex = [](uint32_t& out_value, std::basic_string_view<CharT> in_view, size_t max_digits) { auto read_hex = [](uint32_t& out_value, std::basic_string_view<CharT> in_view, size_t max_digits) constexpr {
size_t result{}; size_t result{};
int hex_value; int hex_value;
out_value = 0; out_value = 0;
@ -299,6 +294,14 @@ constexpr shrink_sequence_tree_member<CharT> make_hex_sequence_pair() {
return true; return true;
} }
template<typename CharT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode>
constexpr shrink_sequence_tree_member<CharT> make_hex_sequence_pair() {
static_assert(MaxDigitsV > 0);
return {
InCodepointV,
hex_shrink_sequence_action<CharT, MaxDigitsV, ExactDigitsV, IsUnicode>
}; };
} }
@ -394,36 +397,4 @@ constexpr bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string
return apply_shrink_sequence_tree<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(inout_string); return apply_shrink_sequence_tree<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(inout_string);
} }
/**
* Query string escape sequence parser
*
* Root shrink-sequence tree for HTTP query strings, keyed by codepoint:
*  - '%' is handled by make_hex_sequence_pair with MaxDigitsV = 2, ExactDigitsV = true, IsUnicode = false
*  - '+' is handled by make_simple_sequence_pair with ' ' as its replacement argument
* Entries are listed in ascending codepoint order; the static_asserts below verify this for char and char8_t.
*/
template<typename CharT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr> // make_hex_sequence_pair isn't going to play well with other types
static constexpr shrink_sequence_tree<CharT> http_query_escapes_root_tree{
make_hex_sequence_pair<CharT, U'%', 2, true, false>(),
make_simple_sequence_pair<CharT, U'+', ' '>()
};
// Compile-time check that the tree is sorted for both supported single-byte character types
static_assert(is_sorted<char, http_query_escapes_root_tree<char>, std::size(http_query_escapes_root_tree<char>)>(), "Tree must be pre-sorted");
static_assert(is_sorted<char8_t, http_query_escapes_root_tree<char8_t>, std::size(http_query_escapes_root_tree<char8_t>)>(), "Tree must be pre-sorted");
/**
* Decodes the HTTP query string escape sequences in a string, in place
*
* Applies http_query_escapes_root_tree to inout_string via apply_shrink_sequence_tree. Only available for
* single-byte character types (sizeof(CharT) == 1).
*
* @param inout_string String to decode in place
* @return Result of apply_shrink_sequence_tree (presumably true on success, false on a malformed sequence -- confirm
* against apply_shrink_sequence_tree)
*/
template<typename CharT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
constexpr bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, std::size(http_query_escapes_root_tree<CharT>)>(inout_string);
}
// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement
// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in
// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence)
/*template<typename CharT,
std::enable_if_t<sizeof(CharT) != 1>* = nullptr>
bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
//TODO: optimize this?
std::basic_string<char8_t> u8query_string = string_cast<char8_t>(inout_string);
bool result = deserialize_http_query<char8_t>(u8query_string);
inout_string = string_cast<CharT>(u8query_string);
return result;
}*/
} // namespace jessilib } // namespace jessilib

139
src/include/jessilib/unicode_syntax.hpp

@ -0,0 +1,139 @@
/**
* Copyright (C) 2021 Jessica James.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Written by Jessica James <jessica.aj@outlook.com>
*/
/**
* @file unicode_syntax.hpp
* @author Jessica James
*
* Unicode-aware syntax tree parsing utilities
*/
#pragma once
#include "unicode_base.hpp"
namespace jessilib {
/**
* Syntax tree; move this to another file later
*/
// Handler invoked for a codepoint that has an entry in a tree; apply_syntax_tree and make_tree_pair remove the
// matched codepoint from the read view before calling it. Returns false to abort parsing.
template<typename CharT, typename ContextT>
using syntax_tree_action = bool(*)(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view);
// Handler invoked for a codepoint that has no entry in a tree; receives the decode result and the read view with the
// codepoint still at its front. Returns false to abort parsing.
template<typename CharT, typename ContextT>
using default_syntax_tree_action = bool(*)(get_endpoint_result in_codepoint, ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view);
// Array of (codepoint, action) pairs; must be sorted by codepoint since lookups use std::lower_bound
template<typename CharT, typename ContextT>
using syntax_tree = const std::pair<char32_t, syntax_tree_action<CharT, ContextT>>[];
// A single (codepoint, action) entry of a syntax_tree
template<typename CharT, typename ContextT>
using syntax_tree_member = const std::pair<char32_t, syntax_tree_action<CharT, ContextT>>;
/**
 * Ordering predicate for looking up a codepoint in a syntax tree with std::lower_bound
 *
 * @param in_lhs Tree entry to compare
 * @param in_rhs Codepoint being searched for
 * @return True if the entry's codepoint orders before in_rhs, false otherwise
 */
template<typename CharT, typename ContextT>
constexpr bool syntax_tree_member_compare(const syntax_tree_member<CharT, ContextT>& in_lhs, const char32_t in_rhs) {
	const char32_t entry_codepoint = in_lhs.first;
	return entry_codepoint < in_rhs;
}
/**
 * Verifies that a syntax tree's entries are in non-descending codepoint order (lessers on left)
 *
 * @tparam TreeBegin First entry of the tree
 * @tparam TreeSize Number of entries in the tree
 * @return True if no entry has a greater codepoint than the entry following it (trivially true for trees of
 * zero or one entries), false otherwise
 */
template<typename CharT, typename ContextT, const syntax_tree<CharT, ContextT> TreeBegin, size_t TreeSize>
constexpr bool is_sorted() {
	// Compare each adjacent pair of entries
	for (size_t index = 1; index < TreeSize; ++index) {
		if (TreeBegin[index - 1].first > TreeBegin[index].first) {
			return false;
		}
	}

	return true;
}
/**
* Default action which rejects any codepoint not present in the tree; ignores all of its arguments
*
* @return Always false, which causes the calling parser to report failure
*/
template<typename CharT, typename ContextT>
bool fail_action(get_endpoint_result, ContextT&, std::basic_string_view<CharT>&) {
return false;
}
/**
* Default action which skips over a codepoint not present in the tree, leaving the context untouched
*
* @param decode Decode result for the codepoint at the front of inout_read_view
* @param inout_read_view View to advance past the decoded codepoint
* @return Always true
*/
template<typename CharT, typename ContextT>
bool noop_action(get_endpoint_result decode, ContextT&, std::basic_string_view<CharT>& inout_read_view) {
inout_read_view.remove_prefix(decode.units);
return true;
}
/**
 * Builds a tree entry whose action dispatches the next codepoint into a nested sub-tree
 *
 * The generated action decodes the codepoint at the front of the read view and searches the sub-tree for it. On a
 * match, the codepoint is removed from the view and the matching entry's action is invoked; otherwise DefaultActionF
 * is invoked with the codepoint still at the front of the view.
 *
 * @tparam InCodepointV Codepoint which selects this entry in the parent tree
 * @tparam SubTreeBegin First entry of the sub-tree; must be sorted by codepoint, since it is searched with
 * std::lower_bound
 * @tparam SubTreeSize Number of entries in the sub-tree
 * @tparam DefaultActionF Action for codepoints with no entry in the sub-tree (fails by default)
 * @return Tree entry pairing InCodepointV with the dispatching action; the action returns false if the next
 * codepoint cannot be decoded, otherwise the result of whichever action it dispatched to
 */
template<typename CharT, typename ContextT, char32_t InCodepointV, const syntax_tree<CharT, ContextT> SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action<CharT, ContextT> DefaultActionF = fail_action<CharT, ContextT>>
constexpr syntax_tree_member<CharT, ContextT> make_tree_pair() {
	return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) constexpr {
		auto decode = decode_codepoint(inout_read_view);
		if (decode.units == 0) {
			// Nothing left to decode, or bad unicode sequence
			return false;
		}

		constexpr syntax_tree_member<CharT, ContextT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
		// Both template arguments must be given explicitly: a partially-specified function template names an
		// overload set, from which std::lower_bound cannot deduce its comparator type
		auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare<CharT, ContextT>);
		if (parser == SubTreeEnd || parser->first != decode.codepoint) {
			// No entry for this codepoint; defer to the default handler
			return DefaultActionF(decode, inout_context, inout_read_view);
		}

		// This is a parsed sequence; pass it to the parser
		inout_read_view.remove_prefix(decode.units);
		return (parser->second)(inout_context, inout_read_view);
	} };
}
/**
 * Drives a syntax tree over a string view, one codepoint at a time
 *
 * Each decoded codepoint is looked up in the tree. Codepoints with an entry are removed from the view and handed to
 * that entry's action; all others are handed to DefaultActionF with the codepoint still at the front of the view.
 * Parsing ends when decode_codepoint reports zero units.
 *
 * @tparam SequenceTreeBegin First entry of the tree; must be sorted by codepoint, since it is searched with
 * std::lower_bound
 * @tparam SequenceTreeSize Number of entries in the tree
 * @tparam DefaultActionF Action for codepoints with no entry in the tree (skips the codepoint by default)
 * @param inout_context Caller-defined state passed through to every action
 * @param inout_read_view View to parse; advanced as input is consumed
 * @return False as soon as any action returns false, true otherwise
 */
template<typename CharT, typename ContextT, const syntax_tree<CharT, ContextT> SequenceTreeBegin, size_t SequenceTreeSize,
	default_syntax_tree_action<CharT, ContextT> DefaultActionF = noop_action<CharT, ContextT>>
constexpr bool apply_syntax_tree(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
	if (inout_read_view.empty()) {
		// Nothing to parse
		return true;
	}

	constexpr auto TreeEnd = SequenceTreeBegin + SequenceTreeSize;
	for (get_endpoint_result front = decode_codepoint(inout_read_view);
		front.units != 0;
		front = decode_codepoint(inout_read_view)) {
		const auto match = std::lower_bound(SequenceTreeBegin, TreeEnd, front.codepoint, &syntax_tree_member_compare<CharT, ContextT>);
		const bool has_entry = match != TreeEnd && match->first == front.codepoint;

		if (!has_entry) {
			// Ordinary character; the default handler decides what to do with it
			if (!DefaultActionF(front, inout_context, inout_read_view)) {
				return false;
			}
			continue;
		}

		// Codepoint has a dedicated handler; consume it and hand over the remainder of the view
		inout_read_view.remove_prefix(front.units);
		if (!(match->second)(inout_context, inout_read_view)) {
			// Handler rejected the input; give up
			return false;
		}
	}

	// No action reported failure
	return true;
}
} // namespace jessilib

2
src/test/CMakeLists.txt

@ -1,6 +1,6 @@
# Setup source files # Setup source files
set(SOURCE_FILES set(SOURCE_FILES
timer.cpp thread_pool.cpp util.cpp object.cpp parser.cpp config.cpp parsers/json.cpp unicode.cpp app_parameters.cpp io/color.cpp duration.cpp split.cpp split_compilation.cpp word_split.cpp unicode_sequence.cpp) timer.cpp thread_pool.cpp util.cpp object.cpp parser.cpp config.cpp parsers/json.cpp unicode.cpp app_parameters.cpp io/color.cpp duration.cpp split.cpp split_compilation.cpp word_split.cpp unicode_sequence.cpp http_query.cpp)
# Setup gtest # Setup gtest
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

238
src/test/http_query.cpp

@ -0,0 +1,238 @@
/**
* Copyright (C) 2021 Jessica James.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Written by Jessica James <jessica.aj@outlook.com>
*/
#include "jessilib/http_query.hpp"
#include <charconv>
#include "test.hpp"
using namespace std::literals;
// Compile-time tests for constexpr on compilers which support C++20 constexpr std::string
#ifdef __cpp_lib_constexpr_string
// Copies in_expression into a std::string, decodes it in place with deserialize_http_query, and returns the result;
// the bool returned by deserialize_http_query is ignored
constexpr std::string query_constexpr(std::string_view in_expression) {
std::string result{ in_expression };
jessilib::deserialize_http_query(result);
return result;
}
// Plain text is unchanged; '+' and "%20" both decode to a space
static_assert(query_constexpr("test"s) == "test"s);
static_assert(query_constexpr("first+second"s) == "first second"s);
static_assert(query_constexpr("first%20second"s) == "first second"s);
#endif // __cpp_lib_constexpr_string
// Character type lists for typed tests; the query tests below are instantiated with the single-byte list only
using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>;
using utf8_char_types = ::testing::Types<char, char8_t>;
// Empty fixture; exists so the tests can be parameterized on the character type (TypeParam)
template<typename T>
class QuerySequenceTest : public ::testing::Test {
public:
};
TYPED_TEST_SUITE(QuerySequenceTest, utf8_char_types);
// Exclusive upper bound for codepoint loops in these tests
constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing
// Round-trip test: for each codepoint, percent-encode every unit of its encoding as "%XX" and verify that
// deserialize_http_query reproduces the original encoded string
TYPED_TEST(QuerySequenceTest, single_chars) {
// [U+0000, U+100FF)
for (char32_t codepoint = 0; codepoint < MAX_LOOP_CODEPOINT; ++codepoint) {
std::basic_string<TypeParam> expected;
size_t units = jessilib::encode_codepoint(expected, codepoint);
EXPECT_NE(units, 0);
EXPECT_EQ(units, expected.size());
// Construct the query string
std::basic_string<TypeParam> query_string;
for (auto& unit : expected) {
// '%' followed by two hex digits, filled in by std::to_chars below
char encoded[3] { '%', 0, 0 };
char* encoded_end = encoded + sizeof(encoded);
// Cast to unsigned char so that units with the high bit set are not formatted as negative numbers
auto to_chars_result = std::to_chars(encoded + 1, encoded_end, static_cast<unsigned char>(unit), 16);
ASSERT_EQ(to_chars_result.ec, std::errc{}) // assertion will fail when `unit` is signed type
<< "For unit " << static_cast<int>(unit) << " in codepoint " << static_cast<int>(codepoint) << std::endl;
if (to_chars_result.ptr != encoded_end) {
// Only wrote one hex; shift it
encoded[2] = encoded[1];
encoded[1] = '0';
}
EXPECT_EQ(encoded[0], '%');
EXPECT_NE(encoded[1], 0);
EXPECT_NE(encoded[2], 0);
query_string.insert(query_string.end(), encoded, encoded_end);
}
// Every unit of the expected string became a three-character escape
EXPECT_EQ(query_string.size(), expected.size() * 3);
// Decode & check the query string
jessilib::deserialize_http_query(query_string);
EXPECT_EQ(query_string, expected);
}
}
/**
 * Feeds '%' followed by a single arbitrary unit (too short for a two-digit escape) to the decoder.
 * The test expects deserialize_http_query to leave the string empty for every such input, both one at a
 * time and when all of them are concatenated into one long string.
 */
TYPED_TEST(QuerySequenceTest, invalids) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFF; ++unit) {
		const TypeParam encoded[2] { '%', static_cast<TypeParam>(unit) };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Feeds '%' followed by every two-unit combination in which at least one unit is not a base-16 digit
 * (per jessilib::as_base). The test expects deserialize_http_query to leave the string empty for each of
 * them, individually and concatenated.
 */
TYPED_TEST(QuerySequenceTest, invalids_2len) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFFFF; ++unit) {
		// Split the counter into the two units; which half goes first doesn't matter, all pairs get covered
		const TypeParam first = static_cast<TypeParam>(unit >> 8);
		const TypeParam second = static_cast<TypeParam>(unit & 0xFF);

		// Two hex digits would form a valid escape; those are not what this test is about
		if (jessilib::as_base(first, 16) >= 0
			&& jessilib::as_base(second, 16) >= 0) {
			continue;
		}

		const TypeParam encoded[3] { '%', first, second };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Same idea as the single-unit invalid test, but with a second '%' directly after the unit, so the input
 * both contains and ends in an incomplete escape. The test expects deserialize_http_query to leave the
 * string empty for every such input, individually and concatenated.
 */
TYPED_TEST(QuerySequenceTest, invalids_trailing) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFF; ++unit) {
		const TypeParam encoded[3] { '%', static_cast<TypeParam>(unit), '%' };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Feeds '%' + two units (at least one of which is not a base-16 digit per jessilib::as_base) + a trailing
 * '%'. The test expects deserialize_http_query to leave the string empty for each such input, individually
 * and concatenated.
 */
TYPED_TEST(QuerySequenceTest, invalids_2len_trailing) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFFFF; ++unit) {
		// Split the counter into the two units; which half goes first doesn't matter, all pairs get covered
		const TypeParam first = static_cast<TypeParam>(unit >> 8);
		const TypeParam second = static_cast<TypeParam>(unit & 0xFF);

		// Two hex digits would form a valid escape; those are not what this test is about
		if (jessilib::as_base(first, 16) >= 0
			&& jessilib::as_base(second, 16) >= 0) {
			continue;
		}

		const TypeParam encoded[4] { '%', first, second, '%' };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/** Empty input: parsing is expected to succeed, produce no pairs, and leave the text empty */
TEST(HtmlFormParser, empty) {
	using form_pair = std::pair<std::string_view, std::string_view>;

	std::string form_text{};
	std::vector<form_pair> pairs{};

	const bool parsed = jessilib::deserialize_html_form(pairs, form_text);
	EXPECT_TRUE(parsed);

	EXPECT_TRUE(form_text.empty());
	EXPECT_TRUE(pairs.empty());
}
/** A lone key without '=' is expected to parse as one pair with an empty value, leaving the text unchanged */
TEST(HtmlFormParser, one_key) {
	std::vector<std::pair<std::string_view, std::string_view>> parsed_result;
	std::string query_text = "key";
	EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text));
	EXPECT_EQ(query_text, "key");

	// Fatal on mismatch: the element accesses below would otherwise read out of bounds
	ASSERT_EQ(parsed_result.size(), 1U);
	EXPECT_EQ(parsed_result[0].first, query_text);
	EXPECT_TRUE(parsed_result[0].second.empty());
}
/**
 * "key=value" is expected to parse as a single pair. Only the start of the text buffer is checked afterwards:
 * it is expected to begin with the key and value back-to-back; anything past that prefix is not inspected.
 */
TEST(HtmlFormParser, one_key_and_value) {
	std::vector<std::pair<std::string_view, std::string_view>> parsed_result;
	std::string query_text = "key=value";
	EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text));
	EXPECT_TRUE(query_text.starts_with("keyvalue"));

	// Fatal on mismatch: the element accesses below would otherwise read out of bounds
	ASSERT_EQ(parsed_result.size(), 1U);
	EXPECT_EQ(parsed_result[0].first, "key");
	EXPECT_EQ(parsed_result[0].second, "value");
}
/** A trailing '&' is expected to yield an additional pair whose key and value are both empty */
TEST(HtmlFormParser, one_key_and_value_trailing) {
	std::vector<std::pair<std::string_view, std::string_view>> parsed_result;
	std::string query_text = "key=value&";
	EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text));
	EXPECT_TRUE(query_text.starts_with("keyvalue"));

	// Fatal on mismatch: the element accesses below would otherwise read out of bounds
	ASSERT_EQ(parsed_result.size(), 2U);
	EXPECT_EQ(parsed_result[0].first, "key");
	EXPECT_EQ(parsed_result[0].second, "value");
	EXPECT_TRUE(parsed_result[1].first.empty());
	EXPECT_TRUE(parsed_result[1].second.empty());
}
/** A key=value pair followed by a bare key: the second pair is expected to carry an empty value */
TEST(HtmlFormParser, two_key_one_value) {
	std::vector<std::pair<std::string_view, std::string_view>> parsed_result;
	std::string query_text = "key=value&second_key";
	EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text));
	EXPECT_TRUE(query_text.starts_with("keyvaluesecond_key"));

	// Fatal on mismatch: the element accesses below would otherwise read out of bounds
	ASSERT_EQ(parsed_result.size(), 2U);
	EXPECT_EQ(parsed_result[0].first, "key");
	EXPECT_EQ(parsed_result[0].second, "value");
	EXPECT_EQ(parsed_result[1].first, "second_key");
	EXPECT_TRUE(parsed_result[1].second.empty());
}
/**
 * Two key=value pairs, where the second value itself contains '='. Only the first '=' of a pair is expected
 * to separate key from value; the later one is expected to remain part of the value.
 */
TEST(HtmlFormParser, two_key_two_value) {
	std::vector<std::pair<std::string_view, std::string_view>> parsed_result;
	std::string query_text = "key=value&second_key=second=value";
	EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text));
	EXPECT_TRUE(query_text.starts_with("keyvaluesecond_keysecond=value"));

	// Fatal on mismatch: the element accesses below would otherwise read out of bounds
	ASSERT_EQ(parsed_result.size(), 2U);
	EXPECT_EQ(parsed_result[0].first, "key");
	EXPECT_EQ(parsed_result[0].second, "value");
	EXPECT_EQ(parsed_result[1].first, "second_key");
	EXPECT_EQ(parsed_result[1].second, "second=value");
}
/**
 * Keys and values containing escapes: '+' and "%20" are expected to decode to a space, and "%73", "%5F",
 * "%79", "%65" to 's', '_', 'y', 'e' respectively, in both keys and values.
 */
TEST(HtmlFormParser, some_sequences) {
	std::vector<std::pair<std::string_view, std::string_view>> parsed_result;
	std::string query_text = "k+y=va+u%20&%73econd%5Fke%79=second_valu%65";
	EXPECT_TRUE(jessilib::deserialize_html_form(parsed_result, query_text));
	EXPECT_TRUE(query_text.starts_with("k yva u second_keysecond_value"));

	// Fatal on mismatch: the element accesses below would otherwise read out of bounds
	ASSERT_EQ(parsed_result.size(), 2U);
	EXPECT_EQ(parsed_result[0].first, "k y");
	EXPECT_EQ(parsed_result[0].second, "va u ");
	EXPECT_EQ(parsed_result[1].first, "second_key");
	EXPECT_EQ(parsed_result[1].second, "second_value");
}

6
src/test/unicode.cpp

@ -25,6 +25,12 @@
using namespace jessilib; using namespace jessilib;
using namespace std::literals; using namespace std::literals;
static_assert(codepoint_info<U'\n'>::utf8_length == 1);
static_assert(codepoint_info<U'\n'>::utf16_length == 1);
static_assert(codepoint_info<U'\n'>::utf32_length == 1);
static_assert(codepoint_info<U'\n'>::wchar_length == 1);
static_assert(codepoint_info<U'\n'>::encode_length<char8_t> == 1);
/** encode_codepoint */ /** encode_codepoint */
TEST(UTF8Test, encode_codepoint) { TEST(UTF8Test, encode_codepoint) {

137
src/test/unicode_sequence.cpp

@ -21,7 +21,7 @@
#include "jessilib/unicode.hpp" // string_cast #include "jessilib/unicode.hpp" // string_cast
#include "test.hpp" #include "test.hpp"
using namespace std; using namespace std::literals;
// Compile-time tests for constexpr on compilers which support C++20 constexpr std::string // Compile-time tests for constexpr on compilers which support C++20 constexpr std::string
#ifdef __cpp_lib_constexpr_string #ifdef __cpp_lib_constexpr_string
@ -30,17 +30,8 @@ constexpr std::string cpp_constexpr(std::string_view in_expression) {
jessilib::apply_cpp_escape_sequences(result); jessilib::apply_cpp_escape_sequences(result);
return result; return result;
} }
constexpr std::string query_constexpr(std::string_view in_expression) {
std::string result{ in_expression };
jessilib::deserialize_http_query(result);
return result;
}
static_assert(cpp_constexpr("test"s) == "test"s); static_assert(cpp_constexpr("test"s) == "test"s);
static_assert(cpp_constexpr("\\r\\n"s) == "\r\n"s); static_assert(cpp_constexpr("\\r\\n"s) == "\r\n"s);
static_assert(query_constexpr("test"s) == "test"s);
static_assert(query_constexpr("first+second"s) == "first second"s);
static_assert(query_constexpr("first%20second"s) == "first second"s);
#endif // __cpp_lib_constexpr_string #endif // __cpp_lib_constexpr_string
using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>; using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>;
@ -57,12 +48,6 @@ public:
}; };
TYPED_TEST_SUITE(UnicodeSequenceTest, char_types); TYPED_TEST_SUITE(UnicodeSequenceTest, char_types);
/** Stateless fixture; it exists so the query tests can be instantiated once per type in utf8_char_types */
template<typename CharT>
struct UnicodeUTF8SequenceTest : ::testing::Test {};
TYPED_TEST_SUITE(UnicodeUTF8SequenceTest, utf8_char_types);
constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing
#define TEST_CPP_SEQUENCE(expr) \ #define TEST_CPP_SEQUENCE(expr) \
@ -212,123 +197,3 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) {
EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint)); EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint));
} }
} }
/**
* Query strings
*/
/**
 * Round-trip check: for every codepoint in [U+0000, MAX_LOOP_CODEPOINT), each encoded unit is written as a
 * "%xx" escape, and the resulting query string is expected to decode back to the plain encoded codepoint.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, single_chars) {
	for (char32_t codepoint = 0; codepoint < MAX_LOOP_CODEPOINT; ++codepoint) {
		std::basic_string<TypeParam> expected;
		const size_t units = jessilib::encode_codepoint(expected, codepoint);
		EXPECT_NE(units, 0U); // unsigned literal: keeps the comparison inside the macro unsigned on both sides
		EXPECT_EQ(units, expected.size());

		// Construct the query string, one "%xx" escape per encoded unit
		std::basic_string<TypeParam> query_string;
		for (const auto unit : expected) {
			char encoded[3] { '%', 0, 0 };
			char* const encoded_end = std::end(encoded);

			// The unit is converted to unsigned char first so that a negative `char` is formatted as its byte
			// value; a negative signed value would get a '-' prefix from to_chars and may not fit in two characters
			const auto to_chars_result = std::to_chars(encoded + 1, encoded_end, static_cast<unsigned char>(unit), 16);
			ASSERT_EQ(to_chars_result.ec, std::errc{})
				<< "For unit " << static_cast<int>(unit) << " in codepoint " << static_cast<int>(codepoint);

			if (to_chars_result.ptr != encoded_end) {
				// Only one hex digit was written; shift it right and zero-pad
				encoded[2] = encoded[1];
				encoded[1] = '0';
			}

			EXPECT_EQ(encoded[0], '%');
			EXPECT_NE(encoded[1], 0);
			EXPECT_NE(encoded[2], 0);
			query_string.insert(query_string.end(), std::begin(encoded), encoded_end);
		}
		EXPECT_EQ(query_string.size(), expected.size() * 3);

		// Decode & check the query string
		jessilib::deserialize_http_query(query_string);
		EXPECT_EQ(query_string, expected);
	}
}
/**
 * Feeds '%' followed by a single arbitrary unit (too short for a two-digit escape) to the decoder.
 * The test expects deserialize_http_query to leave the string empty for every such input, both one at a
 * time and when all of them are concatenated into one long string.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFF; ++unit) {
		const TypeParam encoded[2] { '%', static_cast<TypeParam>(unit) };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Feeds '%' followed by every two-unit combination in which at least one unit is not a base-16 digit
 * (per jessilib::as_base). The test expects deserialize_http_query to leave the string empty for each of
 * them, individually and concatenated.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFFFF; ++unit) {
		// Split the counter into the two units; which half goes first doesn't matter, all pairs get covered
		const TypeParam first = static_cast<TypeParam>(unit >> 8);
		const TypeParam second = static_cast<TypeParam>(unit & 0xFF);

		// Two hex digits would form a valid escape; those are not what this test is about
		if (jessilib::as_base(first, 16) >= 0
			&& jessilib::as_base(second, 16) >= 0) {
			continue;
		}

		const TypeParam encoded[3] { '%', first, second };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Same idea as the single-unit invalid test, but with a second '%' directly after the unit, so the input
 * both contains and ends in an incomplete escape. The test expects deserialize_http_query to leave the
 * string empty for every such input, individually and concatenated.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids_trailing) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFF; ++unit) {
		const TypeParam encoded[3] { '%', static_cast<TypeParam>(unit), '%' };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Feeds '%' + two units (at least one of which is not a base-16 digit per jessilib::as_base) + a trailing
 * '%'. The test expects deserialize_http_query to leave the string empty for each such input, individually
 * and concatenated.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len_trailing) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFFFF; ++unit) {
		// Split the counter into the two units; which half goes first doesn't matter, all pairs get covered
		const TypeParam first = static_cast<TypeParam>(unit >> 8);
		const TypeParam second = static_cast<TypeParam>(unit & 0xFF);

		// Two hex digits would form a valid escape; those are not what this test is about
		if (jessilib::as_base(first, 16) >= 0
			&& jessilib::as_base(second, 16) >= 0) {
			continue;
		}

		const TypeParam encoded[4] { '%', first, second, '%' };

		// Element-based bounds; `encoded + sizeof(encoded)` would only be right while sizeof(TypeParam) == 1
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));

		// query_string is expected to be empty again after each decode, so this appends just the current sequence
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}

Loading…
Cancel
Save