From a232d33d8ac54da009d557e441052e6b1b61a6ae Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sat, 4 Dec 2021 21:10:10 -0600 Subject: [PATCH] Split core unicode methods off to unicode_base; made same methods constexpr except for fold & allocating encode; apply_cpp_escape_sequences & deserialize_http_query now constexpr --- src/common/unicode.cpp | 278 +------------ src/include/jessilib/unicode.hpp | 187 +++------ src/include/jessilib/unicode_base.hpp | 484 ++++++++++++++++++++++ src/include/jessilib/unicode_sequence.hpp | 17 +- src/test/unicode_sequence.cpp | 27 +- 5 files changed, 571 insertions(+), 422 deletions(-) create mode 100644 src/include/jessilib/unicode_base.hpp diff --git a/src/common/unicode.cpp b/src/common/unicode.cpp index a3abc32..a4031ad 100644 --- a/src/common/unicode.cpp +++ b/src/common/unicode.cpp @@ -16,146 +16,10 @@ * Written by Jessica James */ -#include "unicode.hpp" +#include "unicode_base.hpp" namespace jessilib { -/** encode_codepoint */ - -template -void append_helper(std::basic_string& out_string, T in_value) { - out_string += in_value; -} - -template -void append_helper(std::basic_ostream& out_string, T in_value) { - out_string << in_value; -} - -template -void append_helper(T*& out_string, T in_value) { - *out_string = in_value; - ++out_string; -} - -template -size_t encode_codepoint_utf8(T& out_destination, char32_t in_codepoint) { - if (in_codepoint > 0x10FFFF) { - return 0; - } - - if (in_codepoint <= 0x007F) { - // 1-byte sequence (7 bits) - append_helper(out_destination, static_cast(in_codepoint)); - return 1; - } - - if (in_codepoint <= 0x07FF) { - // 2-byte sequence (11 bits; 5 + 6) - append_helper(out_destination, static_cast(0xC0 | ((in_codepoint >> 6) & 0x1F))); - append_helper(out_destination, static_cast(0x80 | (in_codepoint & 0x3F))); - return 2; - } - - if (in_codepoint <= 0xFFFF) { - // 3-byte sequence (16 bits; 4 + 6 + 6) - append_helper(out_destination, static_cast(0xE0 | ((in_codepoint >> 12) & 0x0F))); - 
append_helper(out_destination, static_cast(0x80 | ((in_codepoint >> 6) & 0x3F))); - append_helper(out_destination, static_cast(0x80 | (in_codepoint & 0x3F))); - return 3; - } - - // 4-byte sequence (21 bits; 3 + 6 + 6 + 6) - append_helper(out_destination, static_cast(0xF0 | ((in_codepoint >> 18) & 0x07))); - append_helper(out_destination, static_cast(0x80 | ((in_codepoint >> 12) & 0x3F))); - append_helper(out_destination, static_cast(0x80 | ((in_codepoint >> 6) & 0x3F))); - append_helper(out_destination, static_cast(0x80 | (in_codepoint & 0x3F))); - return 4; -} - -template -size_t encode_codepoint_utf16(T& out_destination, char32_t in_codepoint) { - if (in_codepoint > 0x10FFFF) { - return 0; - } - - if (in_codepoint <= 0xFFFF) { - // 1-unit sequence - append_helper(out_destination, static_cast(in_codepoint)); - return 1; - } - - // 2-unit sequence - in_codepoint -= 0x10000; - append_helper(out_destination, static_cast((in_codepoint >> 10) + 0xD800)); - append_helper(out_destination, static_cast((in_codepoint & 0x03FF) + 0xDC00)); - return 2; -} - -template -size_t encode_codepoint_utf32(T& out_destination, char32_t in_codepoint) { - if (in_codepoint > 0x10FFFF) { - return 0; - } - - append_helper(out_destination, in_codepoint); - return 1; -} - -/** Strings */ - -size_t encode_codepoint(std::string& out_string, char32_t in_codepoint) { - return encode_codepoint_utf8(out_string, in_codepoint); -} - -size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint) { - return encode_codepoint_utf8(out_string, in_codepoint); -} - -size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint) { - return encode_codepoint_utf16(out_string, in_codepoint); -} - -size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint) { - return encode_codepoint_utf32(out_string, in_codepoint); -} - -/** Streams */ - -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint) { - return encode_codepoint_utf8, char>(out_stream, 
in_codepoint); -} - -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint) { - return encode_codepoint_utf8, char8_t>(out_stream, in_codepoint); -} - -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint) { - return encode_codepoint_utf16(out_stream, in_codepoint); -} - -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint) { - return encode_codepoint_utf32(out_stream, in_codepoint); -} - -/** Pointers */ - -size_t encode_codepoint(char* out_buffer, char32_t in_codepoint) { - return encode_codepoint_utf8(out_buffer, in_codepoint); -} - -size_t encode_codepoint(char8_t* out_buffer, char32_t in_codepoint) { - return encode_codepoint_utf8(out_buffer, in_codepoint); -} - -size_t encode_codepoint(char16_t* out_buffer, char32_t in_codepoint) { - return encode_codepoint_utf16(out_buffer, in_codepoint); -} - -size_t encode_codepoint(char32_t* out_buffer, char32_t in_codepoint) { - return encode_codepoint_utf32(out_buffer, in_codepoint); -} - /** Allocating */ std::u8string encode_codepoint_u8(char32_t in_codepoint) { @@ -176,126 +40,12 @@ std::u32string encode_codepoint_u32(char32_t in_codepoint) { return result; } -/** decode_codepoint */ - -get_endpoint_result decode_codepoint(const std::string_view& in_string) { - return decode_codepoint(std::u8string_view{ reinterpret_cast(in_string.data()), in_string.size() }); -} - -get_endpoint_result decode_codepoint(const std::u8string_view& in_string) { - get_endpoint_result result{ 0, 0 }; - - if (in_string.empty()) { - return result; - } - - if ((in_string.front() & 0x80) != 0) { // UTF-8 sequence{ - // Validity check - if (in_string.size() < 2 - || (in_string.front() & 0x40) == 0) { - // This is an invalid 1 byte sequence - return result; - } - - // get codepoint value - if ((in_string.front() & 0x20) != 0) { - // This is a 3+ byte sequence - if (in_string.size() < 3) { - // Invalid sequence; too few characters available - return result; - } - - if 
((in_string.front() & 0x10) != 0) { - // This is a 4 byte sequence - if (in_string.size() < 4) { - // Invalid sequence; too few characters available - return result; - } - - result.codepoint = static_cast(in_string[0] & 0x0F) << 18; - result.codepoint += static_cast(in_string[1] & 0x3F) << 12; - result.codepoint += static_cast(in_string[2] & 0x3F) << 6; - result.codepoint += static_cast(in_string[3] & 0x3F); - result.units = 4; - return result; - } - - // this is a 3 byte sequence - result.codepoint = static_cast(in_string[0] & 0x0F) << 12; - result.codepoint += static_cast(in_string[1] & 0x3F) << 6; - result.codepoint += static_cast(in_string[2] & 0x3F); - result.units = 3; - return result; - } - - // This is a 2 byte sequence - result.codepoint = static_cast(in_string[0] & 0x1F) << 6; - result.codepoint += static_cast(in_string[1] & 0x3F); - result.units = 2; - return result; - } - - // This is a valid 1 byte sequence - result.codepoint = static_cast(in_string.front()); - result.units = 1; - +std::wstring encode_codepoint_w(char32_t in_codepoint) { + std::wstring result; + encode_codepoint(result, in_codepoint); return result; } -get_endpoint_result decode_codepoint(const std::u16string_view& in_string) { - if (in_string.empty()) { - return { 0, 0 }; - } - - if (is_high_surrogate(in_string.front()) // If this is a high surrogate codepoint... - && in_string.size() > 1 // And a codepoint follows this surrogate.. - && is_low_surrogate(in_string[1])) { // And that codepoint is a low surrogate... 
- // We have a valid surrogate pair; decode it into a codepoint and return - char32_t codepoint { static_cast( - ((in_string.front() - 0xD800U) * 0x400U) // high surrogate magic - + (in_string[1] - 0xDC00U) // low surrogate magic - + 0x10000ULL // more magic - ) }; - - return { codepoint, 2 }; - } - - // Codepoint is a single char16_t; return codepoint directly - return { in_string.front(), 1 }; -} - -get_endpoint_result decode_codepoint(const std::u32string_view& in_string) { - if (in_string.empty()) { - return { 0, 0 }; - } - - return { in_string.front(), 1 }; -} - -bool is_high_surrogate(char32_t in_codepoint) { - return in_codepoint >= 0xD800 && in_codepoint <= 0xDBFF; -} - -bool is_low_surrogate(char32_t in_codepoint) { - return in_codepoint >= 0xDC00 && in_codepoint <= 0xDFFF; -} - -get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate) { - if (is_high_surrogate(in_high_surrogate) - && is_low_surrogate((in_low_surrogate))) { - // We have a valid surrogate pair; decode it into a codepoint and return - char32_t codepoint { static_cast( - ((in_high_surrogate - 0xD800U) * 0x400U) // high surrogate magic - + (in_low_surrogate - 0xDC00U) // low surrogate magic - + 0x10000ULL // more magic - ) }; - - return { codepoint, 2 }; - } - - return { 0, 0 }; -} - /** * Codepoint folding (case-insensitive character comparisons) */ @@ -549,24 +299,4 @@ char32_t fold(char32_t in_codepoint) { return match->fold(in_codepoint); } -static constexpr unsigned char base_table[]{ - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127, - 127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, - 127, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, -}; - -int as_base(char32_t in_character, unsigned int base) { - if (in_character >= sizeof(base_table)) { - return -1; - } - - unsigned int result = base_table[in_character]; - if (result >= base) { - return -1; - } - - return base_table[in_character]; -} - } // namespace jessilib diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index f7e976c..3ab4818 100644 --- a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -21,109 +21,10 @@ #include #include #include +#include "unicode_base.hpp" namespace jessilib { -/** encode_codepoint */ - -/** - * Encodes a codepoint, and appends it to an output string - * - * @param out_string String to append - * @param in_codepoint Codepoint to encode - * @return Number of data elements appended to out_string - */ -size_t encode_codepoint(std::string& out_string, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8 -size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint); -size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint); -size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint); - -/** - * Encodes a codepoint to an output stream - * - * @param out_stream Stream to write codepoint to - * @param in_codepoint Codepoint to encode - * @return Number of data elements appending to out_stream - */ -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8 -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint); -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint); -size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint); - -/** - * Encodes a codepoint directly to a character buffer - * Note: Do not use this without careful consideration; note the size requirements: - * 1) char8_t may write up to 4 
elements - * 2) char16_t may write up to 2 elements - * 3) char32_t may write up to 1 element - * 4) char may write up to 4 elements; provided solely for compatibility/ease of use - * - * @param out_buffer Character buffer to write to - * @param in_codepoint Codepoint to encode - * @return Number of data elements written to out_buffer - */ -size_t encode_codepoint(char* out_buffer, char32_t in_codepoint); -size_t encode_codepoint(char8_t* out_buffer, char32_t in_codepoint); -size_t encode_codepoint(char16_t* out_buffer, char32_t in_codepoint); -size_t encode_codepoint(char32_t* out_buffer, char32_t in_codepoint); - -/** - * Encodes a codepoint and returns it as a string - * - * @param in_codepoint Codepoint to encode - * @return A string containing the codepoint encoded to the appropriate underlying CharT type - */ -std::u8string encode_codepoint_u8(char32_t in_codepoint); -std::u16string encode_codepoint_u16(char32_t in_codepoint); -std::u32string encode_codepoint_u32(char32_t in_codepoint); - -/** decode_codepoint */ - -struct get_endpoint_result { - char32_t codepoint{}; // Codepoint - size_t units{}; // Number of data units codepoint was represented by, or 0 -}; - -/** - * Decodes the front codepoint in a string - * - * @param in_string String to decode a codepoint from - * @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise. 
- */ -get_endpoint_result decode_codepoint(const std::string_view& in_string); // DEPRECATED: ASSUMES UTF-8 -get_endpoint_result decode_codepoint(const std::u8string_view& in_string); // UTF-8 -get_endpoint_result decode_codepoint(const std::u16string_view& in_string); // UTF-16 -get_endpoint_result decode_codepoint(const std::u32string_view& in_string); // UTF-32 - -/** advance_codepoint */ - -template -char32_t advance_codepoint(std::basic_string_view& in_string) { - auto result = decode_codepoint(in_string); - in_string.remove_prefix(result.units); - return result.codepoint; -} - -/** next_codepoint */ - -template -std::basic_string_view next_codepoint(const std::basic_string_view& in_string) { - return in_string.substr(decode_codepoint(in_string).units); -} - -/** is_valid_codepoint */ - -template -bool is_valid_codepoint(const std::basic_string_view& in_string) { - return decode_codepoint(in_string).units != 0; -} - -/** utf-16 surrogate helpers */ - -bool is_high_surrogate(char32_t in_codepoint); -bool is_low_surrogate(char32_t in_codepoint); -get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate); - /** Utilities */ namespace impl_unicode { @@ -135,6 +36,8 @@ struct is_string : std::false_type {}; template struct is_string> { using type = T; + static constexpr bool is_fixed_array{ false }; + static constexpr bool is_container{ true }; static constexpr bool value{ true }; constexpr operator bool() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; } @@ -143,6 +46,8 @@ struct is_string> { template struct is_string> { using type = T; + static constexpr bool is_fixed_array{ false }; + static constexpr bool is_container{ true }; static constexpr bool value{ true }; constexpr operator bool() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; } @@ -151,6 +56,8 @@ struct is_string> { template struct is_string { using type = T; + static constexpr 
bool is_fixed_array{ false }; + static constexpr bool is_container{ false }; static constexpr bool value{ true }; constexpr operator bool() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; } @@ -159,6 +66,8 @@ struct is_string { template struct is_string { using type = T; + static constexpr bool is_fixed_array{ true }; + static constexpr bool is_container{ false }; static constexpr bool value{ true }; constexpr operator bool() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; } @@ -167,6 +76,8 @@ struct is_string { template struct is_string { using type = T; + static constexpr bool is_fixed_array{ true }; + static constexpr bool is_container{ false }; static constexpr bool value{ true }; constexpr operator bool() const noexcept { return true; } constexpr bool operator()() const noexcept { return true; } @@ -214,58 +125,60 @@ std::basic_string_view string_view_cast(const InT& in_string) { size_t out_string_units = in_string_bytes / sizeof(OutCharT); const OutCharT* data_begin = reinterpret_cast(in_string.data()); - std::basic_string_view result{ data_begin, out_string_units }; - - if (!is_valid(result)) { - // Result isn't valid; discard and return empty - return {}; - } - - return result; + return { data_begin, out_string_units }; } template std::basic_string string_cast(const InT& in_string) { static_assert(impl_unicode::is_string::value == true); using InCharT = typename impl_unicode::is_string::type; + using InEquivalentT = typename unicode_traits::equivalent_type; using InViewT = std::basic_string_view; - std::basic_string result; + using OutT = std::basic_string; - // Just do a dumb copy when same type & valid; should be slightly faster than re-encoding - if constexpr (std::is_same_v) { - if (is_valid(in_string)) { - result = in_string; - } - - return result; + if constexpr (std::is_same_v) { + // This does nothing at all; consider static_assert against this? 
+ return in_string; } - - InViewT in_string_view = static_cast(in_string); - if constexpr (sizeof(InCharT) <= sizeof(OutCharT)) { - // When copying to a larger type, we will need _at most_ as many elements as the smaller storage type - result.reserve(in_string_view.size()); + else if constexpr (std::is_same_v + || std::is_same_v) { + // Just do a dumb copy when same or equivalent char types; should be faster than re-encoding + if constexpr (impl_unicode::is_string::is_container) { + return { reinterpret_cast(in_string.data()), in_string.size() }; + } + else if constexpr (impl_unicode::is_string::is_fixed_array) { + return { reinterpret_cast(in_string), std::size(in_string) - 1 }; // strip null term + } + else { + return { reinterpret_cast(in_string) }; + } } else { - result.reserve(in_string_view.size() * (sizeof(OutCharT) / sizeof(InCharT))); - } + // Last resort: reencode the string + std::basic_string result; + InViewT in_string_view = static_cast(in_string); + if constexpr (sizeof(InCharT) <= sizeof(OutCharT)) { + // When copying to a larger type, we will need _at most_ as many elements as the smaller storage type + result.reserve(in_string_view.size()); + } + else { + result.reserve(in_string_view.size() * (sizeof(OutCharT) / sizeof(InCharT))); + } - while (!in_string_view.empty()) { - get_endpoint_result string_front = decode_codepoint(in_string_view); - if (string_front.units == 0) { - return {}; + while (!in_string_view.empty()) { + get_endpoint_result string_front = decode_codepoint(in_string_view); + if (string_front.units == 0) { + return {}; + } + in_string_view.remove_prefix(string_front.units); + + encode_codepoint(result, string_front.codepoint); } - in_string_view.remove_prefix(string_front.units); - encode_codepoint(result, string_front.codepoint); + return result; } - - return result; } -/** single-unit helper utilities */ -char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output) -int 
as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise - /** * Checks if two codepoints are equal to each-other (case insensitive) * @@ -753,7 +666,7 @@ struct text_hash { get_endpoint_result decode; while (data != end) { - decode = decode_codepoint({data, static_cast(end - data)}); + decode = decode_codepoint(data, end); if (decode.units == 0) { return hash; } @@ -838,7 +751,7 @@ struct text_hashi { get_endpoint_result decode; while (data != end) { - decode = decode_codepoint({data, static_cast(end - data)}); + decode = decode_codepoint(data, end - data); if (decode.units == 0) { return hash; } diff --git a/src/include/jessilib/unicode_base.hpp b/src/include/jessilib/unicode_base.hpp new file mode 100644 index 0000000..30c1e3b --- /dev/null +++ b/src/include/jessilib/unicode_base.hpp @@ -0,0 +1,484 @@ +/** + * Copyright (C) 2018-2021 Jessica James. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * Written by Jessica James + */ + +#pragma once + +#include +#include +#include + +namespace jessilib { + +/** encode_codepoint */ + +/** + * Encodes a codepoint, and appends it to an output string + * + * @param out_string String to append + * @param in_codepoint Codepoint to encode + * @return Number of data elements appended to out_string + */ +template +constexpr size_t encode_codepoint(std::basic_string& out_string, char32_t in_codepoint); + +/** + * Encodes a codepoint to an output stream + * + * @param out_stream Stream to write codepoint to + * @param in_codepoint Codepoint to encode + * @return Number of data elements appending to out_stream + */ +template +constexpr size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint); + +/** + * Encodes a codepoint directly to a character buffer + * Note: Do not use this without careful consideration; note the size requirements: + * 1) char8_t may write up to 4 elements + * 2) char16_t may write up to 2 elements + * 3) char32_t may write up to 1 element + * 4) char may write up to 4 elements; provided solely for compatibility/ease of use + * + * @param out_buffer Character buffer to write to + * @param in_codepoint Codepoint to encode + * @return Number of data elements written to out_buffer + */ +template +constexpr size_t encode_codepoint(CharT* out_buffer, char32_t in_codepoint); + +/** + * Encodes a codepoint and returns it as a string + * + * @param in_codepoint Codepoint to encode + * @return A string containing the codepoint encoded to the appropriate underlying CharT type + */ +std::u8string encode_codepoint_u8(char32_t in_codepoint); +std::u16string encode_codepoint_u16(char32_t in_codepoint); +std::u32string encode_codepoint_u32(char32_t in_codepoint); +std::wstring encode_codepoint_w(char32_t in_codepoint); // ASSUMES UTF-16 OR UTF-32 + +/** decode_codepoint */ + +struct get_endpoint_result { + char32_t codepoint{}; // Codepoint + size_t units{}; // Number of data units 
codepoint was represented by, or 0 +}; + +/** + * Decodes the front codepoint in a string + * + * @param in_string String to decode a codepoint from + * @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise. + */ +template +constexpr get_endpoint_result decode_codepoint_utf8(std::basic_string_view in_string); // UTF-8 +template +constexpr get_endpoint_result decode_codepoint_utf16(std::basic_string_view in_string); // UTF-16 +template +constexpr get_endpoint_result decode_codepoint_utf32(std::basic_string_view in_string); // UTF-32 +template +constexpr get_endpoint_result decode_codepoint(std::basic_string_view in_string); // ASSUMES UTF-16 OR UTF-32 +template +constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, size_t in_length); +template +constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, const CharT* in_end); + +/** advance_codepoint */ + +template +char32_t advance_codepoint(std::basic_string_view& in_string) { + auto result = decode_codepoint(in_string); + in_string.remove_prefix(result.units); + return result.codepoint; +} + +/** next_codepoint */ + +template +std::basic_string_view next_codepoint(const std::basic_string_view& in_string) { + return in_string.substr(decode_codepoint(in_string).units); +} + +/** is_valid_codepoint */ + +template +bool is_valid_codepoint(const std::basic_string_view& in_string) { + return decode_codepoint(in_string).units != 0; +} + +/** utf-16 surrogate helpers */ + +constexpr bool is_high_surrogate(char32_t in_codepoint); +constexpr bool is_low_surrogate(char32_t in_codepoint); +constexpr get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate); + +template +struct unicode_traits : std::false_type {}; + +template<> +struct unicode_traits : std::true_type { + using equivalent_type = char8_t; // DEPRECATE + static constexpr size_t max_units_per_codepoint = 4; +}; + +template<> +struct 
unicode_traits : std::true_type { + using equivalent_type = char; // DEPRECATE + static constexpr size_t max_units_per_codepoint = 4; +}; + +template<> +struct unicode_traits : std::true_type { + using equivalent_type = std::conditional_t; + static constexpr size_t max_units_per_codepoint = 2; +}; + +template<> +struct unicode_traits : std::true_type { + using equivalent_type = std::conditional_t; + static constexpr size_t max_units_per_codepoint = 1; +}; + +template<> +struct unicode_traits : std::true_type { + using equivalent_type = std::conditional_t; + static constexpr size_t max_units_per_codepoint = unicode_traits::max_units_per_codepoint; +}; + +template +using encode_buffer_type = CharT[unicode_traits::max_units_per_codepoint]; + +/** single-unit helper utilities */ +char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output) +constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise + +/** + * Inline constexpr encode implementation + */ + +/** encode_codepoint */ + +namespace impl_unicode { + +template +constexpr void append_helper(std::basic_string& out_string, T in_value) { + out_string += in_value; +} + +template +constexpr void append_helper(std::basic_ostream& out_string, T in_value) { + out_string << in_value; +} + +template +constexpr void append_helper(T*& out_string, T in_value) { + *out_string = in_value; + ++out_string; +} + +} // namespace impl_unicode + +template +constexpr size_t encode_codepoint_utf8(T& out_destination, char32_t in_codepoint) { + if (in_codepoint > 0x10FFFF) { + return 0; + } + + if (in_codepoint <= 0x007F) { + // 1-byte sequence (7 bits) + impl_unicode::append_helper(out_destination, static_cast(in_codepoint)); + return 1; + } + + if (in_codepoint <= 0x07FF) { + // 2-byte sequence (11 bits; 5 + 6) + impl_unicode::append_helper(out_destination, static_cast(0xC0 | ((in_codepoint >> 6) & 
0x1F))); + impl_unicode::append_helper(out_destination, static_cast(0x80 | (in_codepoint & 0x3F))); + return 2; + } + + if (in_codepoint <= 0xFFFF) { + // 3-byte sequence (16 bits; 4 + 6 + 6) + impl_unicode::append_helper(out_destination, static_cast(0xE0 | ((in_codepoint >> 12) & 0x0F))); + impl_unicode::append_helper(out_destination, static_cast(0x80 | ((in_codepoint >> 6) & 0x3F))); + impl_unicode::append_helper(out_destination, static_cast(0x80 | (in_codepoint & 0x3F))); + return 3; + } + + // 4-byte sequence (21 bits; 3 + 6 + 6 + 6) + impl_unicode::append_helper(out_destination, static_cast(0xF0 | ((in_codepoint >> 18) & 0x07))); + impl_unicode::append_helper(out_destination, static_cast(0x80 | ((in_codepoint >> 12) & 0x3F))); + impl_unicode::append_helper(out_destination, static_cast(0x80 | ((in_codepoint >> 6) & 0x3F))); + impl_unicode::append_helper(out_destination, static_cast(0x80 | (in_codepoint & 0x3F))); + return 4; +} + +template +constexpr size_t encode_codepoint_utf16(T& out_destination, char32_t in_codepoint) { + if (in_codepoint > 0x10FFFF) { + return 0; + } + + if (in_codepoint <= 0xFFFF) { + // 1-unit sequence + impl_unicode::append_helper(out_destination, static_cast(in_codepoint)); + return 1; + } + + // 2-unit sequence + in_codepoint -= 0x10000; + impl_unicode::append_helper(out_destination, static_cast((in_codepoint >> 10) + 0xD800)); + impl_unicode::append_helper(out_destination, static_cast((in_codepoint & 0x03FF) + 0xDC00)); + return 2; +} + +template +constexpr size_t encode_codepoint_utf32(T& out_destination, char32_t in_codepoint) { + if (in_codepoint > 0x10FFFF) { + return 0; + } + + impl_unicode::append_helper(out_destination, static_cast(in_codepoint)); + return 1; +} + +template +constexpr size_t encode_codepoint_w(T& out_destination, char32_t in_codepoint) { + if constexpr (std::is_same_v::equivalent_type, char16_t>) { + return encode_codepoint_utf16(out_destination, in_codepoint); + } + + if constexpr 
(std::is_same_v::equivalent_type, char32_t>) { + return encode_codepoint_utf32(out_destination, in_codepoint); + } +} + +template +constexpr size_t encode_codepoint_utf(T& out_destination, char32_t in_codepoint) { + if constexpr (std::is_same_v) { + return encode_codepoint_utf8(out_destination, in_codepoint); + } + else if constexpr (std::is_same_v) { + return encode_codepoint_utf16(out_destination, in_codepoint); + } + else if constexpr (std::is_same_v) { + return encode_codepoint_utf32(out_destination, in_codepoint); + } + else if constexpr (std::is_same_v) { + return encode_codepoint_w(out_destination, in_codepoint); + } + else if constexpr (std::is_same_v) { + return encode_codepoint_utf8(out_destination, in_codepoint); + } +} + +template +constexpr size_t encode_codepoint(std::basic_string& out_string, char32_t in_codepoint) { + return encode_codepoint_utf(out_string, in_codepoint); +} + +template +constexpr size_t encode_codepoint(std::basic_ostream& out_stream, char32_t in_codepoint) { + return encode_codepoint_utf(out_stream, in_codepoint); +} + +template +constexpr size_t encode_codepoint(CharT* out_buffer, char32_t in_codepoint) { + return encode_codepoint_utf(out_buffer, in_codepoint); +} + +/** + * Inline constexpr decode implementation + */ + +/** decode_codepoint */ + +template +constexpr get_endpoint_result decode_codepoint_utf8(std::basic_string_view in_string) { + get_endpoint_result result{ 0, 0 }; + + if (in_string.empty()) { + return result; + } + + if ((in_string.front() & 0x80) != 0) { // UTF-8 sequence{ + // Validity check + if (in_string.size() < 2 + || (in_string.front() & 0x40) == 0) { + // This is an invalid 1 byte sequence + return result; + } + + // get codepoint value + if ((in_string.front() & 0x20) != 0) { + // This is a 3+ byte sequence + if (in_string.size() < 3) { + // Invalid sequence; too few characters available + return result; + } + + if ((in_string.front() & 0x10) != 0) { + // This is a 4 byte sequence + if (in_string.size() 
< 4) { + // Invalid sequence; too few characters available + return result; + } + + result.codepoint = static_cast(in_string[0] & 0x0F) << 18; + result.codepoint += static_cast(in_string[1] & 0x3F) << 12; + result.codepoint += static_cast(in_string[2] & 0x3F) << 6; + result.codepoint += static_cast(in_string[3] & 0x3F); + result.units = 4; + return result; + } + + // this is a 3 byte sequence + result.codepoint = static_cast(in_string[0] & 0x0F) << 12; + result.codepoint += static_cast(in_string[1] & 0x3F) << 6; + result.codepoint += static_cast(in_string[2] & 0x3F); + result.units = 3; + return result; + } + + // This is a 2 byte sequence + result.codepoint = static_cast(in_string[0] & 0x1F) << 6; + result.codepoint += static_cast(in_string[1] & 0x3F); + result.units = 2; + return result; + } + + // This is a valid 1 byte sequence + result.codepoint = static_cast(in_string.front()); + result.units = 1; + + return result; +} + +template +constexpr get_endpoint_result decode_codepoint_utf16(std::basic_string_view in_string) { /* Decodes the leading codepoint of a UTF-16 view: { 0, 0 } for an empty view, { codepoint, 2 } for a high surrogate followed by a low surrogate, otherwise the front unit unchanged with units == 1 (an unpaired surrogate also takes this path). */ + if (in_string.empty()) { + return { 0, 0 }; + } + + if (is_high_surrogate(in_string.front()) // If this is a high surrogate codepoint... + && in_string.size() > 1 // And a codepoint follows this surrogate.. + && is_low_surrogate(in_string[1])) { // And that codepoint is a low surrogate... 
+ // We have a valid surrogate pair; decode it into a codepoint and return + char32_t codepoint { static_cast( + ((in_string.front() - 0xD800U) * 0x400U) // high surrogate magic + + (in_string[1] - 0xDC00U) // low surrogate magic + + 0x10000ULL // more magic + ) }; + + return { codepoint, 2 }; + } + + // Codepoint is a single char16_t; return codepoint directly + return { in_string.front(), 1 }; +} + +template +constexpr get_endpoint_result decode_codepoint_utf32(std::basic_string_view in_string) { /* UTF-32 needs no decoding: { 0, 0 } for an empty view, otherwise the front unit as one unit; the value is not range-checked here. */ + if (in_string.empty()) { + return { 0, 0 }; + } + + return { in_string.front(), 1 }; +} + +template +constexpr get_endpoint_result decode_codepoint(std::basic_string_view in_string) { /* Compile-time dispatch to the UTF-8, UTF-16 or UTF-32 decoder via if constexpr. NOTE(review): the chain has no final else (nor does the nested equivalent_type check), so a character type matching no branch would leave this function without a return statement; consider a static_assert fallback. */ + if constexpr (std::is_same_v) { + return decode_codepoint_utf8(in_string); + } + else if constexpr (std::is_same_v) { + return decode_codepoint_utf16(in_string); + } + else if constexpr (std::is_same_v) { + return decode_codepoint_utf32(in_string); + } + else if constexpr (std::is_same_v) { + if constexpr (std::is_same_v::equivalent_type, char16_t>) { + return decode_codepoint_utf16(in_string); + } + else if constexpr (std::is_same_v::equivalent_type, char32_t>) { + return decode_codepoint_utf32(in_string); + } + } + else if constexpr (std::is_same_v) { + return decode_codepoint_utf8(in_string); + } +} + +template +constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, size_t in_length) { /* Pointer + length convenience overload; forwards to the string_view overload. 
*/ + return decode_codepoint(std::basic_string_view{in_begin, in_length}); +} + +template +constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, const CharT* in_end) { /* Pointer range convenience overload; the length passed on is in_end - in_begin. */ + return decode_codepoint(std::basic_string_view{in_begin, static_cast(in_end - in_begin)}); +} + +constexpr bool is_high_surrogate(char32_t in_codepoint) { /* True for values in 0xD800..0xDBFF inclusive. */ + return in_codepoint >= 0xD800 && in_codepoint <= 0xDBFF; +} + +constexpr bool is_low_surrogate(char32_t in_codepoint) { /* True for values in 0xDC00..0xDFFF inclusive. */ + return in_codepoint >= 0xDC00 && in_codepoint <= 0xDFFF; +} + +constexpr get_endpoint_result decode_surrogate_pair(char16_t 
in_high_surrogate, char16_t in_low_surrogate) { /* Combines a UTF-16 surrogate pair into { codepoint, 2 }; yields { 0, 0 } unless the first argument is a high surrogate and the second a low surrogate. */ + if (is_high_surrogate(in_high_surrogate) + && is_low_surrogate((in_low_surrogate))) { + // We have a valid surrogate pair; decode it into a codepoint and return + char32_t codepoint { static_cast( + ((in_high_surrogate - 0xD800U) * 0x400U) // high surrogate magic + + (in_low_surrogate - 0xDC00U) // low surrogate magic + + 0x10000ULL // more magic + ) }; + + return { codepoint, 2 }; + } + + return { 0, 0 }; +} + +// Maybe this should be moved back to .cpp and provide separate constexpr/non-constexpr variants? +static constexpr unsigned char base_table[]{ /* 128 entries indexed by character value: digits 0-9 map to 0-9, letters A-Z and a-z both map to 10-35, every other slot holds 127. */ + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127, + 127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, + 127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, +}; + +// If we're already making two comparisons, what would the real impact be of a couple more and removing the lookup table? 
+constexpr int as_base(char32_t in_character, unsigned int base) { /* Looks in_character up in base_table and returns its digit value, or -1 when the character lies beyond the table or its value is not below base. */ + if (in_character >= sizeof(base_table)) { + return -1; + } + + unsigned int result = base_table[in_character]; + if (result >= base) { + return -1; + } + + return base_table[in_character]; +} + +} // namespace jessilib diff --git a/src/include/jessilib/unicode_sequence.hpp b/src/include/jessilib/unicode_sequence.hpp index 4464c82..a912e66 100644 --- a/src/include/jessilib/unicode_sequence.hpp +++ b/src/include/jessilib/unicode_sequence.hpp @@ -25,8 +25,7 @@ #pragma once -#include -#include "unicode.hpp" +#include "unicode_base.hpp" namespace jessilib { @@ -41,7 +40,7 @@ template using shrink_sequence_tree_member = const std::pair>; template -bool shrink_tree_member_compare(const shrink_sequence_tree_member& in_lhs, const char32_t in_rhs) { +constexpr bool shrink_tree_member_compare(const shrink_sequence_tree_member& in_lhs, const char32_t in_rhs) { return in_lhs.first < in_rhs; } @@ -82,7 +81,7 @@ constexpr bool is_simple() { // Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed template SequenceTreeBegin, size_t SequenceTreeSize> -bool apply_shrink_sequence_tree(std::basic_string& inout_string) { +constexpr bool apply_shrink_sequence_tree(std::basic_string& inout_string) { if (inout_string.empty()) { // Nothing to parse return true; @@ -93,7 +92,7 @@ bool apply_shrink_sequence_tree(std::basic_string& inout_string) { get_endpoint_result decode; constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize; - while ((decode = decode_codepoint(read_view)).units != 0) { // TODO: make constexpr + while ((decode = decode_codepoint(read_view)).units != 0) { auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare); if (parser == SubTreeEnd || 
parser->first != decode.codepoint) { // Just a normal character; write it over @@ -306,8 +305,8 @@ constexpr shrink_sequence_tree_member 
make_hex_sequence_pair() { // Calls into another tree with the next character template SubTreeBegin, size_t SubTreeSize, bool FailNotFound = true> constexpr shrink_sequence_tree_member make_tree_sequence_pair() { - return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view& read_view) { - auto decode = decode_codepoint(read_view); // TODO: make constexpr + return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view& read_view) constexpr { + auto decode = decode_codepoint(read_view); constexpr shrink_sequence_tree_member* SubTreeEnd = SubTreeBegin + SubTreeSize; auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare); @@ -388,7 +387,7 @@ static constexpr shrink_sequence_tree cpp_escapes_root_tree{ // Return true for valid sequences, false otherwise template -bool apply_cpp_escape_sequences(std::basic_string& inout_string) { +constexpr bool apply_cpp_escape_sequences(std::basic_string& inout_string) { static_assert(is_sorted, std::size(cpp_escapes_root_tree)>(), "Tree must be pre-sorted"); static_assert(is_sorted, std::size(cpp_escapes_main_tree)>(), "Tree must be pre-sorted"); @@ -410,7 +409,7 @@ static_assert(is_sorted, std::siz template* = nullptr> -bool deserialize_http_query(std::basic_string& inout_string) { +constexpr bool deserialize_http_query(std::basic_string& inout_string) { return apply_shrink_sequence_tree, std::size(http_query_escapes_root_tree)>(inout_string); } diff --git a/src/test/unicode_sequence.cpp b/src/test/unicode_sequence.cpp index 83e4c22..7ab26c5 100644 --- a/src/test/unicode_sequence.cpp +++ b/src/test/unicode_sequence.cpp @@ -18,8 +18,31 @@ #include "jessilib/unicode_sequence.hpp" #include +#include "jessilib/unicode.hpp" // string_cast #include "test.hpp" +using namespace std; /* NOTE(review): the static_asserts in this hunk use the s string-literal suffix; if nothing else in the file needs this directive, using namespace std::literals would be narrower - confirm. 
*/ + +// Compile-time tests for constexpr on compilers which support C++20 constexpr std::string +#ifdef __cpp_lib_constexpr_string +constexpr std::string cpp_constexpr(std::string_view 
in_expression) { /* Copies the input into a std::string, runs jessilib::apply_cpp_escape_sequences on it in place and returns the string; the bool result of that call is discarded. */ + std::string result{ in_expression }; + jessilib::apply_cpp_escape_sequences(result); + return result; +} + +constexpr std::string query_constexpr(std::string_view in_expression) { /* Same shape for jessilib::deserialize_http_query; its bool result is likewise discarded. */ + std::string result{ in_expression }; + jessilib::deserialize_http_query(result); + return result; +} +static_assert(cpp_constexpr("test"s) == "test"s); +static_assert(cpp_constexpr("\\r\\n"s) == "\r\n"s); +static_assert(query_constexpr("test"s) == "test"s); +static_assert(query_constexpr("first+second"s) == "first second"s); +static_assert(query_constexpr("first%20second"s) == "first second"s); +#endif // __cpp_lib_constexpr_string + using char_types = ::testing::Types; using utf8_char_types = ::testing::Types; using char_type_combos = ::testing::Types< @@ -171,7 +194,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u16) { parsed_string += make_hex_string(codepoint, 4); jessilib::apply_cpp_escape_sequences(parsed_string); - auto decode = jessilib::decode_codepoint(parsed_string); + auto decode = jessilib::decode_codepoint(parsed_string.data(), parsed_string.size()); EXPECT_NE(decode.units, 0); EXPECT_EQ(decode.codepoint, static_cast(codepoint)); } @@ -184,7 +207,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) { parsed_string += make_hex_string(codepoint, 8); jessilib::apply_cpp_escape_sequences(parsed_string); - auto decode = jessilib::decode_codepoint(parsed_string); + auto decode = jessilib::decode_codepoint(parsed_string.data(), parsed_string.size()); EXPECT_NE(decode.units, 0); EXPECT_EQ(decode.codepoint, static_cast(codepoint)); }