Browse Source

Split core unicode methods off to unicode_base; made same methods constexpr except for fold & allocating encode; apply_cpp_escape_sequences & deserialize_http_query now constexpr

master
Jessica James 3 years ago
parent
commit
a232d33d8a
  1. 278
      src/common/unicode.cpp
  2. 157
      src/include/jessilib/unicode.hpp
  3. 484
      src/include/jessilib/unicode_base.hpp
  4. 17
      src/include/jessilib/unicode_sequence.hpp
  5. 27
      src/test/unicode_sequence.cpp

278
src/common/unicode.cpp

@ -16,146 +16,10 @@
* Written by Jessica James <jessica.aj@outlook.com>
*/
#include "unicode.hpp"
#include "unicode_base.hpp"
namespace jessilib {
/** encode_codepoint */
/**
 * Appends a single code unit to the end of a string
 *
 * @param out_string String to append to
 * @param in_value Code unit to append
 */
template<typename T>
void append_helper(std::basic_string<T>& out_string, T in_value) {
	out_string.push_back(in_value);
}
/**
 * Writes a single code unit to an output stream using formatted insertion
 *
 * @param out_string Stream to write to
 * @param in_value Code unit to write
 */
template<typename T>
void append_helper(std::basic_ostream<T>& out_string, T in_value) {
	// The stream reference returned by the insertion is intentionally discarded
	static_cast<void>(out_string << in_value);
}
/**
 * Writes a single code unit through a raw pointer and advances that pointer by one element
 *
 * @param out_string Pointer to the write position; incremented after the write
 * @param in_value Code unit to write
 */
template<typename T>
void append_helper(T*& out_string, T in_value) {
	*out_string++ = in_value;
}
/**
 * Encodes a codepoint as UTF-8 and appends the resulting code units to a destination
 *
 * @tparam T Destination type; must be accepted by append_helper
 * @tparam CharT Code unit type each byte is converted to before being appended
 * @param out_destination Destination to append code units to
 * @param in_codepoint Codepoint to encode
 * @return Number of code units appended (1-4), or 0 when in_codepoint is above 0x10FFFF
 */
template<typename T, typename CharT>
size_t encode_codepoint_utf8(T& out_destination, char32_t in_codepoint) {
	// Appends one code unit to the destination
	const auto emit = [&out_destination](char32_t in_unit) {
		append_helper(out_destination, static_cast<CharT>(in_unit));
	};

	// Appends a continuation byte (10xxxxxx) carrying the 6 bits located at in_shift
	const auto emit_continuation = [&emit, in_codepoint](unsigned int in_shift) {
		emit(0x80 | ((in_codepoint >> in_shift) & 0x3F));
	};

	size_t units_written = 0;
	if (in_codepoint <= 0x007F) {
		// 1-byte sequence (7 bits)
		emit(in_codepoint);
		units_written = 1;
	}
	else if (in_codepoint <= 0x07FF) {
		// 2-byte sequence (11 bits; 5 + 6)
		emit(0xC0 | ((in_codepoint >> 6) & 0x1F));
		emit_continuation(0);
		units_written = 2;
	}
	else if (in_codepoint <= 0xFFFF) {
		// 3-byte sequence (16 bits; 4 + 6 + 6)
		emit(0xE0 | ((in_codepoint >> 12) & 0x0F));
		emit_continuation(6);
		emit_continuation(0);
		units_written = 3;
	}
	else if (in_codepoint <= 0x10FFFF) {
		// 4-byte sequence (21 bits; 3 + 6 + 6 + 6)
		emit(0xF0 | ((in_codepoint >> 18) & 0x07));
		emit_continuation(12);
		emit_continuation(6);
		emit_continuation(0);
		units_written = 4;
	}

	// Anything above 0x10FFFF writes nothing and reports 0
	return units_written;
}
/**
 * Encodes a codepoint as UTF-16 and appends the resulting code units to a destination
 *
 * @tparam T Destination type; must be accepted by append_helper with char16_t values
 * @param out_destination Destination to append code units to
 * @param in_codepoint Codepoint to encode
 * @return Number of code units appended (1 or 2), or 0 when in_codepoint is above 0x10FFFF
 */
template<typename T>
size_t encode_codepoint_utf16(T& out_destination, char32_t in_codepoint) {
	if (in_codepoint > 0x10FFFF) {
		return 0;
	}

	const bool needs_surrogate_pair = in_codepoint > 0xFFFF;
	if (!needs_surrogate_pair) {
		// 1-unit sequence
		append_helper(out_destination, static_cast<char16_t>(in_codepoint));
		return 1;
	}

	// 2-unit sequence: split the 20-bit offset into a high and a low surrogate
	const char32_t offset = in_codepoint - 0x10000;
	const char32_t high_surrogate = (offset >> 10) + 0xD800;
	const char32_t low_surrogate = (offset & 0x03FF) + 0xDC00;
	append_helper(out_destination, static_cast<char16_t>(high_surrogate));
	append_helper(out_destination, static_cast<char16_t>(low_surrogate));
	return 2;
}
/**
 * Encodes a codepoint as UTF-32 and appends it to a destination
 *
 * @tparam T Destination type; must be accepted by append_helper with char32_t values
 * @param out_destination Destination to append the code unit to
 * @param in_codepoint Codepoint to encode
 * @return 1 when the codepoint was appended, 0 when in_codepoint is above 0x10FFFF
 */
template<typename T>
size_t encode_codepoint_utf32(T& out_destination, char32_t in_codepoint) {
	const bool is_encodable = in_codepoint <= 0x10FFFF;
	if (is_encodable) {
		append_helper(out_destination, in_codepoint);
	}

	return is_encodable ? 1 : 0;
}
/** Strings */
size_t encode_codepoint(std::string& out_string, char32_t in_codepoint) {
return encode_codepoint_utf8<std::string, char>(out_string, in_codepoint);
}
size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint) {
return encode_codepoint_utf8<std::u8string, char8_t>(out_string, in_codepoint);
}
size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint) {
return encode_codepoint_utf16(out_string, in_codepoint);
}
size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint) {
return encode_codepoint_utf32(out_string, in_codepoint);
}
/** Streams */
size_t encode_codepoint(std::basic_ostream<char>& out_stream, char32_t in_codepoint) {
return encode_codepoint_utf8<std::basic_ostream<char>, char>(out_stream, in_codepoint);
}
size_t encode_codepoint(std::basic_ostream<char8_t>& out_stream, char32_t in_codepoint) {
return encode_codepoint_utf8<std::basic_ostream<char8_t>, char8_t>(out_stream, in_codepoint);
}
size_t encode_codepoint(std::basic_ostream<char16_t>& out_stream, char32_t in_codepoint) {
return encode_codepoint_utf16(out_stream, in_codepoint);
}
size_t encode_codepoint(std::basic_ostream<char32_t>& out_stream, char32_t in_codepoint) {
return encode_codepoint_utf32(out_stream, in_codepoint);
}
/** Pointers */
size_t encode_codepoint(char* out_buffer, char32_t in_codepoint) {
return encode_codepoint_utf8<decltype(out_buffer), char>(out_buffer, in_codepoint);
}
size_t encode_codepoint(char8_t* out_buffer, char32_t in_codepoint) {
return encode_codepoint_utf8<decltype(out_buffer), char8_t>(out_buffer, in_codepoint);
}
size_t encode_codepoint(char16_t* out_buffer, char32_t in_codepoint) {
return encode_codepoint_utf16(out_buffer, in_codepoint);
}
size_t encode_codepoint(char32_t* out_buffer, char32_t in_codepoint) {
return encode_codepoint_utf32(out_buffer, in_codepoint);
}
/** Allocating */
std::u8string encode_codepoint_u8(char32_t in_codepoint) {
@ -176,126 +40,12 @@ std::u32string encode_codepoint_u32(char32_t in_codepoint) {
return result;
}
/** decode_codepoint */
// Decodes the front codepoint of a char string by reinterpreting its bytes as char8_t (treats the input as UTF-8)
get_endpoint_result decode_codepoint(const std::string_view& in_string) {
	const char8_t* utf8_data = reinterpret_cast<const char8_t*>(in_string.data());
	const std::u8string_view utf8_view{ utf8_data, in_string.size() };
	return decode_codepoint(utf8_view);
}
get_endpoint_result decode_codepoint(const std::u8string_view& in_string) {
get_endpoint_result result{ 0, 0 };
if (in_string.empty()) {
return result;
}
if ((in_string.front() & 0x80) != 0) { // UTF-8 sequence{
// Validity check
if (in_string.size() < 2
|| (in_string.front() & 0x40) == 0) {
// This is an invalid 1 byte sequence
return result;
}
// get codepoint value
if ((in_string.front() & 0x20) != 0) {
// This is a 3+ byte sequence
if (in_string.size() < 3) {
// Invalid sequence; too few characters available
return result;
}
if ((in_string.front() & 0x10) != 0) {
// This is a 4 byte sequence
if (in_string.size() < 4) {
// Invalid sequence; too few characters available
return result;
}
result.codepoint = static_cast<char32_t>(in_string[0] & 0x0F) << 18;
result.codepoint += static_cast<char32_t>(in_string[1] & 0x3F) << 12;
result.codepoint += static_cast<char32_t>(in_string[2] & 0x3F) << 6;
result.codepoint += static_cast<char32_t>(in_string[3] & 0x3F);
result.units = 4;
return result;
}
// this is a 3 byte sequence
result.codepoint = static_cast<char32_t>(in_string[0] & 0x0F) << 12;
result.codepoint += static_cast<char32_t>(in_string[1] & 0x3F) << 6;
result.codepoint += static_cast<char32_t>(in_string[2] & 0x3F);
result.units = 3;
return result;
}
// This is a 2 byte sequence
result.codepoint = static_cast<char32_t>(in_string[0] & 0x1F) << 6;
result.codepoint += static_cast<char32_t>(in_string[1] & 0x3F);
result.units = 2;
return result;
}
// This is a valid 1 byte sequence
result.codepoint = static_cast<char32_t>(in_string.front());
result.units = 1;
// Encodes in_codepoint into a freshly constructed std::wstring and returns it
std::wstring encode_codepoint_w(char32_t in_codepoint) {
	std::wstring encoded{};
	// The unit count returned by encode_codepoint is not needed here
	static_cast<void>(encode_codepoint(encoded, in_codepoint));
	return encoded;
}
/**
 * Decodes the front codepoint of a UTF-16 string
 *
 * @param in_string String to decode a codepoint from
 * @return { 0, 0 } for an empty string; { codepoint, 2 } when the string starts with a high surrogate
 * followed by a low surrogate; otherwise the first code unit itself with a unit count of 1
 */
get_endpoint_result decode_codepoint(const std::u16string_view& in_string) {
	if (in_string.empty()) {
		return { 0, 0 };
	}

	const char16_t first_unit = in_string.front();
	const bool starts_with_pair = in_string.size() > 1
		&& is_high_surrogate(first_unit)
		&& is_low_surrogate(in_string[1]);

	if (!starts_with_pair) {
		// Single code unit (this includes unpaired surrogates); hand it back unchanged
		return { first_unit, 1 };
	}

	// Each surrogate carries 10 bits of the offset above 0x10000
	const char32_t high_bits = (first_unit - 0xD800U) * 0x400U;
	const char32_t low_bits = in_string[1] - 0xDC00U;
	return { static_cast<char32_t>(0x10000U + high_bits + low_bits), 2 };
}
// Decodes the front codepoint of a UTF-32 string: { 0, 0 } when empty, otherwise the first unit with a unit count of 1
get_endpoint_result decode_codepoint(const std::u32string_view& in_string) {
	return in_string.empty()
		? get_endpoint_result{ 0, 0 }
		: get_endpoint_result{ in_string.front(), 1 };
}
// True when in_codepoint lies in the high surrogate range [0xD800, 0xDBFF]
bool is_high_surrogate(char32_t in_codepoint) {
	// The range is a 0x400-aligned block, so clearing the low 10 bits identifies it
	return (in_codepoint & 0xFFFFFC00U) == 0xD800U;
}
// True when in_codepoint lies in the low surrogate range [0xDC00, 0xDFFF]
bool is_low_surrogate(char32_t in_codepoint) {
	// The range is a 0x400-aligned block, so clearing the low 10 bits identifies it
	return (in_codepoint & 0xFFFFFC00U) == 0xDC00U;
}
/**
 * Combines a UTF-16 surrogate pair into the codepoint it represents
 *
 * @param in_high_surrogate First code unit; expected to be a high surrogate
 * @param in_low_surrogate Second code unit; expected to be a low surrogate
 * @return { codepoint, 2 } when both units are surrogates of the expected kind, { 0, 0 } otherwise
 */
get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate) {
	const bool is_valid_pair = is_high_surrogate(in_high_surrogate)
		&& is_low_surrogate(in_low_surrogate);
	if (!is_valid_pair) {
		return { 0, 0 };
	}

	// Each surrogate carries 10 bits of the offset above 0x10000
	const char32_t high_bits = (in_high_surrogate - 0xD800U) * 0x400U;
	const char32_t low_bits = in_low_surrogate - 0xDC00U;
	return { static_cast<char32_t>(0x10000U + high_bits + low_bits), 2 };
}
/**
* Codepoint folding (case-insensitive character comparisons)
*/
@ -549,24 +299,4 @@ char32_t fold(char32_t in_codepoint) {
return match->fold(in_codepoint);
}
/**
 * Digit values for the first 128 codepoints.
 * '0'-'9' map to 0-9; 'A'-'Z' and 'a'-'z' both map to 10-35; every other entry holds 127 (not a digit).
 */
static constexpr unsigned char base_table[]{
	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127,
	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,
	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,
};

/**
 * Interprets a character as a digit in the given base
 *
 * @param in_character Character to interpret
 * @param base Base to interpret the character in
 * @return The value represented by in_character in terms of base if valid, -1 otherwise
 */
int as_base(char32_t in_character, unsigned int base) {
	// Marker stored in base_table for characters that are not digits in any base
	constexpr unsigned int not_a_digit = 127;

	if (in_character >= sizeof(base_table)) {
		return -1;
	}

	const unsigned int digit = base_table[in_character];

	// The marker is tested explicitly: comparing against base alone would let it
	// through (and return 127) whenever base is greater than 127
	if (digit == not_a_digit || digit >= base) {
		return -1;
	}

	return static_cast<int>(digit);
}
} // namespace jessilib

157
src/include/jessilib/unicode.hpp

@ -21,109 +21,10 @@
#include <string>
#include <string_view>
#include <ostream>
#include "unicode_base.hpp"
namespace jessilib {
/** encode_codepoint */
/**
* Encodes a codepoint, and appends it to an output string
*
* @param out_string String to append
* @param in_codepoint Codepoint to encode
* @return Number of data elements appended to out_string
*/
size_t encode_codepoint(std::string& out_string, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8
size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint);
size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint);
size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint);
/**
* Encodes a codepoint to an output stream
*
* @param out_stream Stream to write codepoint to
* @param in_codepoint Codepoint to encode
* @return Number of data elements appended to out_stream
*/
size_t encode_codepoint(std::basic_ostream<char>& out_stream, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8
size_t encode_codepoint(std::basic_ostream<char8_t>& out_stream, char32_t in_codepoint);
size_t encode_codepoint(std::basic_ostream<char16_t>& out_stream, char32_t in_codepoint);
size_t encode_codepoint(std::basic_ostream<char32_t>& out_stream, char32_t in_codepoint);
/**
* Encodes a codepoint directly to a character buffer
* Note: Do not use this without careful consideration; note the size requirements:
* 1) char8_t may write up to 4 elements
* 2) char16_t may write up to 2 elements
* 3) char32_t may write up to 1 element
* 4) char may write up to 4 elements; provided solely for compatibility/ease of use
*
* @param out_buffer Character buffer to write to
* @param in_codepoint Codepoint to encode
* @return Number of data elements written to out_buffer
*/
size_t encode_codepoint(char* out_buffer, char32_t in_codepoint);
size_t encode_codepoint(char8_t* out_buffer, char32_t in_codepoint);
size_t encode_codepoint(char16_t* out_buffer, char32_t in_codepoint);
size_t encode_codepoint(char32_t* out_buffer, char32_t in_codepoint);
/**
* Encodes a codepoint and returns it as a string
*
* @param in_codepoint Codepoint to encode
* @return A string containing the codepoint encoded to the appropriate underlying CharT type
*/
std::u8string encode_codepoint_u8(char32_t in_codepoint);
std::u16string encode_codepoint_u16(char32_t in_codepoint);
std::u32string encode_codepoint_u32(char32_t in_codepoint);
/** decode_codepoint */
/** Result of decoding a single codepoint from a string */
struct get_endpoint_result {
	// Codepoint
	char32_t codepoint = 0;

	// Number of data units codepoint was represented by, or 0
	size_t units = 0;
};
/**
* Decodes the front codepoint in a string
*
* @param in_string String to decode a codepoint from
* @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise.
*/
get_endpoint_result decode_codepoint(const std::string_view& in_string); // DEPRECATED: ASSUMES UTF-8
get_endpoint_result decode_codepoint(const std::u8string_view& in_string); // UTF-8
get_endpoint_result decode_codepoint(const std::u16string_view& in_string); // UTF-16
get_endpoint_result decode_codepoint(const std::u32string_view& in_string); // UTF-32
/** advance_codepoint */
/**
 * Decodes the front codepoint of a string view and removes its code units from the front of the view
 *
 * @param in_string View to decode from; shortened by the number of units decode_codepoint reports
 * @return The codepoint reported by decode_codepoint
 */
template<typename T>
char32_t advance_codepoint(std::basic_string_view<T>& in_string) {
	const auto [front_codepoint, consumed_units] = decode_codepoint(in_string);
	in_string.remove_prefix(consumed_units);
	return front_codepoint;
}
/** next_codepoint */
/**
 * Returns the portion of a string view that follows its front codepoint
 *
 * @param in_string View to skip the first codepoint of
 * @return in_string without the units decode_codepoint reports for the front codepoint
 */
template<typename T>
std::basic_string_view<T> next_codepoint(const std::basic_string_view<T>& in_string) {
	const auto front = decode_codepoint(in_string);
	return in_string.substr(front.units);
}
/** is_valid_codepoint */
/**
 * Checks whether decode_codepoint can decode the front of a string view
 *
 * @param in_string View to check
 * @return true when decode_codepoint reports a non-zero unit count, false otherwise
 */
template<typename T>
bool is_valid_codepoint(const std::basic_string_view<T>& in_string) {
	const auto front = decode_codepoint(in_string);
	const bool decoded_nothing = front.units == 0;
	return !decoded_nothing;
}
/** utf-16 surrogate helpers */
bool is_high_surrogate(char32_t in_codepoint);
bool is_low_surrogate(char32_t in_codepoint);
get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate);
/** Utilities */
namespace impl_unicode {
@ -135,6 +36,8 @@ struct is_string : std::false_type {};
template<typename T>
struct is_string<std::basic_string<T>> {
using type = T;
static constexpr bool is_fixed_array{ false };
static constexpr bool is_container{ true };
static constexpr bool value{ true };
constexpr operator bool() const noexcept { return true; }
constexpr bool operator()() const noexcept { return true; }
@ -143,6 +46,8 @@ struct is_string<std::basic_string<T>> {
template<typename T>
struct is_string<std::basic_string_view<T>> {
using type = T;
static constexpr bool is_fixed_array{ false };
static constexpr bool is_container{ true };
static constexpr bool value{ true };
constexpr operator bool() const noexcept { return true; }
constexpr bool operator()() const noexcept { return true; }
@ -151,6 +56,8 @@ struct is_string<std::basic_string_view<T>> {
template<typename T>
struct is_string<T*> {
using type = T;
static constexpr bool is_fixed_array{ false };
static constexpr bool is_container{ false };
static constexpr bool value{ true };
constexpr operator bool() const noexcept { return true; }
constexpr bool operator()() const noexcept { return true; }
@ -159,6 +66,8 @@ struct is_string<T*> {
template<typename T>
struct is_string<T[]> {
using type = T;
static constexpr bool is_fixed_array{ true };
static constexpr bool is_container{ false };
static constexpr bool value{ true };
constexpr operator bool() const noexcept { return true; }
constexpr bool operator()() const noexcept { return true; }
@ -167,6 +76,8 @@ struct is_string<T[]> {
template<typename T, size_t N>
struct is_string<T[N]> {
using type = T;
static constexpr bool is_fixed_array{ true };
static constexpr bool is_container{ false };
static constexpr bool value{ true };
constexpr operator bool() const noexcept { return true; }
constexpr bool operator()() const noexcept { return true; }
@ -214,32 +125,37 @@ std::basic_string_view<OutCharT> string_view_cast(const InT& in_string) {
size_t out_string_units = in_string_bytes / sizeof(OutCharT);
const OutCharT* data_begin = reinterpret_cast<const OutCharT*>(in_string.data());
std::basic_string_view<OutCharT> result{ data_begin, out_string_units };
if (!is_valid(result)) {
// Result isn't valid; discard and return empty
return {};
}
return result;
return { data_begin, out_string_units };
}
template<typename OutCharT, typename InT>
std::basic_string<OutCharT> string_cast(const InT& in_string) {
static_assert(impl_unicode::is_string<InT>::value == true);
using InCharT = typename impl_unicode::is_string<InT>::type;
using InEquivalentT = typename unicode_traits<InCharT>::equivalent_type;
using InViewT = std::basic_string_view<InCharT>;
std::basic_string<OutCharT> result;
using OutT = std::basic_string<OutCharT>;
// Just do a dumb copy when same type & valid; should be slightly faster than re-encoding
if constexpr (std::is_same_v<OutCharT, InCharT>) {
if (is_valid(in_string)) {
result = in_string;
if constexpr (std::is_same_v<InT, OutT>) {
// This does nothing at all; consider static_assert against this?
return in_string;
}
return result;
else if constexpr (std::is_same_v<OutCharT, InCharT>
|| std::is_same_v<OutCharT, InEquivalentT>) {
// Just do a dumb copy when same or equivalent char types; should be faster than re-encoding
if constexpr (impl_unicode::is_string<InT>::is_container) {
return { reinterpret_cast<const OutCharT*>(in_string.data()), in_string.size() };
}
else if constexpr (impl_unicode::is_string<InT>::is_fixed_array) {
return { reinterpret_cast<const OutCharT*>(in_string), std::size(in_string) - 1 }; // strip null term
}
else {
return { reinterpret_cast<const OutCharT*>(in_string) };
}
}
else {
// Last resort: reencode the string
std::basic_string<OutCharT> result;
InViewT in_string_view = static_cast<InViewT>(in_string);
if constexpr (sizeof(InCharT) <= sizeof(OutCharT)) {
// When copying to a larger type, we will need _at most_ as many elements as the smaller storage type
@ -260,12 +176,9 @@ std::basic_string<OutCharT> string_cast(const InT& in_string) {
}
return result;
}
}
/** single-unit helper utilities */
char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output)
int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise
/**
* Checks if two codepoints are equal to each-other (case insensitive)
*
@ -753,7 +666,7 @@ struct text_hash {
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint({data, static_cast<size_t>(end - data)});
decode = decode_codepoint(data, end);
if (decode.units == 0) {
return hash;
}
@ -838,7 +751,7 @@ struct text_hashi {
get_endpoint_result decode;
while (data != end) {
decode = decode_codepoint({data, static_cast<size_t>(end - data)});
decode = decode_codepoint(data, end - data);
if (decode.units == 0) {
return hash;
}

484
src/include/jessilib/unicode_base.hpp

@ -0,0 +1,484 @@
/**
* Copyright (C) 2018-2021 Jessica James.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Written by Jessica James <jessica.aj@outlook.com>
*/
#pragma once
#include <string>
#include <string_view>
#include <ostream>
namespace jessilib {
/** encode_codepoint */
/**
* Encodes a codepoint, and appends it to an output string
*
* @param out_string String to append
* @param in_codepoint Codepoint to encode
* @return Number of data elements appended to out_string
*/
template<typename CharT>
constexpr size_t encode_codepoint(std::basic_string<CharT>& out_string, char32_t in_codepoint);
/**
* Encodes a codepoint to an output stream
*
* @param out_stream Stream to write codepoint to
* @param in_codepoint Codepoint to encode
* @return Number of data elements appended to out_stream
*/
template<typename CharT>
constexpr size_t encode_codepoint(std::basic_ostream<CharT>& out_stream, char32_t in_codepoint);
/**
* Encodes a codepoint directly to a character buffer
* Note: Do not use this without careful consideration; note the size requirements:
* 1) char8_t may write up to 4 elements
* 2) char16_t may write up to 2 elements
* 3) char32_t may write up to 1 element
* 4) char may write up to 4 elements; provided solely for compatibility/ease of use
*
* @param out_buffer Character buffer to write to
* @param in_codepoint Codepoint to encode
* @return Number of data elements written to out_buffer
*/
template<typename CharT>
constexpr size_t encode_codepoint(CharT* out_buffer, char32_t in_codepoint);
/**
* Encodes a codepoint and returns it as a string
*
* @param in_codepoint Codepoint to encode
* @return A string containing the codepoint encoded to the appropriate underlying CharT type
*/
std::u8string encode_codepoint_u8(char32_t in_codepoint);
std::u16string encode_codepoint_u16(char32_t in_codepoint);
std::u32string encode_codepoint_u32(char32_t in_codepoint);
std::wstring encode_codepoint_w(char32_t in_codepoint); // ASSUMES UTF-16 OR UTF-32
/** decode_codepoint */
/** Result of decoding a single codepoint from a string */
struct get_endpoint_result {
	// Codepoint
	char32_t codepoint = 0;

	// Number of data units codepoint was represented by, or 0
	size_t units = 0;
};
/**
* Decodes the front codepoint in a string
*
* @param in_string String to decode a codepoint from
* @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise.
*/
template<typename CharT>
constexpr get_endpoint_result decode_codepoint_utf8(std::basic_string_view<CharT> in_string); // UTF-8
template<typename CharT>
constexpr get_endpoint_result decode_codepoint_utf16(std::basic_string_view<CharT> in_string); // UTF-16
template<typename CharT>
constexpr get_endpoint_result decode_codepoint_utf32(std::basic_string_view<CharT> in_string); // UTF-32
template<typename CharT>
constexpr get_endpoint_result decode_codepoint(std::basic_string_view<CharT> in_string); // ASSUMES UTF-16 OR UTF-32
template<typename CharT>
constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, size_t in_length);
template<typename CharT>
constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, const CharT* in_end);
/** advance_codepoint */
/**
 * Decodes the front codepoint of a string view and removes its code units from the front of the view
 *
 * @param in_string View to decode from; shortened by the number of units decode_codepoint reports
 * @return The codepoint reported by decode_codepoint
 */
template<typename T>
char32_t advance_codepoint(std::basic_string_view<T>& in_string) {
	const auto [front_codepoint, consumed_units] = decode_codepoint(in_string);
	in_string.remove_prefix(consumed_units);
	return front_codepoint;
}
/** next_codepoint */
/**
 * Returns the portion of a string view that follows its front codepoint
 *
 * @param in_string View to skip the first codepoint of
 * @return in_string without the units decode_codepoint reports for the front codepoint
 */
template<typename T>
std::basic_string_view<T> next_codepoint(const std::basic_string_view<T>& in_string) {
	const auto units_to_skip = decode_codepoint(in_string).units;
	return in_string.substr(units_to_skip);
}
/** is_valid_codepoint */
/**
 * Checks whether decode_codepoint can decode the front of a string view
 *
 * @param in_string View to check
 * @return true when decode_codepoint reports a non-zero unit count, false otherwise
 */
template<typename T>
bool is_valid_codepoint(const std::basic_string_view<T>& in_string) {
	const auto front = decode_codepoint(in_string);
	const bool decoded_nothing = front.units == 0;
	return !decoded_nothing;
}
/** utf-16 surrogate helpers */
constexpr bool is_high_surrogate(char32_t in_codepoint);
constexpr bool is_low_surrogate(char32_t in_codepoint);
constexpr get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate);
/**
 * Per-character-type unicode properties.
 * The primary template is the fallback for types without a specialization and evaluates to false.
 */
template<typename CharT>
struct unicode_traits : std::bool_constant<false> {
};
/** Traits for char: paired with char8_t, with up to 4 units per codepoint */
template<>
struct unicode_traits<char> : std::bool_constant<true> {
	// DEPRECATE
	using equivalent_type = char8_t;

	static constexpr size_t max_units_per_codepoint{ 4 };
};
/** Traits for char8_t: paired with char, with up to 4 units per codepoint */
template<>
struct unicode_traits<char8_t> : std::bool_constant<true> {
	// DEPRECATE
	using equivalent_type = char;

	static constexpr size_t max_units_per_codepoint{ 4 };
};
/** Traits for char16_t: up to 2 units per codepoint */
template<>
struct unicode_traits<char16_t> : std::bool_constant<true> {
	// wchar_t when it has the same size as char16_t; otherwise char16_t itself
	using equivalent_type = std::conditional<sizeof(wchar_t) == sizeof(char16_t), wchar_t, char16_t>::type;

	static constexpr size_t max_units_per_codepoint{ 2 };
};
/** Traits for char32_t: exactly 1 unit per codepoint */
template<>
struct unicode_traits<char32_t> : std::bool_constant<true> {
	// wchar_t when it has the same size as char32_t; otherwise char32_t itself
	using equivalent_type = std::conditional<sizeof(wchar_t) == sizeof(char32_t), wchar_t, char32_t>::type;

	static constexpr size_t max_units_per_codepoint{ 1 };
};
/** Traits for wchar_t: mirrors whichever of char32_t / char16_t is selected as its equivalent */
template<>
struct unicode_traits<wchar_t> : std::bool_constant<true> {
	// char32_t when wchar_t has the same size as char32_t; char16_t otherwise
	using equivalent_type = std::conditional<sizeof(wchar_t) == sizeof(char32_t), char32_t, char16_t>::type;

	// Taken from the traits of the equivalent type
	static constexpr size_t max_units_per_codepoint{ unicode_traits<equivalent_type>::max_units_per_codepoint };
};
/**
 * Array type with unicode_traits<CharT>::max_units_per_codepoint elements of CharT,
 * i.e. the per-codepoint unit maximum declared for that character type
 */
template<typename CharT>
using encode_buffer_type = CharT[unicode_traits<CharT>::max_units_per_codepoint];
/** single-unit helper utilities */
char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output)
constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise
/**
* Inline constexpr encode implementation
*/
/** encode_codepoint */
namespace impl_unicode {

/** Appends a single code unit to the end of a string */
template<typename T>
constexpr void append_helper(std::basic_string<T>& out_string, T in_value) {
	out_string.push_back(in_value);
}

/** Writes a single code unit to an output stream using formatted insertion */
template<typename T>
constexpr void append_helper(std::basic_ostream<T>& out_string, T in_value) {
	// The stream reference returned by the insertion is intentionally discarded
	static_cast<void>(out_string << in_value);
}

/** Writes a single code unit through a raw pointer and advances that pointer by one element */
template<typename T>
constexpr void append_helper(T*& out_string, T in_value) {
	*out_string++ = in_value;
}

} // namespace impl_unicode
/**
 * Encodes a codepoint as UTF-8 and appends the resulting code units to a destination
 *
 * @tparam CharT Code unit type each byte is converted to before being appended
 * @tparam T Destination type; must be accepted by impl_unicode::append_helper
 * @param out_destination Destination to append code units to
 * @param in_codepoint Codepoint to encode
 * @return Number of code units appended (1-4), or 0 when in_codepoint is above 0x10FFFF
 */
template<typename CharT, typename T>
constexpr size_t encode_codepoint_utf8(T& out_destination, char32_t in_codepoint) {
	// Sequence length required for this codepoint; 0 marks a codepoint that cannot be encoded
	const size_t unit_count = in_codepoint <= 0x007F ? 1
		: in_codepoint <= 0x07FF ? 2
		: in_codepoint <= 0xFFFF ? 3
		: in_codepoint <= 0x10FFFF ? 4
		: 0;

	switch (unit_count) {
		case 1:
			// 1-byte sequence (7 bits)
			impl_unicode::append_helper(out_destination, static_cast<CharT>(in_codepoint));
			break;

		case 2:
			// 2-byte sequence (11 bits; 5 + 6)
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0xC0 | ((in_codepoint >> 6) & 0x1F)));
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0x80 | (in_codepoint & 0x3F)));
			break;

		case 3:
			// 3-byte sequence (16 bits; 4 + 6 + 6)
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0xE0 | ((in_codepoint >> 12) & 0x0F)));
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0x80 | ((in_codepoint >> 6) & 0x3F)));
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0x80 | (in_codepoint & 0x3F)));
			break;

		case 4:
			// 4-byte sequence (21 bits; 3 + 6 + 6 + 6)
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0xF0 | ((in_codepoint >> 18) & 0x07)));
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0x80 | ((in_codepoint >> 12) & 0x3F)));
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0x80 | ((in_codepoint >> 6) & 0x3F)));
			impl_unicode::append_helper(out_destination, static_cast<CharT>(0x80 | (in_codepoint & 0x3F)));
			break;

		default:
			// Above 0x10FFFF; nothing is written
			break;
	}

	return unit_count;
}
/**
 * Encodes a codepoint as UTF-16 and appends the resulting code units to a destination
 *
 * @tparam CharT Code unit type each unit is converted to before being appended
 * @tparam T Destination type; must be accepted by impl_unicode::append_helper
 * @param out_destination Destination to append code units to
 * @param in_codepoint Codepoint to encode
 * @return Number of code units appended (1 or 2), or 0 when in_codepoint is above 0x10FFFF
 */
template<typename CharT, typename T>
constexpr size_t encode_codepoint_utf16(T& out_destination, char32_t in_codepoint) {
	if (in_codepoint > 0x10FFFF) {
		return 0;
	}

	const bool needs_surrogate_pair = in_codepoint > 0xFFFF;
	if (!needs_surrogate_pair) {
		// 1-unit sequence
		impl_unicode::append_helper(out_destination, static_cast<CharT>(in_codepoint));
		return 1;
	}

	// 2-unit sequence: split the 20-bit offset into a high and a low surrogate
	const char32_t offset = in_codepoint - 0x10000;
	const char32_t high_surrogate = (offset >> 10) + 0xD800;
	const char32_t low_surrogate = (offset & 0x03FF) + 0xDC00;
	impl_unicode::append_helper(out_destination, static_cast<CharT>(high_surrogate));
	impl_unicode::append_helper(out_destination, static_cast<CharT>(low_surrogate));
	return 2;
}
/**
 * Encodes a codepoint as UTF-32 and appends it to a destination
 *
 * @tparam CharT Code unit type the codepoint is converted to before being appended
 * @tparam T Destination type; must be accepted by impl_unicode::append_helper
 * @param out_destination Destination to append the code unit to
 * @param in_codepoint Codepoint to encode
 * @return 1 when the codepoint was appended, 0 when in_codepoint is above 0x10FFFF
 */
template<typename CharT, typename T>
constexpr size_t encode_codepoint_utf32(T& out_destination, char32_t in_codepoint) {
	const bool is_encodable = in_codepoint <= 0x10FFFF;
	if (is_encodable) {
		impl_unicode::append_helper(out_destination, static_cast<CharT>(in_codepoint));
	}

	return is_encodable ? 1 : 0;
}
template<typename T>
constexpr size_t encode_codepoint_w(T& out_destination, char32_t in_codepoint) {
if constexpr (std::is_same_v<unicode_traits<wchar_t>::equivalent_type, char16_t>) {
return encode_codepoint_utf16<wchar_t, T>(out_destination, in_codepoint);
}
if constexpr (std::is_same_v<unicode_traits<wchar_t>::equivalent_type, char32_t>) {
return encode_codepoint_utf32<wchar_t, T>(out_destination, in_codepoint);
}
}
/**
 * Encodes a codepoint using the encoding implied by a character type
 *
 * char and char8_t select UTF-8, char16_t selects UTF-16, char32_t selects UTF-32 and
 * wchar_t is forwarded to encode_codepoint_w. Any other character type is rejected at
 * compile time rather than silently producing a function with no return statement.
 *
 * @tparam CharT Character type selecting the encoding
 * @tparam T Destination type passed through to the selected encoder
 * @param out_destination Destination to append code units to
 * @param in_codepoint Codepoint to encode
 * @return Number of code units appended, as reported by the selected encoder
 */
template<typename CharT, typename T>
constexpr size_t encode_codepoint_utf(T& out_destination, char32_t in_codepoint) {
	if constexpr (std::is_same_v<CharT, char8_t>) {
		return encode_codepoint_utf8<CharT, T>(out_destination, in_codepoint);
	}
	else if constexpr (std::is_same_v<CharT, char16_t>) {
		return encode_codepoint_utf16<CharT, T>(out_destination, in_codepoint);
	}
	else if constexpr (std::is_same_v<CharT, char32_t>) {
		return encode_codepoint_utf32<CharT, T>(out_destination, in_codepoint);
	}
	else if constexpr (std::is_same_v<CharT, wchar_t>) {
		return encode_codepoint_w<T>(out_destination, in_codepoint);
	}
	else if constexpr (std::is_same_v<CharT, char>) {
		return encode_codepoint_utf8<CharT, T>(out_destination, in_codepoint);
	}
	else {
		// The condition depends on CharT so it is only evaluated when this branch is instantiated
		static_assert(sizeof(CharT) == 0, "encode_codepoint_utf: unsupported character type");
	}
}
template<typename CharT>
constexpr size_t encode_codepoint(std::basic_string<CharT>& out_string, char32_t in_codepoint) {
return encode_codepoint_utf<CharT>(out_string, in_codepoint);
}
template<typename CharT>
constexpr size_t encode_codepoint(std::basic_ostream<CharT>& out_stream, char32_t in_codepoint) {
return encode_codepoint_utf<CharT>(out_stream, in_codepoint);
}
template<typename CharT>
constexpr size_t encode_codepoint(CharT* out_buffer, char32_t in_codepoint) {
return encode_codepoint_utf<CharT>(out_buffer, in_codepoint);
}
/**
* Inline constexpr decode implementation
*/
/** decode_codepoint */
/**
 * Decodes the front codepoint of a UTF-8 string
 *
 * A sequence is rejected when the lead byte is a stray continuation byte or 0xF8-0xFF,
 * when fewer code units are available than the lead byte announces, when any trailing
 * unit is not a continuation byte (10xxxxxx), or when the decoded value is above 0x10FFFF.
 * Overlong forms and surrogate values are not rejected.
 *
 * @param in_string String to decode a codepoint from
 * @return A struct containing the codepoint and the number of code units it occupied on success, { 0, 0 } otherwise
 */
template<typename CharT>
constexpr get_endpoint_result decode_codepoint_utf8(std::basic_string_view<CharT> in_string) {
	get_endpoint_result result{ 0, 0 };

	if (in_string.empty()) {
		return result;
	}

	// Reads one code unit as an unsigned byte; the mask undoes sign extension when CharT is a signed char
	const auto unit_at = [&in_string](size_t in_index) -> char32_t {
		return static_cast<char32_t>(in_string[in_index]) & 0xFFU;
	};

	const char32_t lead = unit_at(0);
	if ((lead & 0x80) == 0) {
		// This is a valid 1 byte sequence
		result.codepoint = lead;
		result.units = 1;
		return result;
	}

	// Work out the sequence length and payload bits from the lead byte
	size_t sequence_length = 0;
	char32_t codepoint = 0;
	if ((lead & 0xE0) == 0xC0) {
		// 110xxxxx: 2 byte sequence
		sequence_length = 2;
		codepoint = lead & 0x1F;
	}
	else if ((lead & 0xF0) == 0xE0) {
		// 1110xxxx: 3 byte sequence
		sequence_length = 3;
		codepoint = lead & 0x0F;
	}
	else if ((lead & 0xF8) == 0xF0) {
		// 11110xxx: 4 byte sequence
		sequence_length = 4;
		codepoint = lead & 0x07;
	}
	else {
		// Stray continuation byte (10xxxxxx) or an invalid lead byte (11111xxx)
		return result;
	}

	if (in_string.size() < sequence_length) {
		// Invalid sequence; too few characters available
		return result;
	}

	for (size_t index = 1; index < sequence_length; ++index) {
		const char32_t continuation = unit_at(index);
		if ((continuation & 0xC0) != 0x80) {
			// Invalid sequence; trailing unit is not a continuation byte
			return result;
		}

		codepoint = (codepoint << 6) | (continuation & 0x3F);
	}

	if (codepoint > 0x10FFFF) {
		// Beyond the range encode_codepoint accepts
		return result;
	}

	result.codepoint = codepoint;
	result.units = sequence_length;
	return result;
}
/**
 * Decodes the front codepoint of a UTF-16 string
 *
 * @param in_string String to decode a codepoint from
 * @return { 0, 0 } for an empty string; { codepoint, 2 } when the string starts with a high surrogate
 * followed by a low surrogate; otherwise the first code unit itself with a unit count of 1
 */
template<typename CharT>
constexpr get_endpoint_result decode_codepoint_utf16(std::basic_string_view<CharT> in_string) {
	if (in_string.empty()) {
		return { 0, 0 };
	}

	// Converted explicitly: brace-initialising a char32_t straight from a signed CharT
	// would be a narrowing conversion
	const char32_t first_unit = static_cast<char32_t>(in_string.front());

	if (is_high_surrogate(first_unit) // If this is a high surrogate codepoint...
		&& in_string.size() > 1) { // And a codepoint follows this surrogate...
		const char32_t second_unit = static_cast<char32_t>(in_string[1]);
		if (is_low_surrogate(second_unit)) { // And that codepoint is a low surrogate...
			// We have a valid surrogate pair; each surrogate carries 10 bits of the offset above 0x10000
			const char32_t codepoint = static_cast<char32_t>(
				0x10000U
				+ ((first_unit - 0xD800U) * 0x400U)
				+ (second_unit - 0xDC00U));
			return { codepoint, 2 };
		}
	}

	// Codepoint is a single unit (this includes unpaired surrogates); return it directly
	return { first_unit, 1 };
}
/**
 * Decodes the front codepoint of a UTF-32 string
 *
 * @param in_string String to decode a codepoint from
 * @return { 0, 0 } for an empty string, otherwise the first code unit with a unit count of 1
 */
template<typename CharT>
constexpr get_endpoint_result decode_codepoint_utf32(std::basic_string_view<CharT> in_string) {
	if (in_string.empty()) {
		return { 0, 0 };
	}

	// Converted explicitly: brace-initialising a char32_t straight from a signed CharT
	// (wchar_t is signed on some platforms) would be a narrowing conversion
	return { static_cast<char32_t>(in_string.front()), 1 };
}
/**
 * Decodes the first codepoint held in a view, selecting the decoder from the character type
 *
 * char and char8_t are decoded as UTF-8, char16_t as UTF-16, and char32_t as UTF-32. wchar_t is decoded as
 * UTF-16 or UTF-32 according to unicode_traits<wchar_t>::equivalent_type.
 *
 * Any other character type is rejected at compile time; previously such an instantiation would compile and then
 * flow off the end of the function without returning a value.
 *
 * @param in_string View of code units to read from
 * @return Result of the selected decoder (decoded codepoint, and the number of units it occupied)
 */
template<typename CharT>
constexpr get_endpoint_result decode_codepoint(std::basic_string_view<CharT> in_string) {
	if constexpr (std::is_same_v<CharT, char8_t> || std::is_same_v<CharT, char>) {
		return decode_codepoint_utf8(in_string);
	}
	else if constexpr (std::is_same_v<CharT, char16_t>) {
		return decode_codepoint_utf16(in_string);
	}
	else if constexpr (std::is_same_v<CharT, char32_t>) {
		return decode_codepoint_utf32(in_string);
	}
	else if constexpr (std::is_same_v<CharT, wchar_t>) {
		// The width of wchar_t is platform-defined; defer to the traits to learn which encoding it carries
		if constexpr (std::is_same_v<unicode_traits<wchar_t>::equivalent_type, char16_t>) {
			return decode_codepoint_utf16<wchar_t>(in_string);
		}
		else if constexpr (std::is_same_v<unicode_traits<wchar_t>::equivalent_type, char32_t>) {
			return decode_codepoint_utf32<wchar_t>(in_string);
		}
		else {
			// Condition is dependent on CharT so that it only fires when this branch is actually instantiated
			static_assert(!std::is_same_v<CharT, CharT>, "decode_codepoint: wchar_t must be equivalent to char16_t or char32_t");
		}
	}
	else {
		// Condition is dependent on CharT so that it only fires when this branch is actually instantiated
		static_assert(!std::is_same_v<CharT, CharT>, "decode_codepoint: unsupported character type");
	}
}
/**
 * Decodes the first codepoint from a pointer + length pair
 *
 * @param in_begin Pointer to the first code unit
 * @param in_length Number of code units available starting at in_begin
 * @return Result of decoding a view over [in_begin, in_begin + in_length)
 */
template<typename CharT>
constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, size_t in_length) {
	const std::basic_string_view<CharT> units_view{ in_begin, in_length };
	return decode_codepoint<CharT>(units_view);
}
/**
 * Decodes the first codepoint from a pair of pointers
 *
 * @param in_begin Pointer to the first code unit
 * @param in_end Pointer one past the last available code unit
 * @return Result of decoding a view over [in_begin, in_end)
 */
template<typename CharT>
constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, const CharT* in_end) {
	// The pointer difference is converted to an unsigned unit count for the view
	const auto unit_count = static_cast<size_t>(in_end - in_begin);
	return decode_codepoint<CharT>(std::basic_string_view<CharT>{ in_begin, unit_count });
}
/**
 * Checks whether a codepoint is a UTF-16 high (leading) surrogate
 *
 * @param in_codepoint Codepoint to check
 * @return true if in_codepoint lies within [0xD800, 0xDBFF], false otherwise
 */
constexpr bool is_high_surrogate(char32_t in_codepoint) {
	// The range is exactly one 0x400-aligned block of 0x400 values, so dropping the low 10 bits identifies it
	return (in_codepoint >> 10) == (0xD800U >> 10);
}
/**
 * Checks whether a codepoint is a UTF-16 low (trailing) surrogate
 *
 * @param in_codepoint Codepoint to check
 * @return true if in_codepoint lies within [0xDC00, 0xDFFF], false otherwise
 */
constexpr bool is_low_surrogate(char32_t in_codepoint) {
	// The range is exactly one 0x400-aligned block of 0x400 values, so dropping the low 10 bits identifies it
	return (in_codepoint >> 10) == (0xDC00U >> 10);
}
/**
 * Combines a UTF-16 surrogate pair into the codepoint it represents
 *
 * @param in_high_surrogate Leading unit of the pair; must be a high surrogate
 * @param in_low_surrogate Trailing unit of the pair; must be a low surrogate
 * @return { codepoint, 2 } when both units are surrogates of the expected kind, { 0, 0 } otherwise
 */
constexpr get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate) {
	if (!is_high_surrogate(in_high_surrogate) || !is_low_surrogate(in_low_surrogate)) {
		// Not a valid pair; nothing to decode
		return { 0, 0 };
	}

	// Each surrogate contributes 10 bits; the pair addresses the range starting at U+10000
	const char32_t upper_bits = static_cast<char32_t>(in_high_surrogate - 0xD800U) << 10;
	const char32_t lower_bits = in_low_surrogate - 0xDC00U;
	const char32_t codepoint = 0x10000U + upper_bits + lower_bits;
	return { codepoint, 2 };
}
// Maybe this should be moved back to .cpp and provide separate constexpr/non-constexpr variants?
/**
 * Digit value lookup for the first 128 codepoints
 *
 * '0'-'9' map to 0-9, and 'A'-'Z' / 'a'-'z' both map to 10-35. Every other entry holds 127, which marks the
 * character as not being a digit in any base.
 *
 * Declared inline (rather than static) so that as_base, which is implicitly inline and defined alongside it,
 * refers to the same table object in every translation unit.
 */
inline constexpr unsigned char base_table[]{
	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127,
	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,
	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,
};

// If we're already making two comparisons, what would the real impact be of a couple more and removing the lookup table?
/**
 * Converts a character to its digit value in the given base
 *
 * @param in_character Character to interpret as a digit ('0'-'9', 'A'-'Z', 'a'-'z'; letters are case-insensitive)
 * @param base Base to interpret the digit in; digit values greater than or equal to this are rejected
 * @return Digit value in the range [0, base), or -1 if in_character is not a digit in that base
 */
constexpr int as_base(char32_t in_character, unsigned int base) {
	// Marker stored in base_table for characters which are not digits in any base
	constexpr unsigned int not_a_digit = 127;

	// base_table holds one byte per entry, so its size in bytes is also its entry count
	if (in_character >= sizeof(base_table)) {
		return -1;
	}

	const unsigned int digit = base_table[in_character];

	// Check the marker explicitly; otherwise a base above 127 would let it through as though it were a digit
	if (digit == not_a_digit || digit >= base) {
		return -1;
	}

	return static_cast<int>(digit);
}
} // namespace jessilib

17
src/include/jessilib/unicode_sequence.hpp

@ -25,8 +25,7 @@
#pragma once
#include <map>
#include "unicode.hpp"
#include "unicode_base.hpp"
namespace jessilib {
@ -41,7 +40,7 @@ template<typename CharT>
using shrink_sequence_tree_member = const std::pair<char32_t, shrink_sequence_tree_action<CharT>>;
template<typename CharT>
bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs, const char32_t in_rhs) {
constexpr bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs, const char32_t in_rhs) {
return in_lhs.first < in_rhs;
}
@ -82,7 +81,7 @@ constexpr bool is_simple() {
// Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed
template<typename CharT, const shrink_sequence_tree<CharT> SequenceTreeBegin, size_t SequenceTreeSize>
bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
constexpr bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
if (inout_string.empty()) {
// Nothing to parse
return true;
@ -93,7 +92,7 @@ bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
get_endpoint_result decode;
constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize;
while ((decode = decode_codepoint(read_view)).units != 0) { // TODO: make constexpr
while ((decode = decode_codepoint(read_view)).units != 0) {
auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
if (parser == SubTreeEnd || parser->first != decode.codepoint) {
// Just a normal character; write it over
@ -306,8 +305,8 @@ constexpr shrink_sequence_tree_member<CharT> make_hex_sequence_pair() {
// Calls into another tree with the next character
template<typename CharT, char32_t InCodepointV, const shrink_sequence_tree<CharT> SubTreeBegin, size_t SubTreeSize, bool FailNotFound = true>
constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) {
auto decode = decode_codepoint(read_view); // TODO: make constexpr
return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) constexpr {
auto decode = decode_codepoint(read_view);
constexpr shrink_sequence_tree_member<CharT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
@ -388,7 +387,7 @@ static constexpr shrink_sequence_tree<CharT> cpp_escapes_root_tree{
// Return true for valid sequences, false otherwise
template<typename CharT>
bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
constexpr bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
static_assert(is_sorted<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(), "Tree must be pre-sorted");
static_assert(is_sorted<CharT, cpp_escapes_main_tree<CharT>, std::size(cpp_escapes_main_tree<CharT>)>(), "Tree must be pre-sorted");
@ -410,7 +409,7 @@ static_assert(is_sorted<char8_t, http_query_escapes_root_tree<char8_t>, std::siz
template<typename CharT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
constexpr bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, std::size(http_query_escapes_root_tree<CharT>)>(inout_string);
}

27
src/test/unicode_sequence.cpp

@ -18,8 +18,31 @@
#include "jessilib/unicode_sequence.hpp"
#include <charconv>
#include "jessilib/unicode.hpp" // string_cast
#include "test.hpp"
using namespace std;
// Compile-time tests for constexpr on compilers which support C++20 constexpr std::string
#ifdef __cpp_lib_constexpr_string
constexpr std::string cpp_constexpr(std::string_view in_expression) {
std::string result{ in_expression };
jessilib::apply_cpp_escape_sequences(result);
return result;
}
constexpr std::string query_constexpr(std::string_view in_expression) {
std::string result{ in_expression };
jessilib::deserialize_http_query(result);
return result;
}
static_assert(cpp_constexpr("test"s) == "test"s);
static_assert(cpp_constexpr("\\r\\n"s) == "\r\n"s);
static_assert(query_constexpr("test"s) == "test"s);
static_assert(query_constexpr("first+second"s) == "first second"s);
static_assert(query_constexpr("first%20second"s) == "first second"s);
#endif // __cpp_lib_constexpr_string
using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>;
using utf8_char_types = ::testing::Types<char, char8_t>;
using char_type_combos = ::testing::Types<
@ -171,7 +194,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u16) {
parsed_string += make_hex_string<TypeParam>(codepoint, 4);
jessilib::apply_cpp_escape_sequences(parsed_string);
auto decode = jessilib::decode_codepoint(parsed_string);
auto decode = jessilib::decode_codepoint(parsed_string.data(), parsed_string.size());
EXPECT_NE(decode.units, 0);
EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint));
}
@ -184,7 +207,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) {
parsed_string += make_hex_string<TypeParam>(codepoint, 8);
jessilib::apply_cpp_escape_sequences(parsed_string);
auto decode = jessilib::decode_codepoint(parsed_string);
auto decode = jessilib::decode_codepoint(parsed_string.data(), parsed_string.size());
EXPECT_NE(decode.units, 0);
EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint));
}

Loading…
Cancel
Save