Split core unicode methods off to unicode_base; made same methods constexpr except for fold & allocating encode; apply_cpp_escape_sequences & deserialize_http_query now constexpr

3 years ago · a232d33d8a
5 changed files with 571 additions and 422 deletions
--- a/src/common/unicode.cpp
+++ b/src/common/unicode.cpp
@ -16,146 +16,10 @@
 * Written by Jessica James <jessica.aj@outlook.com>
 */

-#include "unicode.hpp"
+#include "unicode_base.hpp"

 namespace jessilib {

-/** encode_codepoint */
-
-template<typename T>
-void append_helper(std::basic_string<T>& out_string, T in_value) {
-	out_string += in_value;
-}
-
-template<typename T>
-void append_helper(std::basic_ostream<T>& out_string, T in_value) {
-	out_string << in_value;
-}
-
-template<typename T>
-void append_helper(T*& out_string, T in_value) {
-	*out_string = in_value;
-	++out_string;
-}
-
-template<typename T, typename CharT>
-size_t encode_codepoint_utf8(T& out_destination, char32_t in_codepoint) {
-	if (in_codepoint > 0x10FFFF) {
-		return 0;
-	}
-
-	if (in_codepoint <= 0x007F) {
-		// 1-byte sequence (7 bits)
-		append_helper(out_destination, static_cast<CharT>(in_codepoint));
-		return 1;
-	}
-
-	if (in_codepoint <= 0x07FF) {
-		// 2-byte sequence (11 bits; 5 + 6)
-		append_helper(out_destination, static_cast<CharT>(0xC0 | ((in_codepoint >> 6) & 0x1F)));
-		append_helper(out_destination, static_cast<CharT>(0x80 | (in_codepoint & 0x3F)));
-		return 2;
-	}
-
-	if (in_codepoint <= 0xFFFF) {
-		// 3-byte sequence (16 bits; 4 + 6 + 6)
-		append_helper(out_destination, static_cast<CharT>(0xE0 | ((in_codepoint >> 12) & 0x0F)));
-		append_helper(out_destination, static_cast<CharT>(0x80 | ((in_codepoint >> 6) & 0x3F)));
-		append_helper(out_destination, static_cast<CharT>(0x80 | (in_codepoint & 0x3F)));
-		return 3;
-	}
-
-	// 4-byte sequence (21 bits; 3 + 6 + 6 + 6)
-	append_helper(out_destination, static_cast<CharT>(0xF0 | ((in_codepoint >> 18) & 0x07)));
-	append_helper(out_destination, static_cast<CharT>(0x80 | ((in_codepoint >> 12) & 0x3F)));
-	append_helper(out_destination, static_cast<CharT>(0x80 | ((in_codepoint >> 6) & 0x3F)));
-	append_helper(out_destination, static_cast<CharT>(0x80 | (in_codepoint & 0x3F)));
-	return 4;
-}
-
-template<typename T>
-size_t encode_codepoint_utf16(T& out_destination, char32_t in_codepoint) {
-	if (in_codepoint > 0x10FFFF) {
-		return 0;
-	}
-
-	if (in_codepoint <= 0xFFFF) {
-		// 1-unit sequence
-		append_helper(out_destination, static_cast<char16_t>(in_codepoint));
-		return 1;
-	}
-
-	// 2-unit sequence
-	in_codepoint -= 0x10000;
-	append_helper(out_destination, static_cast<char16_t>((in_codepoint >> 10) + 0xD800));
-	append_helper(out_destination, static_cast<char16_t>((in_codepoint & 0x03FF) + 0xDC00));
-	return 2;
-}
-
-template<typename T>
-size_t encode_codepoint_utf32(T& out_destination, char32_t in_codepoint) {
-	if (in_codepoint > 0x10FFFF) {
-		return 0;
-	}
-
-	append_helper(out_destination, in_codepoint);
-	return 1;
-}
-
-/** Strings */
-
-size_t encode_codepoint(std::string& out_string, char32_t in_codepoint) {
-	return encode_codepoint_utf8<std::string, char>(out_string, in_codepoint);
-}
-
-size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint) {
-	return encode_codepoint_utf8<std::u8string, char8_t>(out_string, in_codepoint);
-}
-
-size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint) {
-	return encode_codepoint_utf16(out_string, in_codepoint);
-}
-
-size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint) {
-	return encode_codepoint_utf32(out_string, in_codepoint);
-}
-
-/** Streams */
-
-size_t encode_codepoint(std::basic_ostream<char>& out_stream, char32_t in_codepoint) {
-	return encode_codepoint_utf8<std::basic_ostream<char>, char>(out_stream, in_codepoint);
-}
-
-size_t encode_codepoint(std::basic_ostream<char8_t>& out_stream, char32_t in_codepoint) {
-	return encode_codepoint_utf8<std::basic_ostream<char8_t>, char8_t>(out_stream, in_codepoint);
-}
-
-size_t encode_codepoint(std::basic_ostream<char16_t>& out_stream, char32_t in_codepoint) {
-	return encode_codepoint_utf16(out_stream, in_codepoint);
-}
-
-size_t encode_codepoint(std::basic_ostream<char32_t>& out_stream, char32_t in_codepoint) {
-	return encode_codepoint_utf32(out_stream, in_codepoint);
-}
-
-/** Pointers */
-
-size_t encode_codepoint(char* out_buffer, char32_t in_codepoint) {
-	return encode_codepoint_utf8<decltype(out_buffer), char>(out_buffer, in_codepoint);
-}
-
-size_t encode_codepoint(char8_t* out_buffer, char32_t in_codepoint) {
-	return encode_codepoint_utf8<decltype(out_buffer), char8_t>(out_buffer, in_codepoint);
-}
-
-size_t encode_codepoint(char16_t* out_buffer, char32_t in_codepoint) {
-	return encode_codepoint_utf16(out_buffer, in_codepoint);
-}
-
-size_t encode_codepoint(char32_t* out_buffer, char32_t in_codepoint) {
-	return encode_codepoint_utf32(out_buffer, in_codepoint);
-}
-
 /** Allocating */

 std::u8string encode_codepoint_u8(char32_t in_codepoint) {
@ -176,126 +40,12 @@ std::u32string encode_codepoint_u32(char32_t in_codepoint) {
 	return result;
 }

-/** decode_codepoint */
-
-get_endpoint_result decode_codepoint(const std::string_view& in_string) {
-	return decode_codepoint(std::u8string_view{ reinterpret_cast<const char8_t*>(in_string.data()), in_string.size() });
-}
-
-get_endpoint_result decode_codepoint(const std::u8string_view& in_string) {
-	get_endpoint_result result{ 0, 0 };
-
-	if (in_string.empty()) {
-		return result;
-	}
-
-	if ((in_string.front() & 0x80) != 0) { // UTF-8 sequence{
-		// Validity check
-		if (in_string.size() < 2
-			|| (in_string.front() & 0x40) == 0) {
-			// This is an invalid 1 byte sequence
-			return result;
-		}
-
-		// get codepoint value
-		if ((in_string.front() & 0x20) != 0) {
-			// This is a 3+ byte sequence
-			if (in_string.size() < 3) {
-				// Invalid sequence; too few characters available
-				return result;
-			}
-
-			if ((in_string.front() & 0x10) != 0) {
-				// This is a 4 byte sequence
-				if (in_string.size() < 4) {
-					// Invalid sequence; too few characters available
-					return result;
-				}
-
-				result.codepoint = static_cast<char32_t>(in_string[0] & 0x0F) << 18;
-				result.codepoint += static_cast<char32_t>(in_string[1] & 0x3F) << 12;
-				result.codepoint += static_cast<char32_t>(in_string[2] & 0x3F) << 6;
-				result.codepoint += static_cast<char32_t>(in_string[3] & 0x3F);
-				result.units = 4;
-				return result;
-			}
-
-			// this is a 3 byte sequence
-			result.codepoint = static_cast<char32_t>(in_string[0] & 0x0F) << 12;
-			result.codepoint += static_cast<char32_t>(in_string[1] & 0x3F) << 6;
-			result.codepoint += static_cast<char32_t>(in_string[2] & 0x3F);
-			result.units = 3;
-			return result;
-		}
-
-		// This is a 2 byte sequence
-		result.codepoint = static_cast<char32_t>(in_string[0] & 0x1F) << 6;
-		result.codepoint += static_cast<char32_t>(in_string[1] & 0x3F);
-		result.units = 2;
-		return result;
-	}
-
-	// This is a valid 1 byte sequence
-	result.codepoint = static_cast<char32_t>(in_string.front());
-	result.units = 1;
-
+std::wstring encode_codepoint_w(char32_t in_codepoint) {
+	std::wstring result;
+	encode_codepoint(result, in_codepoint);
 	return result;
 }

-get_endpoint_result decode_codepoint(const std::u16string_view& in_string) {
-	if (in_string.empty()) {
-		return { 0, 0 };
-	}
-
-	if (is_high_surrogate(in_string.front()) // If this is a high surrogate codepoint...
-		&& in_string.size() > 1 // And a codepoint follows this surrogate..
-		&& is_low_surrogate(in_string[1])) { // And that codepoint is a low surrogate...
-		// We have a valid surrogate pair; decode it into a codepoint and return
-		char32_t codepoint { static_cast<char32_t>(
-			((in_string.front() - 0xD800U) * 0x400U) // high surrogate magic
-			+ (in_string[1] - 0xDC00U) // low surrogate magic
-			+ 0x10000ULL // more magic
-		) };
-
-		return { codepoint, 2 };
-	}
-
-	// Codepoint is a single char16_t; return codepoint directly
-	return { in_string.front(), 1 };
-}
-
-get_endpoint_result decode_codepoint(const std::u32string_view& in_string) {
-	if (in_string.empty()) {
-		return { 0, 0 };
-	}
-
-	return { in_string.front(), 1 };
-}
-
-bool is_high_surrogate(char32_t in_codepoint) {
-	return in_codepoint >= 0xD800 && in_codepoint <= 0xDBFF;
-}
-
-bool is_low_surrogate(char32_t in_codepoint) {
-	return in_codepoint >= 0xDC00 && in_codepoint <= 0xDFFF;
-}
-
-get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate) {
-	if (is_high_surrogate(in_high_surrogate)
-		&& is_low_surrogate((in_low_surrogate))) {
-		// We have a valid surrogate pair; decode it into a codepoint and return
-		char32_t codepoint { static_cast<char32_t>(
-			((in_high_surrogate - 0xD800U) * 0x400U) // high surrogate magic
-			+ (in_low_surrogate - 0xDC00U) // low surrogate magic
-			+ 0x10000ULL // more magic
-		) };
-
-		return { codepoint, 2 };
-	}
-
-	return { 0, 0 };
-}
-
 /**
 * Codepoint folding (case-insensitive character comparisons)
 */
@ -549,24 +299,4 @@ char32_t fold(char32_t in_codepoint) {
 	return match->fold(in_codepoint);
 }

-static constexpr unsigned char base_table[]{
-	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127,
-	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,
-	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,
-};
-
-int as_base(char32_t in_character, unsigned int base) {
-	if (in_character >= sizeof(base_table)) {
-		return -1;
-	}
-
-	unsigned int result = base_table[in_character];
-	if (result >= base) {
-		return -1;
-	}
-
-	return base_table[in_character];
-}
-
 } // namespace jessilib
--- a/src/include/jessilib/unicode.hpp
+++ b/src/include/jessilib/unicode.hpp
@ -21,109 +21,10 @@
 #include <string>
 #include <string_view>
 #include <ostream>
+#include "unicode_base.hpp"

 namespace jessilib {

-/** encode_codepoint */
-
-/**
- * Encodes a codepoint, and appends it to an output string
- *
- * @param out_string String to append
- * @param in_codepoint Codepoint to encode
- * @return Number of data elements appended to out_string
- */
-size_t encode_codepoint(std::string& out_string, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8
-size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint);
-size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint);
-size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint);
-
-/**
- * Encodes a codepoint to an output stream
- *
- * @param out_stream Stream to write codepoint to
- * @param in_codepoint Codepoint to encode
- * @return Number of data elements appending to out_stream
- */
-size_t encode_codepoint(std::basic_ostream<char>& out_stream, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8
-size_t encode_codepoint(std::basic_ostream<char8_t>& out_stream, char32_t in_codepoint);
-size_t encode_codepoint(std::basic_ostream<char16_t>& out_stream, char32_t in_codepoint);
-size_t encode_codepoint(std::basic_ostream<char32_t>& out_stream, char32_t in_codepoint);
-
-/**
- * Encodes a codepoint directly to a character buffer
- * Note: Do not use this without careful consideration; note the size requirements:
- * 1) char8_t may write up to 4 elements
- * 2) char16_t may write up to 2 elements
- * 3) char32_t may write up to 1 element
- * 4) char may write up to 4 elements; provided solely for compatibility/ease of use
- *
- * @param out_buffer Character buffer to write to
- * @param in_codepoint Codepoint to encode
- * @return Number of data elements written to out_buffer
- */
-size_t encode_codepoint(char* out_buffer, char32_t in_codepoint);
-size_t encode_codepoint(char8_t* out_buffer, char32_t in_codepoint);
-size_t encode_codepoint(char16_t* out_buffer, char32_t in_codepoint);
-size_t encode_codepoint(char32_t* out_buffer, char32_t in_codepoint);
-
-/**
- * Encodes a codepoint and returns it as a string
- *
- * @param in_codepoint Codepoint to encode
- * @return A string containing the codepoint encoded to the appropriate underlying CharT type
- */
-std::u8string encode_codepoint_u8(char32_t in_codepoint);
-std::u16string encode_codepoint_u16(char32_t in_codepoint);
-std::u32string encode_codepoint_u32(char32_t in_codepoint);
-
-/** decode_codepoint */
-
-struct get_endpoint_result {
-	char32_t codepoint{}; // Codepoint
-	size_t units{}; // Number of data units codepoint was represented by, or 0
-};
-
-/**
- * Decodes the front codepoint in a string
- *
- * @param in_string String to decode a codepoint from
- * @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise.
- */
-get_endpoint_result decode_codepoint(const std::string_view& in_string); // DEPRECATED: ASSUMES UTF-8
-get_endpoint_result decode_codepoint(const std::u8string_view& in_string); // UTF-8
-get_endpoint_result decode_codepoint(const std::u16string_view& in_string); // UTF-16
-get_endpoint_result decode_codepoint(const std::u32string_view& in_string); // UTF-32
-
-/** advance_codepoint */
-
-template<typename T>
-char32_t advance_codepoint(std::basic_string_view<T>& in_string) {
-	auto result = decode_codepoint(in_string);
-	in_string.remove_prefix(result.units);
-	return result.codepoint;
-}
-
-/** next_codepoint */
-
-template<typename T>
-std::basic_string_view<T> next_codepoint(const std::basic_string_view<T>& in_string) {
-	return in_string.substr(decode_codepoint(in_string).units);
-}
-
-/** is_valid_codepoint */
-
-template<typename T>
-bool is_valid_codepoint(const std::basic_string_view<T>& in_string) {
-	return decode_codepoint(in_string).units != 0;
-}
-
-/** utf-16 surrogate helpers */
-
-bool is_high_surrogate(char32_t in_codepoint);
-bool is_low_surrogate(char32_t in_codepoint);
-get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate);
-
 /** Utilities */

 namespace impl_unicode {
@ -135,6 +36,8 @@ struct is_string : std::false_type {};
 template<typename T>
 struct is_string<std::basic_string<T>> {
 	using type = T;
+	static constexpr bool is_fixed_array{ false };
+	static constexpr bool is_container{ true };
 	static constexpr bool value{ true };
 	constexpr operator bool() const noexcept { return true; }
 	constexpr bool operator()() const noexcept { return true; }
@ -143,6 +46,8 @@ struct is_string<std::basic_string<T>> {
 template<typename T>
 struct is_string<std::basic_string_view<T>> {
 	using type = T;
+	static constexpr bool is_fixed_array{ false };
+	static constexpr bool is_container{ true };
 	static constexpr bool value{ true };
 	constexpr operator bool() const noexcept { return true; }
 	constexpr bool operator()() const noexcept { return true; }
@ -151,6 +56,8 @@ struct is_string<std::basic_string_view<T>> {
 template<typename T>
 struct is_string<T*> {
 	using type = T;
+	static constexpr bool is_fixed_array{ false };
+	static constexpr bool is_container{ false };
 	static constexpr bool value{ true };
 	constexpr operator bool() const noexcept { return true; }
 	constexpr bool operator()() const noexcept { return true; }
@ -159,6 +66,8 @@ struct is_string<T*> {
 template<typename T>
 struct is_string<T[]> {
 	using type = T;
+	static constexpr bool is_fixed_array{ true };
+	static constexpr bool is_container{ false };
 	static constexpr bool value{ true };
 	constexpr operator bool() const noexcept { return true; }
 	constexpr bool operator()() const noexcept { return true; }
@ -167,6 +76,8 @@ struct is_string<T[]> {
 template<typename T, size_t N>
 struct is_string<T[N]> {
 	using type = T;
+	static constexpr bool is_fixed_array{ true };
+	static constexpr bool is_container{ false };
 	static constexpr bool value{ true };
 	constexpr operator bool() const noexcept { return true; }
 	constexpr bool operator()() const noexcept { return true; }
@ -214,58 +125,60 @@ std::basic_string_view<OutCharT> string_view_cast(const InT& in_string) {

 	size_t out_string_units = in_string_bytes / sizeof(OutCharT);
 	const OutCharT* data_begin = reinterpret_cast<const OutCharT*>(in_string.data());
-	std::basic_string_view<OutCharT> result{ data_begin, out_string_units };
-
-	if (!is_valid(result)) {
-		// Result isn't valid; discard and return empty
-		return {};
-	}
-
-	return result;
+	return { data_begin, out_string_units };
 }

 template<typename OutCharT, typename InT>
 std::basic_string<OutCharT> string_cast(const InT& in_string) {
 	static_assert(impl_unicode::is_string<InT>::value == true);
 	using InCharT = typename impl_unicode::is_string<InT>::type;
+	using InEquivalentT = typename unicode_traits<InCharT>::equivalent_type;
 	using InViewT = std::basic_string_view<InCharT>;
-	std::basic_string<OutCharT> result;
+	using OutT = std::basic_string<OutCharT>;

-	// Just do a dumb copy when same type & valid; should be slightly faster than re-encoding
-	if constexpr (std::is_same_v<OutCharT, InCharT>) {
-		if (is_valid(in_string)) {
-			result = in_string;
-		}
-
-		return result;
+	if constexpr (std::is_same_v<InT, OutT>) {
+		// This does nothing at all; consider static_assert against this?
+		return in_string;
 	}
-
-	InViewT in_string_view = static_cast<InViewT>(in_string);
-	if constexpr (sizeof(InCharT) <= sizeof(OutCharT)) {
-		// When copying to a larger type, we will need _at most_ as many elements as the smaller storage type
-		result.reserve(in_string_view.size());
+	else if constexpr (std::is_same_v<OutCharT, InCharT>
+	    || std::is_same_v<OutCharT, InEquivalentT>) {
+		// Just do a dumb copy when same or equivalent char types; should be faster than re-encoding
+		if constexpr (impl_unicode::is_string<InT>::is_container) {
+			return { reinterpret_cast<const OutCharT*>(in_string.data()), in_string.size() };
+		}
+		else if constexpr (impl_unicode::is_string<InT>::is_fixed_array) {
+			return { reinterpret_cast<const OutCharT*>(in_string), std::size(in_string) - 1 }; // strip null term
+		}
+		else {
+			return { reinterpret_cast<const OutCharT*>(in_string) };
+		}
 	}
 	else {
-		result.reserve(in_string_view.size() * (sizeof(OutCharT) / sizeof(InCharT)));
-	}
+		// Last resort: reencode the string
+		std::basic_string<OutCharT> result;
+		InViewT in_string_view = static_cast<InViewT>(in_string);
+		if constexpr (sizeof(InCharT) <= sizeof(OutCharT)) {
+			// When copying to a larger type, we will need _at most_ as many elements as the smaller storage type
+			result.reserve(in_string_view.size());
+		}
+		else {
+			result.reserve(in_string_view.size() * (sizeof(OutCharT) / sizeof(InCharT)));
+		}

-	while (!in_string_view.empty()) {
-		get_endpoint_result string_front = decode_codepoint(in_string_view);
-		if (string_front.units == 0) {
-			return {};
+		while (!in_string_view.empty()) {
+			get_endpoint_result string_front = decode_codepoint(in_string_view);
+			if (string_front.units == 0) {
+				return {};
+			}
+			in_string_view.remove_prefix(string_front.units);
+
+			encode_codepoint(result, string_front.codepoint);
 		}
-		in_string_view.remove_prefix(string_front.units);

-		encode_codepoint(result, string_front.codepoint);
+		return result;
 	}
-
-	return result;
 }

-/** single-unit helper utilities */
-char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output)
-int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise
-
 /**
 * Checks if two codepoints are equal to each-other (case insensitive)
 *
@ -753,7 +666,7 @@ struct text_hash {

 		get_endpoint_result decode;
 		while (data != end) {
-			decode = decode_codepoint({data, static_cast<size_t>(end - data)});
+			decode = decode_codepoint(data, end);
 			if (decode.units == 0) {
 				return hash;
 			}
@ -838,7 +751,7 @@ struct text_hashi {

 		get_endpoint_result decode;
 		while (data != end) {
-			decode = decode_codepoint({data, static_cast<size_t>(end - data)});
+			decode = decode_codepoint(data, end - data);
 			if (decode.units == 0) {
 				return hash;
 			}
--- a/src/include/jessilib/unicode_base.hpp
+++ b/src/include/jessilib/unicode_base.hpp
@ -0,0 +1,484 @@
+/**
+ * Copyright (C) 2018-2021 Jessica James.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Written by Jessica James <jessica.aj@outlook.com>
+ */
+
+#pragma once
+
+#include <string>
+#include <string_view>
+#include <ostream>
+
+namespace jessilib {
+
+/** encode_codepoint */
+
+/**
+ * Encodes a codepoint, and appends it to an output string
+ *
+ * @param out_string String to append
+ * @param in_codepoint Codepoint to encode
+ * @return Number of data elements appended to out_string
+ */
+template<typename CharT>
+constexpr size_t encode_codepoint(std::basic_string<CharT>& out_string, char32_t in_codepoint);
+
+/**
+ * Encodes a codepoint to an output stream
+ *
+ * @param out_stream Stream to write codepoint to
+ * @param in_codepoint Codepoint to encode
+ * @return Number of data elements appended to out_stream
+ */
+template<typename CharT>
+constexpr size_t encode_codepoint(std::basic_ostream<CharT>& out_stream, char32_t in_codepoint);
+
+/**
+ * Encodes a codepoint directly to a character buffer
+ * Note: Do not use this without careful consideration; note the size requirements:
+ * 1) char8_t may write up to 4 elements
+ * 2) char16_t may write up to 2 elements
+ * 3) char32_t may write up to 1 element
+ * 4) char may write up to 4 elements; provided solely for compatibility/ease of use
+ *
+ * @param out_buffer Character buffer to write to
+ * @param in_codepoint Codepoint to encode
+ * @return Number of data elements written to out_buffer
+ */
+template<typename CharT>
+constexpr size_t encode_codepoint(CharT* out_buffer, char32_t in_codepoint);
+
+/**
+ * Encodes a codepoint and returns it as a string
+ *
+ * @param in_codepoint Codepoint to encode
+ * @return A string containing the codepoint encoded to the appropriate underlying CharT type
+ */
+std::u8string encode_codepoint_u8(char32_t in_codepoint);
+std::u16string encode_codepoint_u16(char32_t in_codepoint);
+std::u32string encode_codepoint_u32(char32_t in_codepoint);
+std::wstring encode_codepoint_w(char32_t in_codepoint); // ASSUMES UTF-16 OR UTF-32
+
+/** decode_codepoint */
+
+/**
+ * Result of decoding a single codepoint from the front of a string
+ * Both members are value-initialized, so a default-constructed result is { 0, 0 }
+ */
+struct get_endpoint_result {
+	char32_t codepoint{}; // Decoded codepoint
+	size_t units{}; // Number of data units codepoint was represented by, or 0 (0 is what the decoders return on failure)
+};
+
+/**
+ * Decodes the front codepoint in a string
+ *
+ * @param in_string String to decode a codepoint from
+ * @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise.
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint_utf8(std::basic_string_view<CharT> in_string); // UTF-8
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint_utf16(std::basic_string_view<CharT> in_string); // UTF-16
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint_utf32(std::basic_string_view<CharT> in_string); // UTF-32
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint(std::basic_string_view<CharT> in_string); // ASSUMES UTF-16 OR UTF-32
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, size_t in_length);
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, const CharT* in_end);
+
+/** advance_codepoint */
+
+/**
+ * Decodes the codepoint at the front of a view, then drops its units from the view
+ *
+ * @param in_string View to consume from; shortened by the number of units decoded
+ *  (remove_prefix(0) when the decoder reports 0 units, which leaves the view as-is)
+ * @return The codepoint reported by decode_codepoint
+ */
+template<typename T>
+char32_t advance_codepoint(std::basic_string_view<T>& in_string) {
+	const get_endpoint_result front = decode_codepoint(in_string);
+	in_string.remove_prefix(front.units);
+	return front.codepoint;
+}
+
+/** next_codepoint */
+
+/**
+ * Returns the portion of a view that follows its front codepoint
+ *
+ * @param in_string View to look past the first codepoint of
+ * @return in_string with the front codepoint's units skipped
+ */
+template<typename T>
+std::basic_string_view<T> next_codepoint(const std::basic_string_view<T>& in_string) {
+	const size_t front_units = decode_codepoint(in_string).units;
+	return in_string.substr(front_units);
+}
+
+/** is_valid_codepoint */
+
+/**
+ * Checks whether a codepoint can be decoded from the front of a view
+ *
+ * @param in_string View to check
+ * @return true if decode_codepoint reports at least one unit, false otherwise
+ */
+template<typename T>
+bool is_valid_codepoint(const std::basic_string_view<T>& in_string) {
+	const get_endpoint_result front = decode_codepoint(in_string);
+	return front.units > 0;
+}
+
+/** utf-16 surrogate helpers */
+
+constexpr bool is_high_surrogate(char32_t in_codepoint);
+constexpr bool is_low_surrogate(char32_t in_codepoint);
+constexpr get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate);
+
+/**
+ * Per-character-type traits used by the encode/decode helpers
+ *
+ * The primary template derives from std::false_type; every specialization below derives
+ * from std::true_type and provides:
+ *  - equivalent_type: a second character type paired with CharT
+ *  - max_units_per_codepoint: the most data units a single codepoint occupies in CharT
+ */
+template<typename CharT>
+struct unicode_traits : std::false_type {};
+
+// char is paired with char8_t; a codepoint takes at most 4 units
+template<>
+struct unicode_traits<char> : std::true_type {
+	using equivalent_type = char8_t; // DEPRECATE
+	static constexpr size_t max_units_per_codepoint = 4;
+};
+
+// char8_t is paired with char; a codepoint takes at most 4 units
+template<>
+struct unicode_traits<char8_t> : std::true_type {
+	using equivalent_type = char; // DEPRECATE
+	static constexpr size_t max_units_per_codepoint = 4;
+};
+
+// char16_t is paired with wchar_t when the two have the same size, otherwise with itself
+template<>
+struct unicode_traits<char16_t> : std::true_type {
+	using equivalent_type = std::conditional_t<sizeof(wchar_t) == sizeof(char16_t), wchar_t, char16_t>;
+	static constexpr size_t max_units_per_codepoint = 2;
+};
+
+// char32_t is paired with wchar_t when the two have the same size, otherwise with itself
+template<>
+struct unicode_traits<char32_t> : std::true_type {
+	using equivalent_type = std::conditional_t<sizeof(wchar_t) == sizeof(char32_t), wchar_t, char32_t>;
+	static constexpr size_t max_units_per_codepoint = 1;
+};
+
+// wchar_t is paired with char32_t when the sizes match and with char16_t in every other case;
+// its unit limit is taken from whichever type was chosen
+template<>
+struct unicode_traits<wchar_t> : std::true_type {
+	using equivalent_type = std::conditional_t<sizeof(wchar_t) == sizeof(char32_t), char32_t, char16_t>;
+	static constexpr size_t max_units_per_codepoint = unicode_traits<equivalent_type>::max_units_per_codepoint;
+};
+
+template<typename CharT>
+using encode_buffer_type = CharT[unicode_traits<CharT>::max_units_per_codepoint];
+
+/** single-unit helper utilities */
+char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output)
+constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise
+
+/**
+ * Inline constexpr encode implementation
+ */
+
+/** encode_codepoint */
+
+namespace impl_unicode {
+
+/**
+ * append_helper: writes a single data unit to a destination
+ * One overload per destination kind so the encoders can be written once
+ */
+
+// String destination: the unit is added to the end of the string
+template<typename T>
+constexpr void append_helper(std::basic_string<T>& out_string, T in_value) {
+	out_string.push_back(in_value);
+}
+
+// Stream destination: the unit is inserted with operator<<
+template<typename T>
+constexpr void append_helper(std::basic_ostream<T>& out_string, T in_value) {
+	out_string << in_value;
+}
+
+// Raw buffer destination: the unit is stored, then the referenced pointer is moved forward by one
+template<typename T>
+constexpr void append_helper(T*& out_string, T in_value) {
+	*out_string++ = in_value;
+}
+
+} // namespace impl_unicode
+
+/**
+ * Encodes a codepoint as UTF-8 and appends the resulting units to a destination
+ *
+ * @tparam CharT Unit type written to the destination (each unit is cast to this type)
+ * @tparam T Destination type; anything impl_unicode::append_helper accepts
+ * @param out_destination Destination to append units to
+ * @param in_codepoint Codepoint to encode
+ * @return Number of units appended (1 to 4), or 0 if in_codepoint is above 0x10FFFF
+ */
+template<typename CharT, typename T>
+constexpr size_t encode_codepoint_utf8(T& out_destination, char32_t in_codepoint) {
+	// Appends one unit, cast to the destination's unit type
+	const auto put_unit = [&out_destination](char32_t in_unit) {
+		impl_unicode::append_helper(out_destination, static_cast<CharT>(in_unit));
+	};
+
+	// Builds a continuation unit (10xxxxxx) from the 6 bits of the codepoint starting at in_shift
+	const auto trail_unit = [in_codepoint](unsigned int in_shift) -> char32_t {
+		return 0x80 | ((in_codepoint >> in_shift) & 0x3F);
+	};
+
+	if (in_codepoint > 0x10FFFF) {
+		// Out of range; nothing is written
+		return 0;
+	}
+
+	if (in_codepoint < 0x80) {
+		// 7 bits: the codepoint is its own single unit
+		put_unit(in_codepoint);
+		return 1;
+	}
+
+	if (in_codepoint < 0x800) {
+		// 11 bits: 110xxxxx 10xxxxxx
+		put_unit(0xC0 | ((in_codepoint >> 6) & 0x1F));
+		put_unit(trail_unit(0));
+		return 2;
+	}
+
+	if (in_codepoint < 0x10000) {
+		// 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+		put_unit(0xE0 | ((in_codepoint >> 12) & 0x0F));
+		put_unit(trail_unit(6));
+		put_unit(trail_unit(0));
+		return 3;
+	}
+
+	// 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	put_unit(0xF0 | ((in_codepoint >> 18) & 0x07));
+	put_unit(trail_unit(12));
+	put_unit(trail_unit(6));
+	put_unit(trail_unit(0));
+	return 4;
+}
+
+/**
+ * Encodes a codepoint as UTF-16 and appends the resulting units to a destination
+ *
+ * @tparam CharT Unit type written to the destination (each unit is cast to this type)
+ * @tparam T Destination type; anything impl_unicode::append_helper accepts
+ * @param out_destination Destination to append units to
+ * @param in_codepoint Codepoint to encode
+ * @return Number of units appended (1 or 2), or 0 if in_codepoint is above 0x10FFFF
+ */
+template<typename CharT, typename T>
+constexpr size_t encode_codepoint_utf16(T& out_destination, char32_t in_codepoint) {
+	if (in_codepoint > 0x10FFFF) {
+		// Out of range; nothing is written
+		return 0;
+	}
+
+	const bool needs_surrogate_pair = in_codepoint > 0xFFFF;
+	if (!needs_surrogate_pair) {
+		// Fits in one unit; written as-is
+		impl_unicode::append_helper(out_destination, static_cast<CharT>(in_codepoint));
+		return 1;
+	}
+
+	// Split the 20-bit offset above 0x10000 into a high half and a low half of 10 bits each
+	const char32_t offset = in_codepoint - 0x10000;
+	const char32_t high_surrogate = 0xD800 + (offset >> 10);
+	const char32_t low_surrogate = 0xDC00 + (offset & 0x03FF);
+	impl_unicode::append_helper(out_destination, static_cast<CharT>(high_surrogate));
+	impl_unicode::append_helper(out_destination, static_cast<CharT>(low_surrogate));
+	return 2;
+}
+
+/**
+ * Encodes a codepoint as UTF-32 and appends it to a destination
+ *
+ * @tparam CharT Unit type written to the destination (the codepoint is cast to this type)
+ * @tparam T Destination type; anything impl_unicode::append_helper accepts
+ * @param out_destination Destination to append the unit to
+ * @param in_codepoint Codepoint to encode
+ * @return 1 if the codepoint was appended, or 0 if in_codepoint is above 0x10FFFF
+ */
+template<typename CharT, typename T>
+constexpr size_t encode_codepoint_utf32(T& out_destination, char32_t in_codepoint) {
+	if (in_codepoint <= 0x10FFFF) {
+		// One codepoint is always exactly one unit
+		impl_unicode::append_helper(out_destination, static_cast<CharT>(in_codepoint));
+		return 1;
+	}
+
+	// Out of range; nothing is written
+	return 0;
+}
+
+/**
+ * Encodes a codepoint into wchar_t units
+ * The encoding follows unicode_traits<wchar_t>::equivalent_type: UTF-16 when that is
+ * char16_t, UTF-32 when that is char32_t
+ *
+ * @tparam T Destination type; anything impl_unicode::append_helper accepts
+ * @param out_destination Destination to append units to
+ * @param in_codepoint Codepoint to encode
+ * @return Number of units appended, as reported by the selected encoder
+ */
+template<typename T>
+constexpr size_t encode_codepoint_w(T& out_destination, char32_t in_codepoint) {
+	using wide_equivalent_type = unicode_traits<wchar_t>::equivalent_type;
+
+	if constexpr (std::is_same_v<wide_equivalent_type, char16_t>) {
+		return encode_codepoint_utf16<wchar_t, T>(out_destination, in_codepoint);
+	}
+	else if constexpr (std::is_same_v<wide_equivalent_type, char32_t>) {
+		return encode_codepoint_utf32<wchar_t, T>(out_destination, in_codepoint);
+	}
+}
+
+/**
+ * Encodes a codepoint using the encoding implied by the unit type CharT
+ *  - char8_t, char: UTF-8
+ *  - char16_t: UTF-16
+ *  - char32_t: UTF-32
+ *  - wchar_t: UTF-16 or UTF-32 (see encode_codepoint_w)
+ * Any other CharT fails to compile, rather than reaching the end of the function
+ * without returning a value
+ *
+ * @tparam CharT Unit type; selects the encoding
+ * @tparam T Destination type; anything impl_unicode::append_helper accepts
+ * @param out_destination Destination to append units to
+ * @param in_codepoint Codepoint to encode
+ * @return Number of units appended, as reported by the selected encoder
+ */
+template<typename CharT, typename T>
+constexpr size_t encode_codepoint_utf(T& out_destination, char32_t in_codepoint) {
+	if constexpr (std::is_same_v<CharT, char8_t> || std::is_same_v<CharT, char>) {
+		return encode_codepoint_utf8<CharT, T>(out_destination, in_codepoint);
+	}
+	else if constexpr (std::is_same_v<CharT, char16_t>) {
+		return encode_codepoint_utf16<CharT, T>(out_destination, in_codepoint);
+	}
+	else if constexpr (std::is_same_v<CharT, char32_t>) {
+		return encode_codepoint_utf32<CharT, T>(out_destination, in_codepoint);
+	}
+	else if constexpr (std::is_same_v<CharT, wchar_t>) {
+		return encode_codepoint_w<T>(out_destination, in_codepoint);
+	}
+	else {
+		// The condition depends on CharT, so it is only evaluated when this branch is instantiated
+		static_assert(sizeof(CharT) == 0, "encode_codepoint_utf: unsupported character type");
+	}
+}
+
+// String overload: forwards to encode_codepoint_utf, which picks the encoding from CharT
+template<typename CharT>
+constexpr size_t encode_codepoint(std::basic_string<CharT>& out_string, char32_t in_codepoint) {
+	return encode_codepoint_utf<CharT>(out_string, in_codepoint);
+}
+
+// Stream overload: forwards to encode_codepoint_utf, which picks the encoding from CharT
+template<typename CharT>
+constexpr size_t encode_codepoint(std::basic_ostream<CharT>& out_stream, char32_t in_codepoint) {
+	return encode_codepoint_utf<CharT>(out_stream, in_codepoint);
+}
+
+// Buffer overload: out_buffer is taken by value, so only this local copy is advanced as units
+// are written; the caller's pointer still refers to the first unit written
+template<typename CharT>
+constexpr size_t encode_codepoint(CharT* out_buffer, char32_t in_codepoint) {
+	return encode_codepoint_utf<CharT>(out_buffer, in_codepoint);
+}
+
+/**
+ * Inline constexpr decode implementation
+ */
+
+/** decode_codepoint */
+
+/**
+ * Decodes the UTF-8 sequence at the front of a string
+ *
+ * The sequence is rejected (result { 0, 0 }) when:
+ *  - the string is empty
+ *  - the first unit is a continuation unit (10xxxxxx) or has the form 11111xxx
+ *  - fewer units are available than the lead unit calls for
+ *  - any trailing unit is not a continuation unit (10xxxxxx)
+ *  - the decoded value is above 0x10FFFF (the encoders refuse such values as well)
+ * Overlong forms and surrogate values are not rejected here
+ *
+ * @param in_string String to decode a codepoint from
+ * @return The codepoint and the number of units it occupied (1 to 4), or { 0, 0 } on failure
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint_utf8(std::basic_string_view<CharT> in_string) {
+	if (in_string.empty()) {
+		return { 0, 0 };
+	}
+
+	const CharT lead = in_string.front();
+	if ((lead & 0x80) == 0) {
+		// Single unit; the unit is the codepoint
+		return { static_cast<char32_t>(lead), 1 };
+	}
+
+	// Work out the sequence length and the payload bits carried by the lead unit
+	size_t length = 0;
+	char32_t codepoint = 0;
+	if ((lead & 0xE0) == 0xC0) {
+		// 110xxxxx: 2 unit sequence
+		length = 2;
+		codepoint = static_cast<char32_t>(lead & 0x1F);
+	}
+	else if ((lead & 0xF0) == 0xE0) {
+		// 1110xxxx: 3 unit sequence
+		length = 3;
+		codepoint = static_cast<char32_t>(lead & 0x0F);
+	}
+	else if ((lead & 0xF8) == 0xF0) {
+		// 11110xxx: 4 unit sequence; only 3 payload bits live in the lead unit
+		length = 4;
+		codepoint = static_cast<char32_t>(lead & 0x07);
+	}
+	else {
+		// Either a stray continuation unit, or a lead unit of the form 11111xxx
+		return { 0, 0 };
+	}
+
+	if (in_string.size() < length) {
+		// Invalid sequence; too few units available
+		return { 0, 0 };
+	}
+
+	// Fold in 6 payload bits per trailing unit, checking each has the 10xxxxxx form
+	for (size_t index = 1; index != length; ++index) {
+		const CharT trail = in_string[index];
+		if ((trail & 0xC0) != 0x80) {
+			return { 0, 0 };
+		}
+
+		codepoint = (codepoint << 6) | static_cast<char32_t>(trail & 0x3F);
+	}
+
+	if (codepoint > 0x10FFFF) {
+		// Beyond the range the encoders accept
+		return { 0, 0 };
+	}
+
+	return { codepoint, length };
+}
+
+/**
+ * Decodes the first codepoint of a UTF-16 string
+ *
+ * A high surrogate immediately followed by a low surrogate is combined into one codepoint spanning 2 units.
+ * Anything else, including an unpaired surrogate, is returned as-is as a single unit.
+ *
+ * @param in_string View of the UTF-16 data to decode from
+ * @return Decoded codepoint and the number of code units it occupied; { 0, 0 } if in_string is empty
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint_utf16(std::basic_string_view<CharT> in_string) {
+	if (in_string.empty()) {
+		return { 0, 0 };
+	}
+
+	const bool starts_with_pair = is_high_surrogate(in_string.front())
+		&& in_string.size() > 1
+		&& is_low_surrogate(in_string[1]);
+
+	if (!starts_with_pair) {
+		// Not a surrogate pair; the single code unit is the codepoint
+		return { in_string.front(), 1 };
+	}
+
+	// Each surrogate contributes 10 bits; the pair encodes an offset from U+10000
+	const auto high_part = (in_string.front() - 0xD800U) * 0x400U;
+	const auto low_part = in_string[1] - 0xDC00U;
+	char32_t combined { static_cast<char32_t>(high_part + low_part + 0x10000ULL) };
+
+	return { combined, 2 };
+}
+
+/**
+ * Decodes the first codepoint of a UTF-32 string; every code unit is a whole codepoint
+ *
+ * @param in_string View of the UTF-32 data to decode from
+ * @return The first code unit with a unit count of 1, or { 0, 0 } if in_string is empty
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint_utf32(std::basic_string_view<CharT> in_string) {
+	if (!in_string.empty()) {
+		return { in_string.front(), 1 };
+	}
+
+	return { 0, 0 };
+}
+
+/**
+ * Decodes the first codepoint of a string, using the Unicode encoding implied by CharT
+ *
+ * char8_t and char are decoded as UTF-8, char16_t as UTF-16, char32_t as UTF-32, and wchar_t as whichever of
+ * UTF-16/UTF-32 unicode_traits<wchar_t>::equivalent_type names. Any other CharT is rejected at compile time;
+ * previously such an instantiation matched no branch and flowed off the end of a non-void function.
+ *
+ * @param in_string View of the data to decode from
+ * @return The result reported by the selected encoding-specific decoder
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint(std::basic_string_view<CharT> in_string) {
+	if constexpr (std::is_same_v<CharT, char8_t> || std::is_same_v<CharT, char>) {
+		// Plain char strings are treated as UTF-8, same as char8_t
+		return decode_codepoint_utf8(in_string);
+	}
+	else if constexpr (std::is_same_v<CharT, char16_t>) {
+		return decode_codepoint_utf16(in_string);
+	}
+	else if constexpr (std::is_same_v<CharT, char32_t>) {
+		return decode_codepoint_utf32(in_string);
+	}
+	else if constexpr (std::is_same_v<CharT, wchar_t>) {
+		if constexpr (std::is_same_v<unicode_traits<wchar_t>::equivalent_type, char16_t>) {
+			return decode_codepoint_utf16<wchar_t>(in_string);
+		}
+		else if constexpr (std::is_same_v<unicode_traits<wchar_t>::equivalent_type, char32_t>) {
+			return decode_codepoint_utf32<wchar_t>(in_string);
+		}
+		else {
+			// Type-dependent condition so this only fires when the branch is actually instantiated
+			static_assert(sizeof(CharT) == 0, "decode_codepoint: wchar_t is neither UTF-16 nor UTF-32 on this platform");
+		}
+	}
+	else {
+		// Type-dependent condition so this only fires when the branch is actually instantiated
+		static_assert(sizeof(CharT) == 0, "decode_codepoint: unsupported character type");
+	}
+}
+
+/**
+ * Decodes the first codepoint of a character range given as pointer + length
+ *
+ * @param in_begin Pointer to the first code unit
+ * @param in_length Number of code units available starting at in_begin
+ * @return The result of decoding a string view over that range
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, size_t in_length) {
+	const std::basic_string_view<CharT> source_view{ in_begin, in_length };
+	return decode_codepoint<CharT>(source_view);
+}
+
+/**
+ * Decodes the first codepoint of a character range given as a begin/end pointer pair
+ *
+ * The length is computed as in_end - in_begin and converted to size_t without any range check.
+ *
+ * @param in_begin Pointer to the first code unit
+ * @param in_end Pointer one past the last code unit
+ * @return The result of decoding a string view over that range
+ */
+template<typename CharT>
+constexpr get_endpoint_result decode_codepoint(const CharT* in_begin, const CharT* in_end) {
+	const auto unit_count = static_cast<size_t>(in_end - in_begin);
+	const std::basic_string_view<CharT> source_view{ in_begin, unit_count };
+	return decode_codepoint<CharT>(source_view);
+}
+
+/**
+ * Checks whether a codepoint lies in the high (leading) surrogate range, 0xD800 through 0xDBFF inclusive
+ *
+ * @param in_codepoint Codepoint to check
+ * @return true if in_codepoint is a high surrogate, false otherwise
+ */
+constexpr bool is_high_surrogate(char32_t in_codepoint) {
+	if (in_codepoint < 0xD800) {
+		return false;
+	}
+
+	return in_codepoint <= 0xDBFF;
+}
+
+/**
+ * Checks whether a codepoint lies in the low (trailing) surrogate range, 0xDC00 through 0xDFFF inclusive
+ *
+ * @param in_codepoint Codepoint to check
+ * @return true if in_codepoint is a low surrogate, false otherwise
+ */
+constexpr bool is_low_surrogate(char32_t in_codepoint) {
+	if (in_codepoint < 0xDC00) {
+		return false;
+	}
+
+	return in_codepoint <= 0xDFFF;
+}
+
+/**
+ * Combines a UTF-16 surrogate pair into the codepoint it encodes
+ *
+ * @param in_high_surrogate Candidate high (leading) surrogate
+ * @param in_low_surrogate Candidate low (trailing) surrogate
+ * @return { codepoint, 2 } when both arguments are surrogates of the expected kind, { 0, 0 } otherwise
+ */
+constexpr get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate) {
+	if (!is_high_surrogate(in_high_surrogate)) {
+		return { 0, 0 };
+	}
+
+	if (!is_low_surrogate(in_low_surrogate)) {
+		return { 0, 0 };
+	}
+
+	// Each surrogate contributes 10 bits; the pair encodes an offset from U+10000
+	const auto high_part = (in_high_surrogate - 0xD800U) * 0x400U;
+	const auto low_part = in_low_surrogate - 0xDC00U;
+	char32_t combined { static_cast<char32_t>(high_part + low_part + 0x10000ULL) };
+
+	return { combined, 2 };
+}
+
+// Maybe this should be moved back to .cpp and provide separate constexpr/non-constexpr variants?
+/**
+ * Digit-value lookup table for the 128 ASCII codepoints, indexed by codepoint
+ *
+ * '0'-'9' map to 0-9; 'A'-'Z' and 'a'-'z' both map to 10-35. Every other entry holds 127, which as_base
+ * rejects for any base it is compared against that does not exceed 127.
+ */
+static constexpr unsigned char base_table[]{
+	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, // 0x00-0x1F
+	127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127, // 0x20-0x3F; digits start at '0' (0x30)
+	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, // 0x40-0x5F; uppercase letters start at 'A' (0x41)
+	127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, // 0x60-0x7F; lowercase letters start at 'a' (0x61)
+};
+
+// If we're already making two comparisons, what would the real impact be of a couple more and removing the lookup table?
+/**
+ * Converts a character to its digit value in the given base
+ *
+ * @param in_character Character to interpret as a digit
+ * @param base Base to interpret the digit in; the digit value must be strictly less than this
+ * @return The digit's value, or -1 if in_character is outside base_table or its value is not below base
+ */
+constexpr int as_base(char32_t in_character, unsigned int base) {
+	if (in_character < sizeof(base_table)) {
+		const unsigned int digit_value = base_table[in_character];
+		if (digit_value < base) {
+			return static_cast<int>(digit_value);
+		}
+	}
+
+	// Either not in the table at all, or not a digit of the requested base
+	return -1;
+}
+
+} // namespace jessilib
--- a/src/include/jessilib/unicode_sequence.hpp
+++ b/src/include/jessilib/unicode_sequence.hpp
@ -25,8 +25,7 @@

 #pragma once

-#include <map>
-#include "unicode.hpp"
+#include "unicode_base.hpp"

 namespace jessilib {

@ -41,7 +40,7 @@ template<typename CharT>
 using shrink_sequence_tree_member = const std::pair<char32_t, shrink_sequence_tree_action<CharT>>;

 template<typename CharT>
-bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs, const char32_t in_rhs) {
+constexpr bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs, const char32_t in_rhs) {
 	return in_lhs.first < in_rhs;
 }

@ -82,7 +81,7 @@ constexpr bool is_simple() {

 // Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed
 template<typename CharT, const shrink_sequence_tree<CharT> SequenceTreeBegin, size_t SequenceTreeSize>
-bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
+constexpr bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
 	if (inout_string.empty()) {
 		// Nothing to parse
 		return true;
@ -93,7 +92,7 @@ bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
 	get_endpoint_result decode;

 	constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize;
-	while ((decode = decode_codepoint(read_view)).units != 0) { // TODO: make constexpr
+	while ((decode = decode_codepoint(read_view)).units != 0) {
 		auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
 		if (parser == SubTreeEnd || parser->first != decode.codepoint) {
 			// Just a normal character; write it over
@ -306,8 +305,8 @@ constexpr shrink_sequence_tree_member<CharT> make_hex_sequence_pair() {
 // Calls into another tree with the next character
 template<typename CharT, char32_t InCodepointV, const shrink_sequence_tree<CharT> SubTreeBegin, size_t SubTreeSize, bool FailNotFound = true>
 constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
-	return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) {
-		auto decode = decode_codepoint(read_view); // TODO: make constexpr
+	return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) constexpr {
+		auto decode = decode_codepoint(read_view);

 		constexpr shrink_sequence_tree_member<CharT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
 		auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
@ -388,7 +387,7 @@ static constexpr shrink_sequence_tree<CharT> cpp_escapes_root_tree{

 // Return true for valid sequences, false otherwise
 template<typename CharT>
-bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
+constexpr bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
 	static_assert(is_sorted<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(), "Tree must be pre-sorted");
 	static_assert(is_sorted<CharT, cpp_escapes_main_tree<CharT>, std::size(cpp_escapes_main_tree<CharT>)>(), "Tree must be pre-sorted");

@ -410,7 +409,7 @@ static_assert(is_sorted<char8_t, http_query_escapes_root_tree<char8_t>, std::siz

 template<typename CharT,
    std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
-bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
+constexpr bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
 	return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, std::size(http_query_escapes_root_tree<CharT>)>(inout_string);
 }

--- a/src/test/unicode_sequence.cpp
+++ b/src/test/unicode_sequence.cpp
@ -18,8 +18,31 @@

 #include "jessilib/unicode_sequence.hpp"
 #include <charconv>
+#include "jessilib/unicode.hpp" // string_cast
 #include "test.hpp"

+using namespace std;
+
+// Compile-time tests for constexpr on compilers which support C++20 constexpr std::string
+#ifdef __cpp_lib_constexpr_string
+// Compile-time helper: returns a copy of in_expression after running apply_cpp_escape_sequences over it.
+// The bool returned by apply_cpp_escape_sequences is discarded; only the resulting string is reported.
+constexpr std::string cpp_constexpr(std::string_view in_expression) {
+	std::string unescaped(in_expression);
+	jessilib::apply_cpp_escape_sequences(unescaped);
+	return unescaped;
+}
+
+// Compile-time helper: returns a copy of in_expression after running deserialize_http_query over it.
+// The bool returned by deserialize_http_query is discarded; only the resulting string is reported.
+constexpr std::string query_constexpr(std::string_view in_expression) {
+	std::string decoded(in_expression);
+	jessilib::deserialize_http_query(decoded);
+	return decoded;
+}
+static_assert(cpp_constexpr("test"s) == "test"s);
+static_assert(cpp_constexpr("\\r\\n"s) == "\r\n"s);
+static_assert(query_constexpr("test"s) == "test"s);
+static_assert(query_constexpr("first+second"s) == "first second"s);
+static_assert(query_constexpr("first%20second"s) == "first second"s);
+#endif // __cpp_lib_constexpr_string
+
 using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>;
 using utf8_char_types = ::testing::Types<char, char8_t>;
 using char_type_combos = ::testing::Types<
@ -171,7 +194,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u16) {
 		parsed_string += make_hex_string<TypeParam>(codepoint, 4);
 		jessilib::apply_cpp_escape_sequences(parsed_string);

-		auto decode = jessilib::decode_codepoint(parsed_string);
+		auto decode = jessilib::decode_codepoint(parsed_string.data(), parsed_string.size());
 		EXPECT_NE(decode.units, 0);
 		EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint));
 	}
@ -184,7 +207,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) {
 		parsed_string += make_hex_string<TypeParam>(codepoint, 8);
 		jessilib::apply_cpp_escape_sequences(parsed_string);

-		auto decode = jessilib::decode_codepoint(parsed_string);
+		auto decode = jessilib::decode_codepoint(parsed_string.data(), parsed_string.size());
 		EXPECT_NE(decode.units, 0);
 		EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint));
 	}