diff --git a/src/common/unicode.cpp b/src/common/unicode.cpp index 5e28030..a3abc32 100644 --- a/src/common/unicode.cpp +++ b/src/common/unicode.cpp @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018 Jessica James. + * Copyright (C) 2018-2021 Jessica James. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -549,7 +549,7 @@ char32_t fold(char32_t in_codepoint) { return match->fold(in_codepoint); } -const unsigned char base_table[]{ +static constexpr unsigned char base_table[]{ 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127, 127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, diff --git a/src/include/jessilib/unicode_sequence.hpp b/src/include/jessilib/unicode_sequence.hpp index e0a2ce9..4464c82 100644 --- a/src/include/jessilib/unicode_sequence.hpp +++ b/src/include/jessilib/unicode_sequence.hpp @@ -399,9 +399,32 @@ bool apply_cpp_escape_sequences(std::basic_string& inout_string) { * Query string escape sequence parser */ -static constexpr shrink_sequence_tree http_query_escapes_root_tree{ - make_hex_sequence_pair(), - make_simple_sequence_pair() +template* = nullptr> // make_hex_sequence_pair isn't going to play well with other types +static constexpr shrink_sequence_tree http_query_escapes_root_tree{ + make_hex_sequence_pair(), + make_simple_sequence_pair() }; +static_assert(is_sorted, std::size(http_query_escapes_root_tree)>(), "Tree must be pre-sorted"); +static_assert(is_sorted, std::size(http_query_escapes_root_tree)>(), "Tree must be pre-sorted"); + +template* = nullptr> +bool deserialize_http_query(std::basic_string& inout_string) { + return apply_shrink_sequence_tree, std::size(http_query_escapes_root_tree)>(inout_string); +} + +// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement +// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in +// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence) +/*template* = nullptr> +bool deserialize_http_query(std::basic_string& inout_string) { + //TODO: optimize this? + std::basic_string u8query_string = string_cast(inout_string); + bool result = deserialize_http_query(u8query_string); + inout_string = string_cast(u8query_string); + return result; +}*/ } // namespace jessilib diff --git a/src/test/unicode_sequence.cpp b/src/test/unicode_sequence.cpp index 5067714..83e4c22 100644 --- a/src/test/unicode_sequence.cpp +++ b/src/test/unicode_sequence.cpp @@ -21,6 +21,7 @@ #include "test.hpp" using char_types = ::testing::Types; +using utf8_char_types = ::testing::Types; using char_type_combos = ::testing::Types< std::pair, std::pair, std::pair, std::pair, std::pair, std::pair, std::pair, std::pair, @@ -33,6 +34,14 @@ public: }; TYPED_TEST_SUITE(UnicodeSequenceTest, char_types); +template +class UnicodeUTF8SequenceTest : public ::testing::Test { +public: +}; +TYPED_TEST_SUITE(UnicodeUTF8SequenceTest, utf8_char_types); + +constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing + #define TEST_CPP_SEQUENCE(expr) \ { auto parsed_string = jessilib::string_cast(#expr); \ auto normal_string = jessilib::string_cast(expr); \ @@ -138,9 +147,9 @@ TYPED_TEST(UnicodeSequenceTest, cpp_hex) { return; } - // "x0" -> "x10ffff" with & without leading zeroes + // "x0" -> "x100ff" with & without leading zeroes if constexpr (sizeof(TypeParam) == 4) { - for (unsigned int codepoint = 0; codepoint <= 0x10000; ++codepoint) { + for (unsigned int codepoint = 0; codepoint <= MAX_LOOP_CODEPOINT; ++codepoint) { std::basic_string parsed_string; for (size_t min_length = 0; min_length <= 8; ++min_length) { // "\x0" -> "\x0010ffff" @@ -156,7 +165,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_hex) { } TYPED_TEST(UnicodeSequenceTest, cpp_u16) { - // "u000" -> "uffff" with & without leading zeroes + // "u0000" -> "uffff" with & without leading zeroes for (unsigned int codepoint = 0; codepoint <= 0xFFFF; ++codepoint) { std::basic_string parsed_string = jessilib::string_cast("\\u"); parsed_string += make_hex_string(codepoint, 4); @@ -169,8 +178,8 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u16) { } TYPED_TEST(UnicodeSequenceTest, cpp_u32) { - // "u000" -> "uffff" with & without leading zeroes - for (unsigned int codepoint = 0; codepoint <= 0x100FF; ++codepoint) { + // "U00000000" -> "U000100FF" with & without leading zeroes + for (unsigned int codepoint = 0; codepoint <= MAX_LOOP_CODEPOINT; ++codepoint) { std::basic_string parsed_string = jessilib::string_cast("\\U"); parsed_string += make_hex_string(codepoint, 8); jessilib::apply_cpp_escape_sequences(parsed_string); @@ -180,3 +189,123 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) { EXPECT_EQ(decode.codepoint, static_cast(codepoint)); } } + +/** + * Query strings + */ + +TYPED_TEST(UnicodeUTF8SequenceTest, single_chars) { + // [U+0000, U+100FF) + for (char32_t codepoint = 0; codepoint < MAX_LOOP_CODEPOINT; ++codepoint) { + std::basic_string expected; + size_t units = jessilib::encode_codepoint(expected, codepoint); + EXPECT_NE(units, 0); + EXPECT_EQ(units, expected.size()); + + // Construct the query string + std::basic_string query_string; + for (auto& unit : expected) { + char encoded[3] { '%', 0, 0 }; + char* encoded_end = encoded + sizeof(encoded); + auto to_chars_result = std::to_chars(encoded + 1, encoded_end, static_cast(unit), 16); + ASSERT_EQ(to_chars_result.ec, std::errc{}) // assertion will fail when `unit` is signed type + << "For unit " << static_cast(unit) << " in codepoint " << static_cast(codepoint) << std::endl; + + if (to_chars_result.ptr != encoded_end) { + // Only wrote one hex; shift it + encoded[2] = encoded[1]; + encoded[1] = '0'; + } + + EXPECT_EQ(encoded[0], '%'); + EXPECT_NE(encoded[1], 0); + EXPECT_NE(encoded[2], 0); + query_string.insert(query_string.end(), encoded, encoded_end); + } + EXPECT_EQ(query_string.size(), expected.size() * 3); + + // Decode & check the query string + jessilib::deserialize_http_query(query_string); + EXPECT_EQ(query_string, expected); + } +} + +TYPED_TEST(UnicodeUTF8SequenceTest, invalids) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFF; ++unit) { + TypeParam encoded[2] { '%', static_cast(unit) }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFFFF; ++unit) { + TypeParam first = static_cast(unit >> 8); // order of these two doesn't matter + TypeParam second = static_cast(unit & 0xFF); + if (jessilib::as_base(first, 16) >= 0 + && jessilib::as_base(second, 16) >= 0) { + continue; + } + TypeParam encoded[3] { '%', static_cast(first), static_cast(second) }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TYPED_TEST(UnicodeUTF8SequenceTest, invalids_trailing) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFF; ++unit) { + TypeParam encoded[3] { '%', static_cast(unit), '%' }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +} + +TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len_trailing) { + std::basic_string query_string, long_query_string; + for (size_t unit = 0; unit <= 0xFFFF; ++unit) { + TypeParam first = static_cast(unit >> 8); // order of these two doesn't matter + TypeParam second = static_cast(unit & 0xFF); + if (jessilib::as_base(first, 16) >= 0 + && jessilib::as_base(second, 16) >= 0) { + continue; + } + TypeParam encoded[4] { '%', static_cast(first), static_cast(second), '%' }; + TypeParam* encoded_end = encoded + sizeof(encoded); + query_string.insert(query_string.end(), encoded, encoded_end); + + long_query_string += query_string; + jessilib::deserialize_http_query(query_string); + EXPECT_TRUE(query_string.empty()) + << "in unit: " << unit << std::endl; + } + + jessilib::deserialize_http_query(long_query_string); + EXPECT_TRUE(long_query_string.empty()); +}