Browse Source

add deserialize_http_query

master
Jessica James 3 years ago
parent
commit
a8cc0237f8
  1. 4
      src/common/unicode.cpp
  2. 29
      src/include/jessilib/unicode_sequence.hpp
  3. 139
      src/test/unicode_sequence.cpp

4
src/common/unicode.cpp

@ -1,5 +1,5 @@
/** /**
* Copyright (C) 2018 Jessica James. * Copyright (C) 2018-2021 Jessica James.
* *
* Permission to use, copy, modify, and/or distribute this software for any * Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above * purpose with or without fee is hereby granted, provided that the above
@ -549,7 +549,7 @@ char32_t fold(char32_t in_codepoint) {
return match->fold(in_codepoint); return match->fold(in_codepoint);
} }
const unsigned char base_table[]{ static constexpr unsigned char base_table[]{
127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 127, 127, 127, 127, 127, 127,
127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127, 127, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 127, 127, 127, 127, 127,

29
src/include/jessilib/unicode_sequence.hpp

@ -399,9 +399,32 @@ bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
* Query string escape sequence parser * Query string escape sequence parser
*/ */
static constexpr shrink_sequence_tree<char8_t> http_query_escapes_root_tree{ template<typename CharT,
make_hex_sequence_pair<char8_t, U'%', 2, false, false>(), std::enable_if_t<sizeof(CharT) == 1>* = nullptr> // make_hex_sequence_pair isn't going to play well with other types
make_simple_sequence_pair<char8_t, U'+', ' '>() static constexpr shrink_sequence_tree<CharT> http_query_escapes_root_tree{
make_hex_sequence_pair<CharT, U'%', 2, true, false>(),
make_simple_sequence_pair<CharT, U'+', ' '>()
}; };
// Compile-time checks that the query-string escape tree is sorted for each 1-byte character type it is
// instantiated with here (char and char8_t). Presumably the sequence lookup depends on this ordering --
// confirm against apply_shrink_sequence_tree.
static_assert(is_sorted<char, http_query_escapes_root_tree<char>, std::size(http_query_escapes_root_tree<char>)>(), "Tree must be pre-sorted");
static_assert(is_sorted<char8_t, http_query_escapes_root_tree<char8_t>, std::size(http_query_escapes_root_tree<char8_t>)>(), "Tree must be pre-sorted");
/**
 * Decodes HTTP query-string escape sequences in place.
 *
 * Applies the sequences registered in http_query_escapes_root_tree<CharT> (the '%' hex escape and the
 * '+' escape) to the given string via apply_shrink_sequence_tree. Only participates in overload
 * resolution for 1-byte character types.
 *
 * @tparam CharT Character type of the string; must satisfy sizeof(CharT) == 1
 * @param inout_string String to decode; modified in place
 * @return The result of apply_shrink_sequence_tree (presumably true on success and false when an invalid
 * sequence is encountered -- confirm against that function's documentation)
 */
template<typename CharT,
	std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
	// Number of root entries in the escape tree for this character type
	constexpr auto root_tree_size = std::size(http_query_escapes_root_tree<CharT>);

	return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, root_tree_size>(inout_string);
}
// TODO: decide whether to take the approach below, where query strings are assumed to represent UTF-8 text data, OR
// implement deserialize_http_query such that it assumes the encoding of the string it is called with (i.e: calling with
// char16_t would store each escaped value directly in a char16_t unit, rather than treating the values as a UTF-8 sequence)
/*template<typename CharT,
std::enable_if_t<sizeof(CharT) != 1>* = nullptr>
bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
//TODO: optimize this?
std::basic_string<char8_t> u8query_string = string_cast<char8_t>(inout_string);
bool result = deserialize_http_query<char8_t>(u8query_string);
inout_string = string_cast<CharT>(u8query_string);
return result;
}*/
} // namespace jessilib } // namespace jessilib

139
src/test/unicode_sequence.cpp

@ -21,6 +21,7 @@
#include "test.hpp" #include "test.hpp"
using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>; using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>;
using utf8_char_types = ::testing::Types<char, char8_t>;
using char_type_combos = ::testing::Types< using char_type_combos = ::testing::Types<
std::pair<char, char>, std::pair<char, char8_t>, std::pair<char, char16_t>, std::pair<char, char32_t>, std::pair<char, char>, std::pair<char, char8_t>, std::pair<char, char16_t>, std::pair<char, char32_t>,
std::pair<char8_t, char>, std::pair<char8_t, char8_t>, std::pair<char8_t, char16_t>, std::pair<char8_t, char32_t>, std::pair<char8_t, char>, std::pair<char8_t, char8_t>, std::pair<char8_t, char16_t>, std::pair<char8_t, char32_t>,
@ -33,6 +34,14 @@ public:
}; };
TYPED_TEST_SUITE(UnicodeSequenceTest, char_types); TYPED_TEST_SUITE(UnicodeSequenceTest, char_types);
// Typed test fixture for tests that only apply to 1-byte character types. It carries no state of its own;
// TYPED_TEST_SUITE instantiates its tests once per type listed in utf8_char_types.
template<typename T>
class UnicodeUTF8SequenceTest : public ::testing::Test {
public:
};
TYPED_TEST_SUITE(UnicodeUTF8SequenceTest, utf8_char_types);
// Upper bound used by the tests in this file that loop over codepoints. The default sits just above
// U+FFFF; presumably it is kept small to limit test runtime.
constexpr char32_t MAX_LOOP_CODEPOINT = 0x100FF; // use 0x10FFFF for full testing
#define TEST_CPP_SEQUENCE(expr) \ #define TEST_CPP_SEQUENCE(expr) \
{ auto parsed_string = jessilib::string_cast<TypeParam>(#expr); \ { auto parsed_string = jessilib::string_cast<TypeParam>(#expr); \
auto normal_string = jessilib::string_cast<TypeParam>(expr); \ auto normal_string = jessilib::string_cast<TypeParam>(expr); \
@ -138,9 +147,9 @@ TYPED_TEST(UnicodeSequenceTest, cpp_hex) {
return; return;
} }
// "x0" -> "x10ffff" with & without leading zeroes // "x0" -> "x100ff" with & without leading zeroes
if constexpr (sizeof(TypeParam) == 4) { if constexpr (sizeof(TypeParam) == 4) {
for (unsigned int codepoint = 0; codepoint <= 0x10000; ++codepoint) { for (unsigned int codepoint = 0; codepoint <= MAX_LOOP_CODEPOINT; ++codepoint) {
std::basic_string<TypeParam> parsed_string; std::basic_string<TypeParam> parsed_string;
for (size_t min_length = 0; min_length <= 8; ++min_length) { for (size_t min_length = 0; min_length <= 8; ++min_length) {
// "\x0" -> "\x0010ffff" // "\x0" -> "\x0010ffff"
@ -156,7 +165,7 @@ TYPED_TEST(UnicodeSequenceTest, cpp_hex) {
} }
TYPED_TEST(UnicodeSequenceTest, cpp_u16) { TYPED_TEST(UnicodeSequenceTest, cpp_u16) {
// "u000" -> "uffff" with & without leading zeroes // "u0000" -> "uffff" with & without leading zeroes
for (unsigned int codepoint = 0; codepoint <= 0xFFFF; ++codepoint) { for (unsigned int codepoint = 0; codepoint <= 0xFFFF; ++codepoint) {
std::basic_string<TypeParam> parsed_string = jessilib::string_cast<TypeParam>("\\u"); std::basic_string<TypeParam> parsed_string = jessilib::string_cast<TypeParam>("\\u");
parsed_string += make_hex_string<TypeParam>(codepoint, 4); parsed_string += make_hex_string<TypeParam>(codepoint, 4);
@ -169,8 +178,8 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u16) {
} }
TYPED_TEST(UnicodeSequenceTest, cpp_u32) { TYPED_TEST(UnicodeSequenceTest, cpp_u32) {
// "u000" -> "uffff" with & without leading zeroes // "U00000000" -> "U000100FF" with & without leading zeroes
for (unsigned int codepoint = 0; codepoint <= 0x100FF; ++codepoint) { for (unsigned int codepoint = 0; codepoint <= MAX_LOOP_CODEPOINT; ++codepoint) {
std::basic_string<TypeParam> parsed_string = jessilib::string_cast<TypeParam>("\\U"); std::basic_string<TypeParam> parsed_string = jessilib::string_cast<TypeParam>("\\U");
parsed_string += make_hex_string<TypeParam>(codepoint, 8); parsed_string += make_hex_string<TypeParam>(codepoint, 8);
jessilib::apply_cpp_escape_sequences(parsed_string); jessilib::apply_cpp_escape_sequences(parsed_string);
@ -180,3 +189,123 @@ TYPED_TEST(UnicodeSequenceTest, cpp_u32) {
EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint)); EXPECT_EQ(decode.codepoint, static_cast<char32_t>(codepoint));
} }
} }
/**
* Query strings
*/
/**
 * Round-trip test: for every codepoint in [U+0000, MAX_LOOP_CODEPOINT), encodes the codepoint, writes each
 * resulting code unit as a "%XX" escape, and verifies that deserialize_http_query turns the escaped string
 * back into the originally encoded units.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, single_chars) {
	// [U+0000, U+100FF)
	for (char32_t codepoint = 0; codepoint < MAX_LOOP_CODEPOINT; ++codepoint) {
		std::basic_string<TypeParam> expected;
		size_t units = jessilib::encode_codepoint(expected, codepoint);
		EXPECT_NE(units, 0U); // unsigned literal avoids a signed/unsigned comparison inside the gtest helper
		EXPECT_EQ(units, expected.size());

		// Construct the query string
		std::basic_string<TypeParam> query_string;
		for (const auto& unit : expected) {
			char encoded[3] { '%', 0, 0 };
			char* encoded_end = std::end(encoded);

			// Format the unit as an unsigned byte: if a unit were formatted as a negative number, the output
			// would include a '-' sign and would either be wrong or not fit in the two available characters
			auto to_chars_result = std::to_chars(encoded + 1, encoded_end, static_cast<unsigned char>(unit), 16);
			ASSERT_EQ(to_chars_result.ec, std::errc{})
				<< "For unit " << static_cast<int>(unit) << " in codepoint " << static_cast<int>(codepoint);

			if (to_chars_result.ptr != encoded_end) {
				// Only wrote one hex; shift it
				encoded[2] = encoded[1];
				encoded[1] = '0';
			}

			// Sanity-check the escape before appending it
			EXPECT_EQ(encoded[0], '%');
			EXPECT_NE(encoded[1], 0);
			EXPECT_NE(encoded[2], 0);
			query_string.insert(query_string.end(), encoded, encoded_end);
		}
		EXPECT_EQ(query_string.size(), expected.size() * 3);

		// Decode & check the query string
		jessilib::deserialize_http_query(query_string);
		EXPECT_EQ(query_string, expected);
	}
}
/**
 * Feeds deserialize_http_query a '%' followed by exactly one code unit (every value 0x00-0xFF), which is too
 * short to be a two-digit hex escape, and expects the string to be empty afterwards. The same inputs are
 * also accumulated into one long string, which is expected to end up empty as well.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFF; ++unit) {
		const TypeParam encoded[2] { '%', static_cast<TypeParam>(unit) };

		// std::begin/std::end count elements, so the range stays correct regardless of sizeof(TypeParam)
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Feeds deserialize_http_query a '%' followed by two code units, covering every pair of byte values for which
 * at least one unit is rejected by as_base(..., 16), and expects the string to be empty afterwards. The same
 * inputs are also accumulated into one long string, which is expected to end up empty as well.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFFFF; ++unit) {
		const TypeParam first = static_cast<TypeParam>(unit >> 8); // order of these two doesn't matter
		const TypeParam second = static_cast<TypeParam>(unit & 0xFF);

		// Pairs where as_base yields a non-negative value for both units are skipped; only pairs with at least
		// one rejected unit are tested here
		if (jessilib::as_base(first, 16) >= 0
			&& jessilib::as_base(second, 16) >= 0) {
			continue;
		}

		const TypeParam encoded[3] { '%', first, second };

		// std::begin/std::end count elements, so the range stays correct regardless of sizeof(TypeParam)
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Same as the single-unit invalid test, but with an additional trailing '%': feeds deserialize_http_query
 * '%', one code unit (every value 0x00-0xFF), then '%', and expects the string to be empty afterwards. The
 * same inputs are also accumulated into one long string, which is expected to end up empty as well.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids_trailing) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFF; ++unit) {
		const TypeParam encoded[3] { '%', static_cast<TypeParam>(unit), '%' };

		// std::begin/std::end count elements, so the range stays correct regardless of sizeof(TypeParam)
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}
/**
 * Same as the two-unit invalid test, but with an additional trailing '%': feeds deserialize_http_query '%',
 * two code units of which at least one is rejected by as_base(..., 16), then '%', and expects the string to
 * be empty afterwards. The same inputs are also accumulated into one long string, which is expected to end
 * up empty as well.
 */
TYPED_TEST(UnicodeUTF8SequenceTest, invalids_2len_trailing) {
	std::basic_string<TypeParam> query_string, long_query_string;
	for (size_t unit = 0; unit <= 0xFFFF; ++unit) {
		const TypeParam first = static_cast<TypeParam>(unit >> 8); // order of these two doesn't matter
		const TypeParam second = static_cast<TypeParam>(unit & 0xFF);

		// Pairs where as_base yields a non-negative value for both units are skipped; only pairs with at least
		// one rejected unit are tested here
		if (jessilib::as_base(first, 16) >= 0
			&& jessilib::as_base(second, 16) >= 0) {
			continue;
		}

		const TypeParam encoded[4] { '%', first, second, '%' };

		// std::begin/std::end count elements, so the range stays correct regardless of sizeof(TypeParam)
		query_string.insert(query_string.end(), std::begin(encoded), std::end(encoded));
		long_query_string += query_string;

		jessilib::deserialize_http_query(query_string);
		EXPECT_TRUE(query_string.empty())
			<< "in unit: " << unit;
	}

	jessilib::deserialize_http_query(long_query_string);
	EXPECT_TRUE(long_query_string.empty());
}

Loading…
Cancel
Save