Browse Source

Simplify & fix low control character handling in make_json_string

master
Jessica James 3 years ago
parent
commit
edf5b45223
  1. 67
      src/common/parsers/json.cpp
  2. 1
      src/test/parsers/json.cpp

67
src/common/parsers/json.cpp

@ -30,74 +30,37 @@ std::string make_json_string(std::u8string_view in_string) {
result.reserve(in_string.size() + 2); result.reserve(in_string.size() + 2);
result = '\"'; result = '\"';
while (!in_string.empty()) { decode_result decode;
if (in_string.front() == '\\') { // backslash while ((decode = decode_codepoint(in_string)).units != 0) {
if (decode.codepoint == U'\\') { // backslash
result += '\\'; result += '\\';
result += '\\'; result += '\\';
} }
else if (in_string.front() == '\"') { // quotation else if (decode.codepoint == U'\"') { // quotation
result += '\\'; result += '\\';
result += '\"'; result += '\"';
} }
else if (in_string.front() < 0x20) { // control characters else if (decode.codepoint < 0x20) { // control characters
result += "\\u0000"sv; result += "\\u0000"sv;
// overwrite last 2 zeroes with correct hexadecimal sequence // overwrite last 2 zeroes with correct hexadecimal sequence
char* data_end = result.data() + result.size(); char* data_end = result.data() + result.size();
char* data = data_end - 2; // TODO: this isn't correct, is it? to_chars may only write 1 char in many cases char* data = data_end - 2; // Will only ever use 2 chars
std::to_chars(data, data_end, static_cast<char>(in_string.front()), 16); // TODO: use decode_codepoint auto to_chars_result = std::to_chars(data, data_end, static_cast<uint32_t>(decode.codepoint), 16);
} if (to_chars_result.ec == std::errc{} && to_chars_result.ptr != data_end) {
else if ((in_string.front() & 0x80) != 0) { // UTF-8 sequence; copy to bypass above processing // Only 1 byte written; shift it over
if ((in_string.front() & 0x40) != 0) { *to_chars_result.ptr = *(to_chars_result.ptr - 1);
// this is a 2+ byte sequence
if ((in_string.front() & 0x20) != 0) {
// this is a 3+ byte sequence
if ((in_string.front() & 0x10) != 0) {
// this is a 4 byte sequence
if (in_string.size() < 4) {
// Invalid sequence encountered (first byte indicates 4 bytes, but less than 4 available)
break;
}
// This is a 4-byte sequence
result.append(reinterpret_cast<const char*>(in_string.data()), 4);
in_string.remove_prefix(4);
continue;
}
if (in_string.size() < 3) {
// Invalid sequence encountered (first byte indicates 3 bytes, but less than 3 available)
break;
}
// This is a 3-byte sequence
result.append(reinterpret_cast<const char*>(in_string.data()), 3);
in_string.remove_prefix(3);
continue;
}
if (in_string.size() < 2) {
// Invalid sequence encountered (first byte indicates 2 bytes, but less than 2 available)
break;
}
// This is a 2-byte sequence // And fill in the zeroes
result.append(reinterpret_cast<const char*>(in_string.data()), 2); *(to_chars_result.ptr - 1) = '0';
in_string.remove_prefix(2);
continue;
} }
// Invalid sequence encountered (first bit is 1, but not second)
break;
} }
else { else {
// Character in standard ASCII table // Valid UTF-8 sequence; copy it over
result += static_cast<char>(in_string.front()); result.append(reinterpret_cast<const char*>(in_string.data()), decode.units);
} }
in_string.remove_prefix(1); in_string.remove_prefix(decode.units);
} }
result += '\"'; result += '\"';

1
src/test/parsers/json.cpp

@ -58,6 +58,7 @@ TEST(JsonParser, serialize_string) {
EXPECT_EQ(parser.serialize(u8"text"), R"json("text")json"); EXPECT_EQ(parser.serialize(u8"text"), R"json("text")json");
expect_eq(parser.serialize(u8"\"text\""), R"json("\"text\"")json"); expect_eq(parser.serialize(u8"\"text\""), R"json("\"text\"")json");
expect_eq(parser.serialize(u8"\"te\x01xt\""), R"json("\"te\u0001xt\"")json");
expect_eq(parser.serialize(u8"\"te\x10xt\""), R"json("\"te\u0010xt\"")json"); expect_eq(parser.serialize(u8"\"te\x10xt\""), R"json("\"te\u0010xt\"")json");
} }

Loading…
Cancel
Save