From edf5b45223b02704e4a4c170c4c125dbf927cde8 Mon Sep 17 00:00:00 2001 From: Jessica James Date: Mon, 6 Dec 2021 13:02:47 -0600 Subject: [PATCH] Simplify & fix low control character handling in make_json_string --- src/common/parsers/json.cpp | 69 +++++++++---------------------------- src/test/parsers/json.cpp | 1 + 2 files changed, 17 insertions(+), 53 deletions(-) diff --git a/src/common/parsers/json.cpp b/src/common/parsers/json.cpp index 175af1d..c22ad78 100644 --- a/src/common/parsers/json.cpp +++ b/src/common/parsers/json.cpp @@ -30,74 +30,37 @@ std::string make_json_string(std::u8string_view in_string) { result.reserve(in_string.size() + 2); result = '\"'; - while (!in_string.empty()) { - if (in_string.front() == '\\') { // backslash + decode_result decode; + while ((decode = decode_codepoint(in_string)).units != 0) { + if (decode.codepoint == U'\\') { // backslash result += '\\'; result += '\\'; } - else if (in_string.front() == '\"') { // quotation + else if (decode.codepoint == U'\"') { // quotation result += '\\'; result += '\"'; } - else if (in_string.front() < 0x20) { // control characters + else if (decode.codepoint < 0x20) { // control characters result += "\\u0000"sv; // overwrite last 2 zeroes with correct hexadecimal sequence char* data_end = result.data() + result.size(); - char* data = data_end - 2; // TODO: this isn't correct, is it? to_chars may only write 1 char in many cases - std::to_chars(data, data_end, static_cast(in_string.front()), 16); // TODO: use decode_codepoint - } - else if ((in_string.front() & 0x80) != 0) { // UTF-8 sequence; copy to bypass above processing - if ((in_string.front() & 0x40) != 0) { - // this is a 2+ byte sequence - - if ((in_string.front() & 0x20) != 0) { - // this is a 3+ byte sequence - - if ((in_string.front() & 0x10) != 0) { - // this is a 4 byte sequence - if (in_string.size() < 4) { - // Invalid sequence encountered (first byte indicates 4 bytes, but less than 4 available) - break; - } - - // This is a 4-byte sequence - result.append(reinterpret_cast(in_string.data()), 4); - in_string.remove_prefix(4); - continue; - } - - if (in_string.size() < 3) { - // Invalid sequence encountered (first byte indicates 3 bytes, but less than 3 available) - break; - } - - // This is a 3-byte sequence - result.append(reinterpret_cast(in_string.data()), 3); - in_string.remove_prefix(3); - continue; - } - - if (in_string.size() < 2) { - // Invalid sequence encountered (first byte indicates 2 bytes, but less than 2 available) - break; - } - - // This is a 2-byte sequence - result.append(reinterpret_cast(in_string.data()), 2); - in_string.remove_prefix(2); - continue; + char* data = data_end - 2; // Will only ever use 2 chars + auto to_chars_result = std::to_chars(data, data_end, static_cast(decode.codepoint), 16); + if (to_chars_result.ec == std::errc{} && to_chars_result.ptr != data_end) { + // Only 1 byte written; shift it over + *to_chars_result.ptr = *(to_chars_result.ptr - 1); + + // And fill in the zeroes + *(to_chars_result.ptr - 1) = '0'; } - - // Invalid sequence encountered (first bit is 1, but not second) - break; } else { - // Character in standard ASCII table - result += static_cast(in_string.front()); + // Valid UTF-8 sequence; copy it over + result.append(reinterpret_cast(in_string.data()), decode.units); } - in_string.remove_prefix(1); + in_string.remove_prefix(decode.units); } result += '\"'; diff --git a/src/test/parsers/json.cpp b/src/test/parsers/json.cpp index 50f5e04..b7bcda3 100644 --- a/src/test/parsers/json.cpp +++ b/src/test/parsers/json.cpp @@ -58,6 +58,7 @@ TEST(JsonParser, serialize_string) { EXPECT_EQ(parser.serialize(u8"text"), R"json("text")json"); expect_eq(parser.serialize(u8"\"text\""), R"json("\"text\"")json"); + expect_eq(parser.serialize(u8"\"te\x01xt\""), R"json("\"te\u0001xt\"")json"); expect_eq(parser.serialize(u8"\"te\x10xt\""), R"json("\"te\u0010xt\"")json"); }