Browse Source

Use generic escape sequence parser for json

master
Jessica James 3 years ago
parent
commit
b80b353b3b
  1. 135
      src/common/parsers/json.cpp

135
src/common/parsers/json.cpp

@ -19,6 +19,7 @@
#include "parsers/json.hpp"
#include <charconv>
#include "unicode.hpp"
#include "unicode_sequence.hpp"
#include "util.hpp"
using namespace std::literals;
@ -101,135 +102,41 @@ std::u8string read_json_string(std::string_view& in_data) {
// Remove leading quotation
in_data.remove_prefix(1);
// Iterate over view until we reach the ending quotation, or the end of the view
while (!in_data.empty()) {
switch (in_data.front()) {
// Escape sequence
case '\\':
// strip '\'
in_data.remove_prefix(1);
if (in_data.empty()) {
throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing escape sequence" };
throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
}
// Parse escape type
switch (in_data.front()) {
// Quote
case '\"':
in_data.remove_prefix(1);
result += u8'\"';
break;
// Backslash
case '\\':
in_data.remove_prefix(1);
result += u8'\\';
break;
// Forward slash
case '/':
in_data.remove_prefix(1);
result += u8'/';
break;
// Backspace
case 'b':
in_data.remove_prefix(1);
result += u8'\b';
break;
// Formfeed
case 'f':
in_data.remove_prefix(1);
result += u8'\f';
break;
// Newline
case 'n':
in_data.remove_prefix(1);
result += u8'\n';
break;
// Carriage return
case 'r':
in_data.remove_prefix(1);
result += u8'\r';
break;
// Horizontal tab
case 't':
if (in_data.front() == '\"') {
in_data.remove_prefix(1);
result += u8'\t';
break;
// Unicode codepoint
case 'u': {
in_data.remove_prefix(1); // strip 'u'
if (in_data.size() < 4) {
throw std::invalid_argument{
"Invalid JSON data; unexpected end of data when parsing unicode escape sequence" };
advance_whitespace(in_data); // strip trailing spaces
return result;
}
char16_t codepoint = get_codepoint_from_hex(in_data);
in_data.remove_prefix(4); // strip codepoint hex
if (is_high_surrogate(codepoint) // If we have a high surrogate...
&& in_data.size() >= 6) { // And we have enough room for "\uXXXX"...
// Special case: we just parsed a high surrogate. Handle this with the low surrogate, if there is one
if (in_data.substr(0, 2) == "\\u"sv) {
// Another codepoint follows; read it in
in_data.remove_prefix(2); // strip "\u"
char16_t second_codepoint = get_codepoint_from_hex(in_data);
in_data.remove_prefix(4); // strip codepoint hex
if (is_low_surrogate(second_codepoint)) {
// We've got a valid surrogate pair; serialize the represented codepoint; decode it
codepoint = static_cast<char16_t>(decode_surrogate_pair(codepoint, second_codepoint).codepoint);
encode_codepoint(result, codepoint); // serialize the real codepoint
}
else {
// This is not a valid surrogate pair; serialize the codepoints directly
encode_codepoint(result, codepoint);
encode_codepoint(result, second_codepoint);
}
continue;
}
size_t search_start = 1;
size_t end_pos;
while ((end_pos = in_data.find('\"', search_start)) != std::string_view::npos) {
// Quote found; check if it's escaped
if (in_data[end_pos - 1] != '\\') {
// Unescaped quote; must be end of string
break;
}
encode_codepoint(result, codepoint);
continue;
search_start = end_pos + 1;
}
default:
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing escape sequence" };
if (end_pos == std::string_view::npos) {
throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
}
break;
// End of string
case '\"':
in_data.remove_prefix(1); // strip trailing quotation
std::u8string_view string_data = jessilib::string_view_cast<char8_t>(in_data.substr(0, end_pos));
in_data.remove_prefix(string_data.size() + 1);
advance_whitespace(in_data); // strip trailing spaces
return result;
// Unicode sequence
default: {
auto codepoint = decode_codepoint(in_data);
if (codepoint.units == 0) {
// Invalid unicode sequence
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing string" };
result = string_data;
if (!jessilib::apply_cpp_escape_sequences(result)) {
throw std::invalid_argument{ jessilib::join<std::string>("Invalid JSON data; invalid token or end of string: "sv, string_data) };
}
// Valid unicode sequence
result.append(reinterpret_cast<const char8_t*>(in_data.data()), codepoint.units);
in_data.remove_prefix(codepoint.units);
break;
}
}
}
// We reached the end of the string_view before encountering an ending quote
throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
return result;
}
object read_json_number(std::string_view& in_data) {

Loading…
Cancel
Save