Browse Source

Adjust json_parser to use the new syntax_tree machinery, proving out the syntax_tree structure for object readers and enabling usage with non-UTF-8 encodings; this still requires additional cleanup/work. Also added default parser initialization.

master
Jessica James 3 years ago
parent
commit
6ee1786fc2
  1. 6
      src/common/parser/parser_manager.cpp
  2. 274
      src/common/parsers/json.cpp
  3. 2
      src/include/impl/parser_manager.hpp
  4. 41
      src/include/jessilib/http_query.hpp
  5. 415
      src/include/jessilib/parsers/json.hpp
  6. 23
      src/include/jessilib/unicode.hpp
  7. 89
      src/include/jessilib/unicode_syntax.hpp
  8. 67
      src/include/jessilib/util.hpp
  9. 18
      src/test/parsers/json.cpp

6
src/common/parser/parser_manager.cpp

@ -17,6 +17,7 @@
*/ */
#include "impl/parser_manager.hpp" #include "impl/parser_manager.hpp"
#include "parsers/json.hpp" // only for default-registration
#include "parser.hpp" #include "parser.hpp"
#include "assert.hpp" #include "assert.hpp"
@ -49,6 +50,11 @@ bool parser_manager::registration::operator<(const registration& rhs) const {
return m_format < rhs.m_format; return m_format < rhs.m_format;
} }
parser_manager::parser_manager() {
	// Library-provided default parsers are added here, at construction time, instead of through
	// self-registration; this keeps static initialization zero-cost when the manager is never used
	std::shared_ptr<parser> default_json_parser = std::make_shared<json_parser>();
	register_parser(std::move(default_json_parser), "json", false);
}
parser_manager::id parser_manager::register_parser(std::shared_ptr<parser> in_parser, const std::string& in_format, bool in_force) { parser_manager::id parser_manager::register_parser(std::shared_ptr<parser> in_parser, const std::string& in_format, bool in_force) {
std::lock_guard<std::shared_mutex> guard{ m_mutex }; std::lock_guard<std::shared_mutex> guard{ m_mutex };

274
src/common/parsers/json.cpp

@ -18,9 +18,6 @@
#include "parsers/json.hpp" #include "parsers/json.hpp"
#include <charconv> #include <charconv>
#include "unicode.hpp"
#include "unicode_sequence.hpp"
#include "util.hpp"
using namespace std::literals; using namespace std::literals;
@ -68,277 +65,12 @@ std::string make_json_string(std::u8string_view in_string) {
return result; return result;
} }
// Strips leading JSON whitespace (space, tab, carriage return, line feed) from the front of in_data, in place
void advance_whitespace(std::string_view& in_data) {
	const size_t first_non_space = in_data.find_first_not_of(" \t\r\n");
	if (first_non_space == std::string_view::npos) {
		// Nothing but whitespace (or nothing at all); consume everything
		in_data.remove_prefix(in_data.size());
		return;
	}

	in_data.remove_prefix(first_non_space);
}
/**
 * Parses exactly four hexadecimal digits from the front of in_data into a 16-bit value
 *
 * @param in_data View whose first four characters are the hex digits of a unicode escape sequence
 * @return The value encoded by those four digits
 * @throws std::invalid_argument if fewer than four characters are available, or any of the first four is not a hex digit
 */
uint16_t get_codepoint_from_hex(const std::string_view& in_data) {
	constexpr size_t hex_digit_count = 4;

	// Never read past the end of the view; a truncated escape sequence is malformed input
	if (in_data.size() < hex_digit_count) {
		throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing unicode escape sequence" };
	}

	uint16_t value{};
	const char* const end = in_data.data() + hex_digit_count;
	const char* const parsed_end = std::from_chars(in_data.data(), end, value, 16).ptr;
	if (parsed_end != end) {
		// from_chars stopped early; parsed_end points at the offending character (still within the four digits)
		throw std::invalid_argument{ std::string{ "Invalid JSON data; unexpected token: '" } + *parsed_end + "' when parsing unicode escape sequence" };
	}

	return value;
}
/**
 * Reads a quoted JSON string from the front of in_data
 *
 * On entry in_data must be non-empty and begin with the opening quote. On success the string, its closing
 * quote, and any trailing whitespace have been consumed from in_data.
 *
 * @param in_data View positioned at the opening quote; advanced past the string on success
 * @return The string contents with escape sequences applied
 * @throws std::invalid_argument if the closing quote is missing or an escape sequence is invalid
 */
std::u8string read_json_string(std::string_view& in_data) {
	std::u8string result;

	// Remove leading quotation
	in_data.remove_prefix(1);
	if (in_data.empty()) {
		throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
	}

	if (in_data.front() == '\"') {
		in_data.remove_prefix(1);
		advance_whitespace(in_data); // strip trailing spaces
		return result;
	}

	size_t search_start = 1;
	size_t end_pos;
	while ((end_pos = in_data.find('\"', search_start)) != std::string_view::npos) {
		// Quote found; it is escaped only if preceded by an ODD number of backslashes. An even run (e.g. the
		// two backslashes in "\\") is made of escaped backslashes and leaves the quote itself unescaped.
		size_t backslash_count = 0;
		while (backslash_count < end_pos && in_data[end_pos - 1 - backslash_count] == '\\') {
			++backslash_count;
		}

		if (backslash_count % 2 == 0) {
			// Unescaped quote; must be end of string
			break;
		}

		search_start = end_pos + 1;
	}

	if (end_pos == std::string_view::npos) {
		throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
	}

	std::u8string_view string_data = jessilib::string_view_cast<char8_t>(in_data.substr(0, end_pos));
	in_data.remove_prefix(string_data.size() + 1);
	advance_whitespace(in_data); // strip trailing spaces
	result = string_data;
	if (!jessilib::apply_cpp_escape_sequences(result)) {
		throw std::invalid_argument{ jessilib::join<std::string>("Invalid JSON data; invalid token or end of string: "sv, string_data) };
	}

	return result;
}
// Reads a JSON number from the front of in_data, consuming it. Yields the integer value when no '.' follows
// the integer portion; otherwise yields a long double built by parse_decimal_part on top of the integer portion.
// Throws std::invalid_argument when no integer could be parsed at all.
object read_json_number(std::string_view& in_data) {
	// Integer portion
	intmax_t whole_part{};
	const char* cursor = std::from_chars(in_data.data(), in_data.data() + in_data.size(), whole_part).ptr;
	if (cursor == in_data.data()) {
		// Not a single character was consumed
		throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing number" };
	}

	// Drop the integer portion; without a following '.' the integer is the whole number
	in_data.remove_prefix(cursor - in_data.data());
	const bool has_fraction = !in_data.empty() && in_data.front() == '.';
	if (!has_fraction) {
		return whole_part;
	}

	// Fractional portion, accumulated onto the integer portion by parse_decimal_part
	in_data.remove_prefix(1); // strip leading '.'
	long double full_value = static_cast<long double>(whole_part);
	cursor = parse_decimal_part(in_data.data(), in_data.data() + in_data.size(), full_value);
	// TODO: parse exponent

	// Drop the fractional portion
	in_data.remove_prefix(cursor - in_data.data());
	return full_value;
}
// Parses one JSON value from the front of in_data, consuming what was parsed, and returns it as an object.
// Leading whitespace is skipped. Dispatches on the first non-whitespace character: keywords (null/true/false),
// strings, numbers, arrays and maps; arrays and maps recurse into this function for their elements/values.
// Throws std::invalid_argument on an unexpected token or premature end of data.
// Input containing no non-whitespace data yields a default-constructed object.
object read_json_object(std::string_view& in_data) {
while (!in_data.empty()) {
switch (in_data.front()) {
/** Start of null */
case 'n':
// The error message below embeds ALL remaining input, not just the offending token
if (in_data.substr(0, 4) != "null"sv) {
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + std::string{ in_data } + "' when parsing null" };
}
in_data.remove_prefix(4);
return {};
/** Start of boolean (true) */
case 't':
if (in_data.substr(0, 4) != "true"sv) {
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + std::string{ in_data } + "' when parsing boolean" };
}
in_data.remove_prefix(4);
return true;
/** Start of boolean (false) */
case 'f':
if (in_data.substr(0, 5) != "false"sv) {
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + std::string{ in_data } + "' when parsing boolean" };
}
in_data.remove_prefix(5);
return false;
/** Whitespace */
case ' ':
case '\t':
case '\r':
case '\n':
// Skip one whitespace character and re-enter the loop to examine the next character
in_data.remove_prefix(1);
break;
/** Start of string */
case '\"':
return read_json_string(in_data);
/** Start of number */
case '-':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
return read_json_number(in_data);
}
/** Start of array */
case '[': {
// Strip brace and leading whitespace
in_data.remove_prefix(1);
advance_whitespace(in_data);
// Build and populate result
std::vector<object> result;
// NOTE: separators are not enforced by this loop. A ',' is consumed when present, but when the character
// after an element is neither ',' nor ']' the loop simply tries to parse another element; a trailing
// comma directly before ']' is likewise accepted.
while (true) {
if (in_data.empty()) {
throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing object array" };
}
if (in_data.front() == ']') {
// End of array
in_data.remove_prefix(1);
return result;
}
// We've reached the start of an object; parse it into our array
result.push_back(read_json_object(in_data));
// Strip leading whitespace
advance_whitespace(in_data);
if (in_data.empty()) {
throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing object array" };
}
if (in_data.front() == ',') {
// Strip comma and trailing whitespace
in_data.remove_prefix(1);
advance_whitespace(in_data);
}
}
}
/** Start of map */
case '{': {
// Strip brace and leading whitespace
in_data.remove_prefix(1);
advance_whitespace(in_data);
// Build and populate result
object result{ object::map_type{} };
// NOTE: as with arrays, a ',' between pairs is consumed when present but is not required, and a trailing
// comma directly before '}' is accepted; every pair must still begin with a quoted key.
while (true) {
if (in_data.empty()) {
throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing object map" };
}
if (in_data.front() == '}') {
// End of object
in_data.remove_prefix(1);
return result;
}
// Assert that we've reached the start of a key
if (in_data.front() != '\"') {
throw std::invalid_argument{
"Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing object map" };
}
// Read in the key and build a value
// (the entry is created in the map before the ':' and the value have been validated)
auto& value = result[read_json_string(in_data)];
// Verify next character is ':'
if (in_data.empty()) {
throw std::invalid_argument{
"Invalid JSON data; unexpected end of data after parsing map key; expected ':' followed by value" };
}
if (in_data.front() != ':') {
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front()
+ "' when parsing map key (expected ':' instead)" };
}
in_data.remove_prefix(1); // strip ':'
// We've reached an object value; parse it
value = read_json_object(in_data);
// Advance through whitespace to ',' or '}'
advance_whitespace(in_data);
if (in_data.empty()) {
throw std::invalid_argument{
"Invalid JSON data; unexpected end of data after parsing map value; expected '}'" };
}
if (in_data.front() == ',') {
// Strip comma and trailing whitespace
in_data.remove_prefix(1);
advance_whitespace(in_data);
}
}
// Unreachable; above code will always return on success or throw on failure
}
default:
throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing JSON" };
}
}
// No non-whitespace data passed in; return a null object
return {};
}
object json_parser::deserialize(std::string_view in_data) { object json_parser::deserialize(std::string_view in_data) {
return read_json_object(in_data); object result;
deserialize_json<char, true>(result, in_data);
return result;
} }
std::string json_parser::serialize(const object& in_object) { std::string json_parser::serialize(const object& in_object) {

2
src/include/impl/parser_manager.hpp

@ -67,6 +67,8 @@ private:
std::string m_format; std::string m_format;
}; };
parser_manager();
std::shared_mutex m_mutex; std::shared_mutex m_mutex;
id m_last_id{}; id m_last_id{};
std::set<registration> m_registrations; // This set and map could be condensed into a bimap std::set<registration> m_registrations; // This set and map could be condensed into a bimap

41
src/include/jessilib/http_query.hpp

@ -50,19 +50,6 @@ constexpr bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, std::size(http_query_escapes_root_tree<CharT>)>(inout_string); return apply_shrink_sequence_tree<CharT, http_query_escapes_root_tree<CharT>, std::size(http_query_escapes_root_tree<CharT>)>(inout_string);
} }
// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement
// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in
// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence)
/*template<typename CharT,
std::enable_if_t<sizeof(CharT) != 1>* = nullptr>
bool deserialize_http_query(std::basic_string<CharT>& inout_string) {
//TODO: optimize this?
std::basic_string<char8_t> u8query_string = string_cast<char8_t>(inout_string);
bool result = deserialize_http_query<char8_t>(u8query_string);
inout_string = string_cast<CharT>(u8query_string);
return result;
}*/
/** /**
* HTML form parser * HTML form parser
*/ */
@ -79,21 +66,21 @@ struct HTMLFormContext {
template<typename CharT, typename ContextT, char32_t InCodepointV> template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_value_start_pair() { constexpr syntax_tree_member<CharT, ContextT> make_value_start_pair() {
// '=' // '='
return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>&) constexpr { return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>&) constexpr -> size_t {
if (inout_context.value_start != nullptr) { if (inout_context.value_start != nullptr) {
// There's already a value pending; this must just be part of the value. // There's already a value pending; this must just be part of the value.
inout_context.write_head += encode_codepoint(inout_context.write_head, InCodepointV); inout_context.write_head += encode_codepoint(inout_context.write_head, InCodepointV);
return true; return 0;
} }
// Start pending_value // Start pending_value
inout_context.value_start = inout_context.write_head; inout_context.value_start = inout_context.write_head;
return true; return 0;
} }; } };
} }
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT>
constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view<CharT>&) { constexpr size_t value_end_action(ContextT& inout_context, std::basic_string_view<CharT>&) {
const CharT* value_end = inout_context.write_head; const CharT* value_end = inout_context.write_head;
const CharT* key_start = inout_context.key_start; const CharT* key_start = inout_context.key_start;
const CharT* value_start = inout_context.value_start; const CharT* value_start = inout_context.value_start;
@ -106,7 +93,7 @@ constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view<
// Start reading next key // Start reading next key
inout_context.key_start = value_end; inout_context.key_start = value_end;
inout_context.value_start = nullptr; inout_context.value_start = nullptr;
return true; return 0;
} }
// This is a valueless key; terminate the key and push it // This is a valueless key; terminate the key and push it
@ -115,7 +102,7 @@ constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view<
// Start reading next key // Start reading next key
inout_context.key_start = value_end; inout_context.key_start = value_end;
return true; return 0;
} }
template<typename CharT, typename ContextT, char32_t InCodepointV> template<typename CharT, typename ContextT, char32_t InCodepointV>
@ -126,8 +113,12 @@ constexpr syntax_tree_member<CharT, ContextT> make_value_end_pair() {
template<typename CharT, typename ContextT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode> template<typename CharT, typename ContextT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode>
constexpr syntax_tree_member<CharT, ContextT> make_hex_syntax_shrink_pair() { constexpr syntax_tree_member<CharT, ContextT> make_hex_syntax_shrink_pair() {
return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) constexpr { return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) constexpr -> size_t {
return hex_shrink_sequence_action<CharT, MaxDigitsV, ExactDigitsV, IsUnicode>(inout_context.write_head, inout_read_view); if (hex_shrink_sequence_action<CharT, MaxDigitsV, ExactDigitsV, IsUnicode>(inout_context.write_head, inout_read_view)) {
return 0;
}
return std::numeric_limits<size_t>::max();
} }; } };
} }
@ -135,16 +126,16 @@ template<typename CharT, typename ContextT, char32_t InCodepointV, char8_t OutCo
constexpr syntax_tree_member<CharT, ContextT> make_simple_shrink_pair() { constexpr syntax_tree_member<CharT, ContextT> make_simple_shrink_pair() {
return { return {
InCodepointV, InCodepointV,
[](ContextT& inout_context, std::basic_string_view<CharT>&) constexpr { [](ContextT& inout_context, std::basic_string_view<CharT>&) constexpr -> size_t {
*inout_context.write_head = static_cast<CharT>(OutCodepointV); *inout_context.write_head = static_cast<CharT>(OutCodepointV);
++inout_context.write_head; ++inout_context.write_head;
return true; return 0;
} }
}; };
} }
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT>
bool html_form_default_action(decode_result decode, ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) { size_t html_form_default_action(decode_result decode, ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
// A regular character; copy it and advance the read/write heads // A regular character; copy it and advance the read/write heads
CharT*& write_head = inout_context.write_head; CharT*& write_head = inout_context.write_head;
CharT* write_end = write_head + decode.units; CharT* write_end = write_head + decode.units;
@ -154,7 +145,7 @@ bool html_form_default_action(decode_result decode, ContextT& inout_context, std
inout_read_view.remove_prefix(1); inout_read_view.remove_prefix(1);
} }
return true; return 0;
} }
template<typename CharT, typename ContainerT, template<typename CharT, typename ContainerT,

415
src/include/jessilib/parsers/json.hpp

@ -19,6 +19,10 @@
#pragma once #pragma once
#include "jessilib/parser.hpp" #include "jessilib/parser.hpp"
#include "jessilib/unicode.hpp" // join
#include "jessilib/unicode_syntax.hpp" // syntax trees
#include "jessilib/unicode_sequence.hpp" // apply_cpp_escape_sequences
#include "jessilib/util.hpp" // from_chars
namespace jessilib { namespace jessilib {
@ -29,4 +33,415 @@ public:
virtual std::string serialize(const object& in_object) override; virtual std::string serialize(const object& in_object) override;
}; };
/**
* JSON Parse Tree
*/
// TODO: remove this
// Strips leading JSON whitespace (space, tab, carriage return, line feed) from the front of in_data, in place
template<typename CharT>
void advance_whitespace(std::basic_string_view<CharT>& in_data) {
	const auto is_json_whitespace = [](CharT in_character) {
		return in_character == ' ' || in_character == '\t' || in_character == '\r' || in_character == '\n';
	};

	// Measure the whitespace run first, then drop it in a single step
	size_t whitespace_length = 0;
	while (whitespace_length < in_data.size() && is_json_whitespace(in_data[whitespace_length])) {
		++whitespace_length;
	}

	in_data.remove_prefix(whitespace_length);
}
// Context handed to every action in the JSON syntax tree
template<typename CharT, bool UseExceptionsV = true>
struct json_context {
	// When true, actions throw std::invalid_argument on invalid input; when false they only return a failure value
	static constexpr bool use_exceptions = UseExceptionsV;

	// Destination the parsed value is assigned to
	object& out_object;
};
// Doesn't do decoding, because we know our keyword is all basic latin (1 data unit, regardless of encoding)
template<typename CharT>
constexpr bool starts_with_fast(std::basic_string_view<CharT> in_string, std::u8string_view in_substring) {
if (in_string.size() < in_substring.size()) {
return false;
}
const CharT* itr = in_string.data();
for (auto character : in_substring) {
if (*itr != character) {
return false;
}
++itr;
}
return true;
}
/**
 * Builds the syntax tree entry for a JSON keyword (null, true, false)
 *
 * The action checks that the read view starts with KeywordRemainderV (the keyword without its first
 * character, InCodepointV). On a match it stores the keyword's value in the context and consumes the remainder.
 *
 * @tparam InCodepointV First character of the keyword; the tree key
 * @tparam KeywordRemainderV Remaining characters of the keyword
 * @tparam ValueT Type of the value to store; pointer / nullptr_t types denote the null keyword
 * @tparam ValueV Value to store on a match
 * @return Tree entry whose action returns 1 on a match; otherwise it throws std::invalid_argument when
 *         ContextT::use_exceptions is set, or returns std::numeric_limits<size_t>::max()
 */
template<typename CharT, typename ContextT, char32_t InCodepointV, const std::u8string_view& KeywordRemainderV, typename ValueT, ValueT ValueV>
constexpr syntax_tree_member<CharT, ContextT> make_keyword_value_pair() {
	// null, true, false
	return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) constexpr -> size_t {
		constexpr bool is_null_keyword = std::is_pointer_v<ValueT> || std::is_null_pointer_v<ValueT>;

		if (starts_with_fast(inout_read_view, KeywordRemainderV)) {
			// This is the keyword; go ahead and chuck it in
			if constexpr (is_null_keyword) {
				if constexpr (ValueV == nullptr) {
					inout_context.out_object = object{};
				}
			}
			else {
				inout_context.out_object = ValueV;
			}

			inout_read_view.remove_prefix(KeywordRemainderV.size());
			return 1;
		}

		// Unexpected character; throw if appropriate, naming the kind of keyword that was actually being parsed
		if constexpr (ContextT::use_exceptions) {
			using namespace std::literals;
			if constexpr (is_null_keyword) {
				throw std::invalid_argument{ jessilib::join<std::string>("Invalid JSON data; unexpected token: '"sv, inout_read_view, "' when parsing null"sv) };
			}
			else {
				throw std::invalid_argument{ jessilib::join<std::string>("Invalid JSON data; unexpected token: '"sv, inout_read_view, "' when parsing boolean"sv) };
			}
		}

		return std::numeric_limits<size_t>::max();
	} };
}
// Builds a syntax tree entry for InCodepointV whose action does nothing and returns 0
// (used by json_object_tree for whitespace characters)
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_noop_pair() {
	constexpr auto ignore_character = [](ContextT&, std::basic_string_view<CharT>&) constexpr -> size_t {
		return 0;
	};

	return { InCodepointV, ignore_character };
}
/**
 * Syntax tree action run after an opening quote has been consumed; reads the rest of a JSON string
 *
 * On success the string contents (with escape sequences applied) are assigned to inout_context.out_object
 * as UTF-8, and the read view is advanced past the closing quote.
 *
 * @param inout_context Context receiving the parsed string; ContextT::use_exceptions selects the error style
 * @param inout_read_view View positioned just after the opening quote
 * @return 1 on success; std::numeric_limits<size_t>::max() on failure when exceptions are disabled
 * @throws std::invalid_argument on failure when ContextT::use_exceptions is set
 */
template<typename CharT, typename ContextT>
size_t string_start_action(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
	constexpr size_t failure = std::numeric_limits<size_t>::max();

	// Safety check
	if (inout_read_view.empty()) {
		if constexpr (ContextT::use_exceptions) {
			throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
		}

		return failure;
	}

	// Check if this is just an empty string
	if (inout_read_view.front() == '\"') {
		inout_read_view.remove_prefix(1);
		inout_context.out_object = std::u8string{};
		return 1;
	}

	// Not an empty string; search for the ending quote
	size_t search_start = 1;
	size_t end_pos;
	while ((end_pos = inout_read_view.find('\"', search_start)) != std::basic_string_view<CharT>::npos) {
		// Quote found; it is escaped only if preceded by an ODD number of backslashes. An even run (e.g. the
		// two backslashes in "\\") is made of escaped backslashes and leaves the quote itself unescaped.
		size_t backslash_count = 0;
		while (backslash_count < end_pos && inout_read_view[end_pos - 1 - backslash_count] == '\\') {
			++backslash_count;
		}

		if (backslash_count % 2 == 0) {
			// Unescaped quote; must be end of string
			break;
		}

		search_start = end_pos + 1;
	}

	// Early out if we didn't find the terminating quote
	if (end_pos == std::basic_string_view<CharT>::npos) {
		if constexpr (ContextT::use_exceptions) {
			throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" };
		}

		return failure;
	}

	// jessilib::object only current accepts UTF-8 text; copy the necessary data instead of sequencing in-place
	// additionally, even when it does accept other encodings, it'll be storing them as UTF-8 as well, though
	// sequencing in-place and recoding the result would still likely be slightly quicker than recoding the input
	std::u8string string_data = jessilib::string_cast<char8_t>(inout_read_view.substr(0, end_pos));

	// Advance the read view to after the terminating quote. This must be measured in CharT units (end_pos),
	// NOT in units of the recoded UTF-8 copy, whose length differs for non-UTF-8 input encodings.
	inout_read_view.remove_prefix(end_pos + 1);

	if (!jessilib::apply_cpp_escape_sequences(string_data)) {
		if constexpr (ContextT::use_exceptions) {
			using namespace std::literals;
			throw std::invalid_argument {
				jessilib::join_mbstring("Invalid JSON data; invalid token or end of string: "sv, std::u8string_view{ string_data })
			};
		}

		return failure;
	}

	inout_context.out_object = std::move(string_data);
	return 1;
}
// Builds the syntax tree entry mapping InCodepointV (the opening quote) to string_start_action
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_string_start_pair() {
	// no constexpr in this context because gcc
	return syntax_tree_member<CharT, ContextT>{ InCodepointV, &string_start_action<CharT, ContextT> };
}
// Forward declaration of the JSON entry point; array_start_action and make_map_start_action call it
// recursively to parse nested values. DefaultActionF defaults to fail_action instantiated with the same
// UseExceptionsV setting. Returns bool; an empty read view yields false without modifying out_object.
template<typename CharT, bool UseExceptionsV = true,
default_syntax_tree_action<CharT, json_context<CharT, UseExceptionsV>> DefaultActionF = fail_action<CharT, json_context<CharT, UseExceptionsV>, UseExceptionsV>>
bool deserialize_json(object& out_object, std::basic_string_view<CharT>& inout_read_view);
// Syntax tree action for the start of a JSON array; inout_read_view is expected to be positioned just after
// the opening '['. Elements are parsed by calling deserialize_json recursively and collected into a
// std::vector<object>, which is moved into inout_context.out_object once the closing ']' is consumed.
// Returns 1 on success. On failure: throws std::invalid_argument when ContextT::use_exceptions is set,
// otherwise returns std::numeric_limits<size_t>::max().
template<typename CharT, typename ContextT>
size_t array_start_action(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
std::vector<object> result;
advance_whitespace(inout_read_view);
if (inout_read_view.empty()) {
if constexpr (ContextT::use_exceptions) {
throw std::invalid_argument{ "Invalid JSON data: unexpected end of data when parsing object array; expected ']'" };
}
return std::numeric_limits<size_t>::max();
}
// Checking here instead of top of loop means no trailing comma support.
if (inout_read_view.front() == ']') {
// End of array; success
inout_read_view.remove_prefix(1);
inout_context.out_object = std::move(result);
return 1;
}
do {
// Read object
object obj;
if (!deserialize_json<CharT, ContextT::use_exceptions>(obj, inout_read_view)) {
// Invalid JSON! Any exception would've been thrown already
break;
}
result.push_back(std::move(obj));
advance_whitespace(inout_read_view);
if (inout_read_view.empty()) {
// Unexpected end of data; missing ']'; fail
break;
}
CharT front = inout_read_view.front();
if (front == ',') {
// Strip comma
inout_read_view.remove_prefix(1);
advance_whitespace(inout_read_view);
// Right now there's no trailing comma support; should behavior be a template option?
}
else if (front == ']') {
// End of array; success
inout_read_view.remove_prefix(1);
inout_context.out_object = std::move(result);
return 1;
}
else {
// Invalid JSON!
if constexpr (ContextT::use_exceptions) {
using namespace std::literals;
throw std::invalid_argument{ jessilib::join_mbstring("Invalid JSON data: expected ',' or ']', instead encountered: "sv, inout_read_view) };
}
return std::numeric_limits<size_t>::max();
}
} while (!inout_read_view.empty());
// Reached via either `break` above, or when the data ends right after a comma
// Invalid JSON encountered
if constexpr (ContextT::use_exceptions) {
throw std::invalid_argument{ "Invalid JSON data: unexpected end of data when parsing object array; expected ']'" };
}
return std::numeric_limits<size_t>::max();
}
// Builds the syntax tree entry mapping InCodepointV (the opening bracket) to array_start_action
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_array_start_pair() {
	return syntax_tree_member<CharT, ContextT>{ InCodepointV, &array_start_action<CharT, ContextT> };
}
// Context used when reading a map key through string_start_action; receives the key as UTF-8 text
template<bool UseExceptionsV>
struct KeyContext {
	// Mirrors json_context::use_exceptions so string_start_action reports errors the same way for keys
	static constexpr bool use_exceptions{ UseExceptionsV };

	// The most recently parsed key
	std::u8string out_object;
};
/**
 * Syntax tree action for the start of a JSON map; the read view is expected to be positioned just after '{'
 *
 * Keys are read with string_start_action and values by calling deserialize_json recursively. Once the closing
 * '}' is consumed, the collected map is assigned to inout_context.out_object.
 *
 * @param inout_context Context receiving the parsed map; ContextT::use_exceptions selects the error style
 * @param inout_read_view View positioned just after the opening brace; advanced past '}' on success
 * @return 1 on success; std::numeric_limits<size_t>::max() on failure when exceptions are disabled
 * @throws std::invalid_argument on failure, only when ContextT::use_exceptions is set
 */
template<typename CharT, typename ContextT>
size_t make_map_start_action(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
	using namespace std::literals;
	constexpr size_t failure = std::numeric_limits<size_t>::max();

	object::map_type result;
	advance_whitespace(inout_read_view);
	KeyContext<ContextT::use_exceptions> key_context;
	while (!inout_read_view.empty()) {
		// inout_read_view now points to either the start of a key, the end of the object, or invalid data
		CharT front = inout_read_view.front();
		if (front == '}') {
			// End of object
			inout_read_view.remove_prefix(1);
			inout_context.out_object = object{ std::move(result) }; // TODO: fix move semantics here
			return 1;
		}

		// Assert that we've reached the start of a key
		if (front != '\"') {
			if constexpr (ContextT::use_exceptions) {
				throw std::invalid_argument{ jessilib::join_mbstring("Invalid JSON data; unexpected token: '"sv,
					decode_codepoint(inout_read_view).codepoint,
					"' when parsing object map (expected '\"' instead)"sv) };
			}

			return failure;
		}

		// Read in key
		inout_read_view.remove_prefix(1); // front quote
		// TODO: really should be using the escape sequencing method instead of this
		if (string_start_action<CharT, decltype(key_context)>(key_context, inout_read_view) != 1) {
			// Failed to find end of string; any exception would've been thrown in string_start_action
			return failure;
		}

		advance_whitespace(inout_read_view);

		// Insert our value object
		auto& value = result[key_context.out_object];

		// Verify next character is ':'
		if (inout_read_view.empty()) {
			// Only throw when the context asks for exceptions, like every other failure path here
			if constexpr (ContextT::use_exceptions) {
				throw std::invalid_argument{
					"Invalid JSON data; unexpected end of data after parsing map key; expected ':' followed by value" };
			}

			return failure;
		}

		front = inout_read_view.front();
		if (front != ':') {
			if constexpr (ContextT::use_exceptions) {
				throw std::invalid_argument{ jessilib::join_mbstring("Invalid JSON data; unexpected token: '"sv,
					decode_codepoint(inout_read_view).codepoint,
					"' when parsing map key (expected ':' instead)"sv) };
			}

			return failure;
		}

		inout_read_view.remove_prefix(1); // strip ':'

		// We've reached an object value; parse it
		if (!deserialize_json<CharT, ContextT::use_exceptions>(value, inout_read_view)) {
			// Invalid JSON! Any exception would've been thrown already
			break;
		}

		// Advance through whitespace to ',' or '}'
		advance_whitespace(inout_read_view);
		if (inout_read_view.empty()) {
			if constexpr (ContextT::use_exceptions) {
				throw std::invalid_argument{
					"Invalid JSON data; unexpected end of data after parsing map value; expected '}'" };
			}

			return failure;
		}

		// NOTE(review): a ',' is consumed when present but not required, and a trailing comma before '}' is
		// accepted; array_start_action is stricter. Confirm whether this leniency is intended.
		if (inout_read_view.front() == ',') {
			// Strip comma and trailing whitespace
			inout_read_view.remove_prefix(1);
			advance_whitespace(inout_read_view);
		}
	}

	// Ran out of data (or a nested value failed to parse) before reaching the closing '}'
	if constexpr (ContextT::use_exceptions) {
		throw std::invalid_argument{ "Invalid JSON data: unexpected end of data when parsing object map; expected '}'" };
	}

	return failure;
}
// Builds the syntax tree entry mapping InCodepointV (the opening brace) to make_map_start_action
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_map_start_pair() {
	return syntax_tree_member<CharT, ContextT>{ InCodepointV, &make_map_start_action<CharT, ContextT> };
}
/**
 * Builds the syntax tree entry for a character that starts a JSON number ('-' or a digit)
 *
 * The action re-reads the number starting one unit BEFORE the read view, i.e. it includes the character that
 * selected this entry. It stores an integer when no '.' follows the integer portion; otherwise it stores a
 * long double built by parse_decimal_part on top of the integer portion.
 *
 * @tparam InCodepointV Leading character of the number; the tree key
 * @return Tree entry whose action returns 1 on success; on failure it throws std::invalid_argument when
 *         ContextT::use_exceptions is set, or returns std::numeric_limits<size_t>::max()
 */
template<typename CharT, typename ContextT, char32_t InCodepointV>
constexpr syntax_tree_member<CharT, ContextT> make_number_pair() {
	return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) constexpr -> size_t {
		// parse integer
		// assumes the leading character sits directly before the read view — TODO confirm against apply_syntax_tree
		const CharT* number_begin = inout_read_view.data() - 1;
		intmax_t integer_value{};
		const CharT* from_chars_end = from_chars(number_begin, inout_read_view.data() + inout_read_view.size(), integer_value).ptr;
		if constexpr (InCodepointV == '-') {
			// A lone '-' parses no digits. Parsing began at number_begin, so a failed parse may leave
			// from_chars_end at number_begin (before the read view) rather than at its start; treat anything at
			// or before the start of the read view as failure so the remove_prefix below cannot wrap around.
			if (from_chars_end <= inout_read_view.data()) {
				// Failed to parse integer portion
				if constexpr (ContextT::use_exceptions) {
					using namespace std::literals;
					throw std::invalid_argument{
						jessilib::join_mbstring(u8"Invalid JSON data; unexpected token: '"sv, decode_codepoint(inout_read_view).codepoint, u8"' when parsing number"sv) };
				}

				return std::numeric_limits<size_t>::max();
			}
		}

		// Strip integer portion and return if nothing remains
		inout_read_view.remove_prefix(from_chars_end - inout_read_view.data());
		if (inout_read_view.empty() || inout_read_view.front() != '.') {
			inout_context.out_object = integer_value;
			return 1;
		}

		// Parse decimal portion
		// NOTE(review): the sign lives only in integer_value; verify parse_decimal_part handles negative values
		// (and "-0.x", whose integer portion is 0) as intended
		inout_read_view.remove_prefix(1); // strip leading '.'
		long double decimal_value = static_cast<long double>(integer_value);
		from_chars_end = parse_decimal_part(inout_read_view.data(), inout_read_view.data() + inout_read_view.size(), decimal_value);
		// TODO: parse exponent

		// Strip decimal portion and return
		inout_read_view.remove_prefix(from_chars_end - inout_read_view.data());
		inout_context.out_object = decimal_value;
		return 1;
	} };
}
// Keyword remainders: each JSON keyword minus its first character, which is the syntax tree key.
// These live in a header and are bound as reference template arguments below, so they are `inline` variables:
// every translation unit then refers to the same object instead of its own internal-linkage copy.
inline constexpr std::u8string_view json_false_remainder{ u8"alse" };
inline constexpr std::u8string_view json_null_remainder{ u8"ull" };
inline constexpr std::u8string_view json_true_remainder{ u8"rue" };

// Syntax tree dispatching on the first character of a JSON value.
// Entries MUST stay sorted by codepoint; deserialize_json static_asserts this.
template<typename CharT, bool UseExceptionsV>
inline constexpr syntax_tree<CharT, json_context<CharT, UseExceptionsV>> json_object_tree{
	// Whitespace
	make_noop_pair<CharT, json_context<CharT, UseExceptionsV>, U'\t'>(),
	make_noop_pair<CharT, json_context<CharT, UseExceptionsV>, U'\n'>(),
	make_noop_pair<CharT, json_context<CharT, UseExceptionsV>, U'\r'>(),
	make_noop_pair<CharT, json_context<CharT, UseExceptionsV>, U' '>(),

	// Strings
	make_string_start_pair<CharT, json_context<CharT, UseExceptionsV>, U'\"'>(),

	// Numbers
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'-'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'0'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'1'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'2'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'3'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'4'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'5'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'6'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'7'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'8'>(),
	make_number_pair<CharT, json_context<CharT, UseExceptionsV>, U'9'>(),

	// Arrays, keywords, and maps
	make_array_start_pair<CharT, json_context<CharT, UseExceptionsV>, U'['>(),
	make_keyword_value_pair<CharT, json_context<CharT, UseExceptionsV>, U'f', json_false_remainder, bool, false>(),
	make_keyword_value_pair<CharT, json_context<CharT, UseExceptionsV>, U'n', json_null_remainder, std::nullptr_t, nullptr>(),
	make_keyword_value_pair<CharT, json_context<CharT, UseExceptionsV>, U't', json_true_remainder, bool, true>(),
	make_map_start_pair<CharT, json_context<CharT, UseExceptionsV>, U'{'>()
};
// Parses a single JSON value from inout_read_view into out_object by applying json_object_tree.
// Returns false without touching out_object when the view is empty; otherwise returns the result of
// apply_syntax_tree.
template<typename CharT, bool UseExceptionsV,
	default_syntax_tree_action<CharT, json_context<CharT, UseExceptionsV>> DefaultActionF>
bool deserialize_json(object& out_object, std::basic_string_view<CharT>& inout_read_view) {
	using context_type = json_context<CharT, UseExceptionsV>;
	constexpr size_t tree_size = std::size(json_object_tree<CharT, UseExceptionsV>);
	static_assert(is_sorted<CharT, context_type, json_object_tree<CharT, UseExceptionsV>, tree_size>(), "Tree must be pre-sorted");

	if (inout_read_view.empty()) {
		// Empty json; false to indicate out_object not modified with any valid data, but no need to throw
		return false;
	}

	context_type context{ out_object };
	return apply_syntax_tree<CharT, context_type, json_object_tree<CharT, UseExceptionsV>, tree_size, DefaultActionF>(context, inout_read_view);
}
} // namespace jessilib } // namespace jessilib

23
src/include/jessilib/unicode.hpp

@ -459,6 +459,22 @@ constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args
join_append(out_string, std::forward<ArgsT>(in_args)...); join_append(out_string, std::forward<ArgsT>(in_args)...);
} }
// Recursion terminator: nothing left to append
constexpr void join_mb_append(std::string&) {} // noop

/**
 * Appends in_string, converted with ustring_to_mbstring, to out_string, then recurses on the remaining arguments
 *
 * A single char32_t argument is treated as a one-codepoint UTF-32 string.
 *
 * @param out_string String receiving the converted text
 * @param in_string String (or single char32_t codepoint) to convert and append
 * @param in_args Remaining items to append, in order
 */
template<typename InT, typename... ArgsT/*,
	typename std::enable_if_t<!std::is_same_v<typename InT::value_type, char>>* = nullptr*/> // no char, ambiguous meaning
void join_mb_append(std::string& out_string, InT&& in_string, ArgsT&&... in_args) {
	// TODO: is this a valid approach? is mbstate fine to discard between appends?
	// InT is a forwarding-reference parameter, so it may be deduced as e.g. `char32_t&` or `const char32_t&`;
	// strip references/cv-qualifiers before testing for a lone codepoint
	if constexpr (std::is_same_v<char32_t, std::decay_t<InT>>) {
		out_string += ustring_to_mbstring(std::u32string_view{ &in_string, 1 }).second;
	}
	else {
		out_string += ustring_to_mbstring(in_string).second;
	}

	join_mb_append(out_string, std::forward<ArgsT>(in_args)...);
}
} // impl_join } // impl_join
// Join any number of strings of any type // Join any number of strings of any type
@ -470,6 +486,13 @@ OutT join(ArgsT&&... args) {
return result; return result;
} }
/**
 * Joins any number of arguments into a single std::string by appending each one, in order, through
 * impl_join::join_mb_append.
 *
 * @param args Values to join; may be empty, in which case an empty string is returned
 * @return The joined string
 */
template<typename... ArgsT>
std::string join_mbstring(ArgsT&&... args) {
	std::string result;
	// No explicit template argument list here: spelling out <ArgsT...> restricts overload resolution to the function
	// template, which cannot be selected for an empty pack; plain deduction yields the same types and also finds the
	// non-template noop overload
	impl_join::join_mb_append(result, std::forward<ArgsT>(args)...);
	return result;
}
/** to_lower / to_upper */ /** to_lower / to_upper */
//char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_lower(char32_t in_chr); // TODO: implement
//char32_t to_upper(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement

89
src/include/jessilib/unicode_syntax.hpp

@ -34,10 +34,10 @@ namespace jessilib {
*/ */
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT>
using syntax_tree_action = bool(*)(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view); using syntax_tree_action = size_t(*)(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view);
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT>
using default_syntax_tree_action = bool(*)(decode_result in_codepoint, ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view); using default_syntax_tree_action = size_t(*)(decode_result in_codepoint, ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view);
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT>
using syntax_tree = const std::pair<char32_t, syntax_tree_action<CharT, ContextT>>[]; using syntax_tree = const std::pair<char32_t, syntax_tree_action<CharT, ContextT>>[];
@ -72,35 +72,60 @@ constexpr bool is_sorted() {
return true; return true;
} }
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT, bool UseExceptionsV = false>
bool fail_action(decode_result, ContextT&, std::basic_string_view<CharT>&) { size_t fail_action(decode_result, ContextT&, std::basic_string_view<CharT>& in_read_view) {
return false; using namespace std::literals;
if constexpr (UseExceptionsV) {
std::string exception = "Invalid parse data; unexpected token: '"s;
jessilib::encode_codepoint(exception, in_read_view.front());
exception += "' when parsing data";
throw std::invalid_argument{ exception };
}
return std::numeric_limits<size_t>::max();
} }
template<typename CharT, typename ContextT> template<typename CharT, typename ContextT>
bool noop_action(decode_result decode, ContextT&, std::basic_string_view<CharT>& inout_read_view) { size_t noop_action(decode_result decode, ContextT&, std::basic_string_view<CharT>& inout_read_view) {
inout_read_view.remove_prefix(decode.units); inout_read_view.remove_prefix(decode.units);
return true; return 0;
} }
template<typename CharT, typename ContextT, char32_t InCodepointV, const syntax_tree<CharT, ContextT> SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action<CharT, ContextT> DefaultActionF = fail_action<CharT, ContextT>> template<typename CharT, typename ContextT, const syntax_tree<CharT, ContextT> SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action<CharT, ContextT> DefaultActionF>
constexpr syntax_tree_member<CharT, ContextT> make_tree_pair() { constexpr size_t tree_action(ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) {
return { InCodepointV, [](ContextT& inout_context, std::basic_string_view<CharT>& inout_read_view) constexpr { decode_result decode;
auto decode = decode_codepoint(inout_read_view); size_t break_stack_depth;
if (decode.units == 0) { constexpr syntax_tree_member<CharT, ContextT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
return false; while ((decode = decode_codepoint(inout_read_view)).units != 0) {
} auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare<CharT, ContextT>);
constexpr syntax_tree_member<CharT, ContextT>* SubTreeEnd = SubTreeBegin + SubTreeSize;
auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare<CharT>);
if (parser == SubTreeEnd || parser->first != decode.codepoint) { if (parser == SubTreeEnd || parser->first != decode.codepoint) {
return DefaultActionF(decode, inout_context, inout_read_view); break_stack_depth = DefaultActionF(decode, inout_context, inout_read_view);
if (break_stack_depth == 0) {
// Don't jump the stack; continue
continue;
}
return break_stack_depth - 1;
} }
// This is a parsed sequence; pass it to the parser // This is a parsed sequence; pass it to the parser
inout_read_view.remove_prefix(decode.units); inout_read_view.remove_prefix(decode.units);
return (parser->second)(inout_context, inout_read_view); break_stack_depth = (parser->second)(inout_context, inout_read_view);
} }; if (break_stack_depth != 0) {
return break_stack_depth - 1;
}
}
// decode.units == 0; success if view empty, failure otherwise
if (inout_read_view.empty()) {
return 0;
}
return std::numeric_limits<size_t>::max();
}
template<typename CharT, typename ContextT, char32_t InCodepointV, const syntax_tree<CharT, ContextT> SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action<CharT, ContextT> DefaultActionF = fail_action<CharT, ContextT>>
constexpr syntax_tree_member<CharT, ContextT> make_tree_pair() {
return { InCodepointV, tree_action<CharT, ContextT, SubTreeBegin, SubTreeSize, DefaultActionF> };
} }
template<typename CharT, typename ContextT, const syntax_tree<CharT, ContextT> SequenceTreeBegin, size_t SequenceTreeSize, template<typename CharT, typename ContextT, const syntax_tree<CharT, ContextT> SequenceTreeBegin, size_t SequenceTreeSize,
@ -111,29 +136,7 @@ constexpr bool apply_syntax_tree(ContextT& inout_context, std::basic_string_view
return true; return true;
} }
decode_result decode; return tree_action<CharT, ContextT, SequenceTreeBegin, SequenceTreeSize, DefaultActionF>(inout_context, inout_read_view) == 0;
constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize;
while ((decode = decode_codepoint(inout_read_view)).units != 0) {
auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare<CharT, ContextT>);
if (parser == SubTreeEnd || parser->first != decode.codepoint) {
// Just a normal character; pass it to the default handler
if (!DefaultActionF(decode, inout_context, inout_read_view)) {
return false;
}
continue;
}
// This is a parsed sequence; pass it to the parser instead
inout_read_view.remove_prefix(decode.units);
if (!(parser->second)(inout_context, inout_read_view)) {
// Bad input received; give up
return false;
}
}
// We've finished parsing successfully
return true;
} }
} // namespace jessilib } // namespace jessilib

67
src/include/jessilib/util.hpp

@ -28,8 +28,8 @@
namespace jessilib { namespace jessilib {
template<typename T> template<typename CharT, typename NumberT>
const char* parse_decimal_part(const char* in_str, const char* in_str_end, T& out_value) { const CharT* parse_decimal_part(const CharT* in_str, const CharT* in_str_end, NumberT& out_value) {
int denominator = 10; int denominator = 10;
while (in_str != in_str_end) { while (in_str != in_str_end) {
switch (*in_str) { switch (*in_str) {
@ -44,10 +44,10 @@ const char* parse_decimal_part(const char* in_str, const char* in_str_end, T& ou
case '8': case '8':
case '9': case '9':
if (out_value >= 0.0) { if (out_value >= 0.0) {
out_value += (static_cast<T>(*in_str - '0') / denominator); out_value += (static_cast<NumberT>(*in_str - '0') / denominator);
} }
else { else {
out_value -= (static_cast<T>(*in_str - '0') / denominator); out_value -= (static_cast<NumberT>(*in_str - '0') / denominator);
} }
denominator *= 10; denominator *= 10;
break; break;
@ -62,29 +62,66 @@ const char* parse_decimal_part(const char* in_str, const char* in_str_end, T& ou
return in_str; return in_str;
} }
template<typename T> template<typename CharT>
std::from_chars_result from_chars(const char* in_str, const char* in_str_end, T& out_value) { struct from_chars_result {
const CharT* ptr;
std::errc ec;
};
template<typename CharT, typename NumberT,
std::enable_if_t<sizeof(CharT) == 1>* = nullptr>
from_chars_result<CharT> from_chars(const CharT* in_str, const CharT* in_str_end, NumberT& out_value) {
std::from_chars_result std_result{};
// TODO: use std::from_chars when available for floating point types // TODO: use std::from_chars when available for floating point types
if constexpr (std::is_floating_point<T>::value) { if constexpr (std::is_floating_point<NumberT>::value) {
// Read integer portion // Read integer portion
long long integer_value{}; long long integer_value{};
std::from_chars_result result{ std::from_chars(in_str, in_str_end, integer_value) }; std_result = std::from_chars(reinterpret_cast<const char*>(in_str), reinterpret_cast<const char*>(in_str_end), integer_value);
out_value = integer_value; out_value = integer_value;
// Read decimal portion (if one exists) // Read decimal portion (if one exists)
if (result.ptr != in_str_end && *result.ptr == '.') { if (std_result.ptr != in_str_end && *std_result.ptr == '.') {
++result.ptr; ++std_result.ptr;
result.ptr = parse_decimal_part(result.ptr, in_str_end, out_value); std_result.ptr = parse_decimal_part(std_result.ptr, in_str_end, out_value);
result.ec = std::errc{}; std_result.ec = std::errc{};
} }
// TODO: Read exponents // TODO: Read exponents
return result;
} }
else { else {
return std::from_chars(in_str, in_str_end, out_value); std_result = std::from_chars(reinterpret_cast<const char*>(in_str), reinterpret_cast<const char*>(in_str_end), out_value);
}
return { reinterpret_cast<const CharT*>(std_result.ptr), std_result.ec };
}
/**
 * Parses a number from a range of non-single-byte code units by narrowing each unit into a temporary char buffer and
 * delegating to the from_chars overload for single-byte character types.
 *
 * All characters passed in MUST be in: [U'0', U'9'], '.'
 *
 * @param in_str Start of the text to parse
 * @param in_str_end End of the text to parse; input longer than the temporary buffer is truncated to the buffer size
 * @param out_value Receives the parsed value
 * @return Pointer to the first code unit of the input that was not consumed, and the delegated parse's error code
 */
template<typename CharT, typename NumberT,
	std::enable_if_t<sizeof(CharT) != 1>* = nullptr>
from_chars_result<CharT> from_chars(CharT* in_str, CharT* in_str_end, NumberT& out_value) {
	// Copy our string into a temporary buffer, then use from_chars on that
	char buffer[256]; // TODO: get some sort of metrics on this to figure out if this is acceptable temporary approach
	size_t distance = static_cast<size_t>(in_str_end - in_str);
	if (distance > sizeof(buffer)) {
		// Way too much data; just slice it off, maybe add a debug assertion. This method's supposed to be temporary.
		distance = sizeof(buffer);
		in_str_end = in_str + distance;
	}

	// Copy the view into the buffer. A separate cursor is walked so that in_str keeps pointing at the start of the
	// input; the result pointer below is computed relative to it
	char* itr = buffer;
	for (CharT* source = in_str; source != in_str_end; ++source, ++itr) {
		*itr = static_cast<char>(*source);
	}

	// leverage from_chars
	char* buffer_end = buffer + distance;
	auto char_result = from_chars(buffer, buffer_end, out_value);

	// Map the position reached in the narrow buffer back onto the original input
	distance = static_cast<size_t>(char_result.ptr - buffer);
	return { in_str + distance, char_result.ec };
}
template<typename T> template<typename T>

18
src/test/parsers/json.cpp

@ -122,6 +122,22 @@ TEST(JsonParser, deserialize_string) {
json_parser parser; json_parser parser;
EXPECT_EQ(parser.deserialize(R"json("text")json"sv), u8"text"); EXPECT_EQ(parser.deserialize(R"json("text")json"sv), u8"text");
// Exercise deserialize_json directly against each Unicode encoding; every view must be fully consumed
object obj;
std::u8string_view u8text = u8R"json("text")json"sv;
EXPECT_TRUE(deserialize_json(obj, u8text));
EXPECT_EQ(obj, u8"text"sv);
EXPECT_TRUE(u8text.empty());

std::u16string_view u16text = uR"json("text")json"sv;
EXPECT_TRUE(deserialize_json(obj, u16text));
EXPECT_EQ(obj, u8"text"sv);
EXPECT_TRUE(u16text.empty()); // check the view that was just parsed, not the already-consumed UTF-8 one

std::u32string_view u32text = UR"json("text")json"sv;
EXPECT_TRUE(deserialize_json(obj, u32text));
EXPECT_EQ(obj, u8"text"sv);
EXPECT_TRUE(u32text.empty()); // check the view that was just parsed, not the already-consumed UTF-8 one
} }
TEST(JsonParser, deserialize_array) { TEST(JsonParser, deserialize_array) {
@ -157,7 +173,7 @@ TEST(JsonParser, deserialize_array_nested) {
1,2,3, 1,2,3,
null, null,
"text", "text",
[5,6,7], [5,6,7]
] , [ ] , [ " text " ], ] , [ ] , [ " text " ],
12.34, 12.34,
0.1234, 0.1234,

Loading…
Cancel
Save