diff --git a/src/common/parser/parser_manager.cpp b/src/common/parser/parser_manager.cpp index e8906ee..6286c66 100644 --- a/src/common/parser/parser_manager.cpp +++ b/src/common/parser/parser_manager.cpp @@ -17,6 +17,7 @@ */ #include "impl/parser_manager.hpp" +#include "parsers/json.hpp" // only for default-registration #include "parser.hpp" #include "assert.hpp" @@ -49,6 +50,11 @@ bool parser_manager::registration::operator<(const registration& rhs) const { return m_format < rhs.m_format; } +parser_manager::parser_manager() { + // Add library-provided default parsers; intentionally delayed until construction rather than self-registration for zero-cost static initialization when unused + register_parser(std::make_shared(), "json", false); +} + parser_manager::id parser_manager::register_parser(std::shared_ptr in_parser, const std::string& in_format, bool in_force) { std::lock_guard guard{ m_mutex }; diff --git a/src/common/parsers/json.cpp b/src/common/parsers/json.cpp index 57ee1fb..d27503d 100644 --- a/src/common/parsers/json.cpp +++ b/src/common/parsers/json.cpp @@ -18,9 +18,6 @@ #include "parsers/json.hpp" #include -#include "unicode.hpp" -#include "unicode_sequence.hpp" -#include "util.hpp" using namespace std::literals; @@ -68,277 +65,12 @@ std::string make_json_string(std::u8string_view in_string) { return result; } -void advance_whitespace(std::string_view& in_data) { - while (!in_data.empty()) { - switch (in_data.front()) { - case ' ': - case '\t': - case '\r': - case '\n': - in_data.remove_prefix(1); - break; - default: - return; - } - } -} - -uint16_t get_codepoint_from_hex(const std::string_view& in_data) { - uint16_t value{}; - auto data = in_data.data(); - auto end = in_data.data() + 4; - data = std::from_chars(data, end, value, 16).ptr; - if (data != end) { - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + *data + "' when parsing unicode escape sequence" }; - }; - - return value; -} - -std::u8string read_json_string(std::string_view& in_data) { - std::u8string result; - - // Remove leading quotation - in_data.remove_prefix(1); - - if (in_data.empty()) { - throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" }; - } - - if (in_data.front() == '\"') { - in_data.remove_prefix(1); - advance_whitespace(in_data); // strip trailing spaces - return result; - } - - size_t search_start = 1; - size_t end_pos; - while ((end_pos = in_data.find('\"', search_start)) != std::string_view::npos) { - // Quote found; check if it's escaped - if (in_data[end_pos - 1] != '\\') { - // Unescaped quote; must be end of string - break; - } - - search_start = end_pos + 1; - } - - if (end_pos == std::string_view::npos) { - throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" }; - } - - std::u8string_view string_data = jessilib::string_view_cast(in_data.substr(0, end_pos)); - in_data.remove_prefix(string_data.size() + 1); - advance_whitespace(in_data); // strip trailing spaces - result = string_data; - if (!jessilib::apply_cpp_escape_sequences(result)) { - throw std::invalid_argument{ jessilib::join("Invalid JSON data; invalid token or end of string: "sv, string_data) }; - } - - return result; -} - -object read_json_number(std::string_view& in_data) { - // parse integer - intmax_t integer_value{}; - const char* from_chars_end = std::from_chars(in_data.data(), in_data.data() + in_data.size(), integer_value).ptr; - if (in_data.data() == from_chars_end) { - // Failed to parse integer portion - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing number" }; - } - - // Strip integer portion and return if nothing remains - in_data.remove_prefix(from_chars_end - in_data.data()); - if (in_data.empty() || in_data.front() != '.') { - return integer_value; - } - - // Parse decimal portion - - /* - // std::from_chars method - long double decimal_value{}; - from_chars_end = std::from_chars(data, data_end, decimal_value).ptr; - return static_cast(integer_value) + decimal_value; - */ - - // parse_decimal_part method - in_data.remove_prefix(1); // strip leading '.' - long double decimal_value = static_cast(integer_value); - from_chars_end = parse_decimal_part(in_data.data(), in_data.data() + in_data.size(), decimal_value); - // TODO: parse exponent - - // Strip decimal portion and return - in_data.remove_prefix(from_chars_end - in_data.data()); - return decimal_value; -} - -object read_json_object(std::string_view& in_data) { - while (!in_data.empty()) { - switch (in_data.front()) { - /** Start of null */ - case 'n': - if (in_data.substr(0, 4) != "null"sv) { - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + std::string{ in_data } + "' when parsing null" }; - } - - in_data.remove_prefix(4); - return {}; - - /** Start of boolean (true) */ - case 't': - if (in_data.substr(0, 4) != "true"sv) { - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + std::string{ in_data } + "' when parsing boolean" }; - } - - in_data.remove_prefix(4); - return true; - - /** Start of boolean (false) */ - case 'f': - if (in_data.substr(0, 5) != "false"sv) { - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + std::string{ in_data } + "' when parsing boolean" }; - } - - in_data.remove_prefix(5); - return false; - - /** Whitespace */ - case ' ': - case '\t': - case '\r': - case '\n': - in_data.remove_prefix(1); - break; - - /** Start of string */ - case '\"': - return read_json_string(in_data); - - /** Start of number */ - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - return read_json_number(in_data); - } - - /** Start of array */ - case '[': { - // Strip brace and leading whitespace - in_data.remove_prefix(1); - advance_whitespace(in_data); - - // Build and populate result - std::vector result; - while (true) { - if (in_data.empty()) { - throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing object array" }; - } - - if (in_data.front() == ']') { - // End of array - in_data.remove_prefix(1); - return result; - } - - // We've reached the start of an object; parse it into our array - result.push_back(read_json_object(in_data)); - - // Strip leading whitespace - advance_whitespace(in_data); - - if (in_data.empty()) { - throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing object array" }; - } - - if (in_data.front() == ',') { - // Strip comma and trailing whitespace - in_data.remove_prefix(1); - advance_whitespace(in_data); - } - } - } - - /** Start of map */ - case '{': { - // Strip brace and leading whitespace - in_data.remove_prefix(1); - advance_whitespace(in_data); - - // Build and populate result - object result{ object::map_type{} }; - while (true) { - if (in_data.empty()) { - throw std::invalid_argument{ "Invalid JSON data; unexpected end of data when parsing object map" }; - } - - if (in_data.front() == '}') { - // End of object - in_data.remove_prefix(1); - return result; - } - - // Assert that we've reached the start of a key - if (in_data.front() != '\"') { - throw std::invalid_argument{ - "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing object map" }; - } - - // Read in the key and build a value - auto& value = result[read_json_string(in_data)]; - - // Verify next character is ':' - if (in_data.empty()) { - throw std::invalid_argument{ - "Invalid JSON data; unexpected end of data after parsing map key; expected ':' followed by value" }; - } - if (in_data.front() != ':') { - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() - + "' when parsing map key (expected ':' instead)" }; - } - in_data.remove_prefix(1); // strip ':' - - // We've reached an object value; parse it - value = read_json_object(in_data); - - // Advance through whitespace to ',' or '}' - advance_whitespace(in_data); - - if (in_data.empty()) { - throw std::invalid_argument{ - "Invalid JSON data; unexpected end of data after parsing map value; expected '}'" }; - } - - if (in_data.front() == ',') { - // Strip comma and trailing whitespace - in_data.remove_prefix(1); - advance_whitespace(in_data); - } - } - - // Unreachable; above code will always return on success or throw on failure - } - - default: - throw std::invalid_argument{ "Invalid JSON data; unexpected token: '"s + in_data.front() + "' when parsing JSON" }; - } - } - - // No non-whitespace data passed in; return a null object - return {}; -} object json_parser::deserialize(std::string_view in_data) { - return read_json_object(in_data); + object result; + deserialize_json(result, in_data); + return result; } std::string json_parser::serialize(const object& in_object) { diff --git a/src/include/impl/parser_manager.hpp b/src/include/impl/parser_manager.hpp index 05bef2c..a70e040 100644 --- a/src/include/impl/parser_manager.hpp +++ b/src/include/impl/parser_manager.hpp @@ -67,6 +67,8 @@ private: std::string m_format; }; + parser_manager(); + std::shared_mutex m_mutex; id m_last_id{}; std::set m_registrations; // This set and map could be condensed into a bimap @@ -74,4 +76,4 @@ private: }; // parser_manager } // namespace impl -} // namespace jessilib \ No newline at end of file +} // namespace jessilib diff --git a/src/include/jessilib/http_query.hpp b/src/include/jessilib/http_query.hpp index 7d78ce9..d733fab 100644 --- a/src/include/jessilib/http_query.hpp +++ b/src/include/jessilib/http_query.hpp @@ -50,19 +50,6 @@ constexpr bool deserialize_http_query(std::basic_string& inout_string) { return apply_shrink_sequence_tree, std::size(http_query_escapes_root_tree)>(inout_string); } -// TODO: decide whether to take this approach, where query strings are assumed to represent UTF-8 text data, OR implement -// such that calling deserialize_http_query will assume the relevant encoding (i.e: calling with char16_t would read in -// escaped query values as bytes in codepoint char16_t, rather than utf-8 encoding sequence) -/*template* = nullptr> -bool deserialize_http_query(std::basic_string& inout_string) { - //TODO: optimize this? - std::basic_string u8query_string = string_cast(inout_string); - bool result = deserialize_http_query(u8query_string); - inout_string = string_cast(u8query_string); - return result; -}*/ - /** * HTML form parser */ @@ -79,21 +66,21 @@ struct HTMLFormContext { template constexpr syntax_tree_member make_value_start_pair() { // '=' - return { InCodepointV, [](ContextT& inout_context, std::basic_string_view&) constexpr { + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view&) constexpr -> size_t { if (inout_context.value_start != nullptr) { // There's already a value pending; this must just be part of the value. inout_context.write_head += encode_codepoint(inout_context.write_head, InCodepointV); - return true; + return 0; } // Start pending_value inout_context.value_start = inout_context.write_head; - return true; + return 0; } }; } template -constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view&) { +constexpr size_t value_end_action(ContextT& inout_context, std::basic_string_view&) { const CharT* value_end = inout_context.write_head; const CharT* key_start = inout_context.key_start; const CharT* value_start = inout_context.value_start; @@ -106,7 +93,7 @@ constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view< // Start reading next key inout_context.key_start = value_end; inout_context.value_start = nullptr; - return true; + return 0; } // This is a valueless key; terminate the key and push it @@ -115,7 +102,7 @@ constexpr bool value_end_action(ContextT& inout_context, std::basic_string_view< // Start reading next key inout_context.key_start = value_end; - return true; + return 0; } template @@ -126,8 +113,12 @@ constexpr syntax_tree_member make_value_end_pair() { template constexpr syntax_tree_member make_hex_syntax_shrink_pair() { - return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr { - return hex_shrink_sequence_action(inout_context.write_head, inout_read_view); + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr -> size_t { + if (hex_shrink_sequence_action(inout_context.write_head, inout_read_view)) { + return 0; + } + + return std::numeric_limits::max(); } }; } @@ -135,16 +126,16 @@ template make_simple_shrink_pair() { return { InCodepointV, - [](ContextT& inout_context, std::basic_string_view&) constexpr { + [](ContextT& inout_context, std::basic_string_view&) constexpr -> size_t { *inout_context.write_head = static_cast(OutCodepointV); ++inout_context.write_head; - return true; + return 0; } }; } template -bool html_form_default_action(decode_result decode, ContextT& inout_context, std::basic_string_view& inout_read_view) { +size_t html_form_default_action(decode_result decode, ContextT& inout_context, std::basic_string_view& inout_read_view) { // A regular character; copy it and advance the read/write heads CharT*& write_head = inout_context.write_head; CharT* write_end = write_head + decode.units; @@ -154,7 +145,7 @@ bool html_form_default_action(decode_result decode, ContextT& inout_context, std inout_read_view.remove_prefix(1); } - return true; + return 0; } template +void advance_whitespace(std::basic_string_view& in_data) { + while (!in_data.empty()) { + switch (in_data.front()) { + case ' ': + case '\t': + case '\r': + case '\n': + in_data.remove_prefix(1); + break; + + default: + return; + } + } +} + +template +struct json_context { + object& out_object; + static constexpr bool use_exceptions{ UseExceptionsV }; +}; + +// Doesn't do decoding, because we know our keyword is all basic latin (1 data unit, regardless of encoding) +template +constexpr bool starts_with_fast(std::basic_string_view in_string, std::u8string_view in_substring) { + if (in_string.size() < in_substring.size()) { + return false; + } + + const CharT* itr = in_string.data(); + for (auto character : in_substring) { + if (*itr != character) { + return false; + } + + ++itr; + } + + return true; +} + +template +constexpr syntax_tree_member make_keyword_value_pair() { + // null, true, false + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr -> size_t { + if (starts_with_fast(inout_read_view, KeywordRemainderV)) { + // This is the keyword; go ahead and chuck it in + if constexpr (std::is_pointer_v || std::is_null_pointer_v) { + if constexpr (ValueV == nullptr) { + inout_context.out_object = object{}; + } + } + else { + inout_context.out_object = ValueV; + } + inout_read_view.remove_prefix(KeywordRemainderV.size()); + return 1; + } + + // Unexpected character; throw if appropriate + if constexpr (ContextT::use_exceptions) { + using namespace std::literals; + throw std::invalid_argument{ jessilib::join("Invalid JSON data; unexpected token: '"sv, inout_read_view, "' when parsing null"sv) }; + } + + return std::numeric_limits::max(); + } }; +} + +template +constexpr syntax_tree_member make_noop_pair() { + return { InCodepointV, [](ContextT&, std::basic_string_view&) constexpr -> size_t { + return 0; + } }; +} + +template +size_t string_start_action(ContextT& inout_context, std::basic_string_view& inout_read_view) { + // Safety check + if (inout_read_view.empty()) { + if constexpr (ContextT::use_exceptions) { + throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" }; + } + + return std::numeric_limits::max(); + } + + // Check if this is just an empty string + if (inout_read_view.front() == '\"') { + inout_read_view.remove_prefix(1); + inout_context.out_object = std::u8string{}; + return 1; + } + + // Not an empty string; search for the ending quote + size_t search_start = 1; + size_t end_pos; + while ((end_pos = inout_read_view.find('\"', search_start)) != std::string_view::npos) { + // Quote found; check if it's escaped + if (inout_read_view[end_pos - 1] != '\\') { + // Unescaped quote; must be end of string + break; + } + + search_start = end_pos + 1; + } + + // Early out if we didn't find the terminating quote + if (end_pos == std::string_view::npos) { + if constexpr (ContextT::use_exceptions) { + throw std::invalid_argument{ "Invalid JSON data; missing ending quote (\") when parsing string" }; + } + + return std::numeric_limits::max(); + } + + // jessilib::object only current accepts UTF-8 text; copy the necessary data instead of sequencing in-place + // additionally, even when it does accept other encodings, it'll be storing them as UTF-8 as well, though + // sequencing in-place and recoding the result would still likely be slightly quicker than recoding the input + std::u8string string_data = jessilib::string_cast(inout_read_view.substr(0, end_pos)); + inout_read_view.remove_prefix(string_data.size() + 1); // Advance the read view to after the terminating quote + if (!jessilib::apply_cpp_escape_sequences(string_data)) { + if constexpr (ContextT::use_exceptions) { + using namespace std::literals; + throw std::invalid_argument { + jessilib::join_mbstring("Invalid JSON data; invalid token or end of string: "sv, std::u8string_view{ string_data }) + }; + } + + return std::numeric_limits::max(); + } + + inout_context.out_object = std::move(string_data); + return 1; +} + +template +constexpr syntax_tree_member make_string_start_pair() { + // no constexpr in this context because gcc + return { InCodepointV, string_start_action }; +} + +template> DefaultActionF = fail_action, UseExceptionsV>> +bool deserialize_json(object& out_object, std::basic_string_view& inout_read_view); + +template +size_t array_start_action(ContextT& inout_context, std::basic_string_view& inout_read_view) { + std::vector result; + + advance_whitespace(inout_read_view); + if (inout_read_view.empty()) { + if constexpr (ContextT::use_exceptions) { + throw std::invalid_argument{ "Invalid JSON data: unexpected end of data when parsing object array; expected ']'" }; + } + + return std::numeric_limits::max(); + } + + // Checking here instead of top of loop means no trailing comma support. + if (inout_read_view.front() == ']') { + // End of array; success + inout_read_view.remove_prefix(1); + inout_context.out_object = std::move(result); + return 1; + } + + do { + // Read object + object obj; + if (!deserialize_json(obj, inout_read_view)) { + // Invalid JSON! Any exception would've been thrown already + break; + } + result.push_back(std::move(obj)); + + advance_whitespace(inout_read_view); + if (inout_read_view.empty()) { + // Unexpected end of data; missing ']'; fail + break; + } + + CharT front = inout_read_view.front(); + if (front == ',') { + // Strip comma + inout_read_view.remove_prefix(1); + advance_whitespace(inout_read_view); + + // Right now there's no trailing comma support; should behavior be a template option? + } + else if (front == ']') { + // End of array; success + inout_read_view.remove_prefix(1); + inout_context.out_object = std::move(result); + return 1; + } + else { + // Invalid JSON! + if constexpr (ContextT::use_exceptions) { + using namespace std::literals; + throw std::invalid_argument{ jessilib::join_mbstring("Invalid JSON data: expected ',' or ']', instead encountered: "sv, inout_read_view) }; + } + + return std::numeric_limits::max(); + } + } while (!inout_read_view.empty()); + + // Invalid JSON encountered + if constexpr (ContextT::use_exceptions) { + throw std::invalid_argument{ "Invalid JSON data: unexpected end of data when parsing object array; expected ']'" }; + } + + return std::numeric_limits::max(); +} + +template +constexpr syntax_tree_member make_array_start_pair() { + return { InCodepointV, array_start_action }; +} + +template +struct KeyContext { + std::u8string out_object; + static constexpr bool use_exceptions = UseExceptionsV; +}; + +template +size_t make_map_start_action(ContextT& inout_context, std::basic_string_view& inout_read_view) { + using namespace std::literals; + object::map_type result; + + advance_whitespace(inout_read_view); + KeyContext key_context; + while (!inout_read_view.empty()) { + // inout_read_view now points to either the start of a key, the end of the object, or invalid data + CharT front = inout_read_view.front(); + if (front == '}') { + // End of object + inout_read_view.remove_prefix(1); + inout_context.out_object = object{ std::move(result) }; // TODO: fix move semantics here + return 1; + } + + // Assert that we've reached the start of a key + if (front != '\"') { + if constexpr (ContextT::use_exceptions) { + throw std::invalid_argument{ jessilib::join_mbstring("Invalid JSON data; unexpected token: '"sv, + decode_codepoint(inout_read_view).codepoint, + "' when parsing object map (expected '\"' instead)"sv) }; + } + + return std::numeric_limits::max(); + } + + // Read in key + inout_read_view.remove_prefix(1); // front quote + // TODO: really should be using the escape sequencing method instead of this + if (string_start_action(key_context, inout_read_view) != 1) { + // Failed to find end of string; any exception would've been thrown in string_start_action + return std::numeric_limits::max(); + } + advance_whitespace(inout_read_view); + + // Insert our value object + auto& value = result[key_context.out_object]; + + // Verify next character is ':' + if (inout_read_view.empty()) { + throw std::invalid_argument{ + "Invalid JSON data; unexpected end of data after parsing map key; expected ':' followed by value" }; + } + front = inout_read_view.front(); + if (front != ':') { + throw std::invalid_argument{ jessilib::join_mbstring("Invalid JSON data; unexpected token: '"sv, + decode_codepoint(inout_read_view).codepoint, + "' when parsing map key (expected ':' instead)"sv) }; + } + inout_read_view.remove_prefix(1); // strip ':' + + // We've reached an object value; parse it + if (!deserialize_json(value, inout_read_view)) { + // Invalid JSON! Any exception would've been thrown already + break; + } + + // Advance through whitespace to ',' or '}' + advance_whitespace(inout_read_view); + + if (inout_read_view.empty()) { + throw std::invalid_argument{ + "Invalid JSON data; unexpected end of data after parsing map value; expected '}'" }; + } + + if (inout_read_view.front() == ',') { + // Strip comma and trailing whitespace + inout_read_view.remove_prefix(1); + advance_whitespace(inout_read_view); + } + } + + if constexpr (ContextT::use_exceptions) { + throw std::invalid_argument{ "Invalid JSON data: unexpected end of data when parsing object array; expected '}'" }; + } + + return std::numeric_limits::max(); +} + +template +constexpr syntax_tree_member make_map_start_pair() { + return { InCodepointV, make_map_start_action }; +} + +template +constexpr syntax_tree_member make_number_pair() { + return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr -> size_t { + // parse integer + const CharT* number_begin = inout_read_view.data() - 1; + intmax_t integer_value{}; + const CharT* from_chars_end = from_chars(number_begin, inout_read_view.data() + inout_read_view.size(), integer_value).ptr; + if constexpr (InCodepointV == '-') { + if (inout_read_view.data() == from_chars_end) { + // Failed to parse integer portion + if constexpr (ContextT::use_exceptions) { + using namespace std::literals; + throw std::invalid_argument{ + jessilib::join_mbstring(u8"Invalid JSON data; unexpected token: '"sv, decode_codepoint(inout_read_view).codepoint, u8"' when parsing number"sv) }; + } + + return std::numeric_limits::max(); + } + } + + // Strip integer portion and return if nothing remains + inout_read_view.remove_prefix(from_chars_end - inout_read_view.data()); + if (inout_read_view.empty() || inout_read_view.front() != '.') { + inout_context.out_object = integer_value; + return 1; + } + + // Parse decimal portion + + /* + // std::from_chars method + long double decimal_value{}; + from_chars_end = std::from_chars(data, data_end, decimal_value).ptr; + return static_cast(integer_value) + decimal_value; + */ + + // parse_decimal_part method + inout_read_view.remove_prefix(1); // strip leading '.' + long double decimal_value = static_cast(integer_value); + from_chars_end = parse_decimal_part(inout_read_view.data(), inout_read_view.data() + inout_read_view.size(), decimal_value); + // TODO: parse exponent + + // Strip decimal portion and return + inout_read_view.remove_prefix(from_chars_end - inout_read_view.data()); + inout_context.out_object = decimal_value; + return 1; + } }; +} + +static constexpr std::u8string_view json_false_remainder{ u8"alse" }; +static constexpr std::u8string_view json_null_remainder{ u8"ull" }; +static constexpr std::u8string_view json_true_remainder{ u8"rue" }; + +template +static constexpr syntax_tree> json_object_tree{ + make_noop_pair, U'\t'>(), + make_noop_pair, U'\n'>(), + make_noop_pair, U'\r'>(), + make_noop_pair, U' '>(), + make_string_start_pair, U'\"'>(), + make_number_pair, U'-'>(), + make_number_pair, U'0'>(), + make_number_pair, U'1'>(), + make_number_pair, U'2'>(), + make_number_pair, U'3'>(), + make_number_pair, U'4'>(), + make_number_pair, U'5'>(), + make_number_pair, U'6'>(), + make_number_pair, U'7'>(), + make_number_pair, U'8'>(), + make_number_pair, U'9'>(), + make_array_start_pair, U'['>(), + make_keyword_value_pair, U'f', json_false_remainder, bool, false>(), + make_keyword_value_pair, U'n', json_null_remainder, std::nullptr_t, nullptr>(), + make_keyword_value_pair, U't', json_true_remainder, bool, true>(), + make_map_start_pair, U'{'>() +}; + +template> DefaultActionF> +bool deserialize_json(object& out_object, std::basic_string_view& inout_read_view) { + if (inout_read_view.empty()) { + // Empty json; false to indicate out_object not modified with any valid data, but no need to throw + return false; + } + + json_context context{ out_object }; + static_assert(is_sorted, std::size(json_object_tree)>(), "Tree must be pre-sorted"); + + return apply_syntax_tree, std::size(json_object_tree), DefaultActionF> + (context, inout_read_view); +} + } // namespace jessilib diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index e78d182..14ca215 100644 --- a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -459,6 +459,22 @@ constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args join_append(out_string, std::forward(in_args)...); } +constexpr void join_mb_append(std::string&){}; // noop + +template>* = nullptr*/> // no char, ambiguous meaning +void join_mb_append(std::string& out_string, InT&& in_string, ArgsT&&... in_args) { + // TODO: is this a valid approach? is mbstate fine it discard between appends? + if constexpr (std::is_same_v) { + out_string += ustring_to_mbstring(std::u32string_view{ &in_string, 1 }).second; + } + else { + out_string += ustring_to_mbstring(in_string).second; + } + + join_mb_append(out_string, std::forward(in_args)...); +} + } // impl_join // Join any number of strings of any type @@ -470,6 +486,13 @@ OutT join(ArgsT&&... args) { return result; } +template +std::string join_mbstring(ArgsT&&... args) { + std::string result; + impl_join::join_mb_append(result, std::forward(args)...); + return result; +} + /** to_lower / to_upper */ //char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement diff --git a/src/include/jessilib/unicode_syntax.hpp b/src/include/jessilib/unicode_syntax.hpp index 8be0d8f..3571215 100644 --- a/src/include/jessilib/unicode_syntax.hpp +++ b/src/include/jessilib/unicode_syntax.hpp @@ -34,10 +34,10 @@ namespace jessilib { */ template -using syntax_tree_action = bool(*)(ContextT& inout_context, std::basic_string_view& inout_read_view); +using syntax_tree_action = size_t(*)(ContextT& inout_context, std::basic_string_view& inout_read_view); template -using default_syntax_tree_action = bool(*)(decode_result in_codepoint, ContextT& inout_context, std::basic_string_view& inout_read_view); +using default_syntax_tree_action = size_t(*)(decode_result in_codepoint, ContextT& inout_context, std::basic_string_view& inout_read_view); template using syntax_tree = const std::pair>[]; @@ -72,35 +72,60 @@ constexpr bool is_sorted() { return true; } -template -bool fail_action(decode_result, ContextT&, std::basic_string_view&) { - return false; +template +size_t fail_action(decode_result, ContextT&, std::basic_string_view& in_read_view) { + using namespace std::literals; + if constexpr (UseExceptionsV) { + std::string exception = "Invalid parse data; unexpected token: '"s; + jessilib::encode_codepoint(exception, in_read_view.front()); + exception += "' when parsing data"; + throw std::invalid_argument{ exception }; + } + return std::numeric_limits::max(); } template -bool noop_action(decode_result decode, ContextT&, std::basic_string_view& inout_read_view) { +size_t noop_action(decode_result decode, ContextT&, std::basic_string_view& inout_read_view) { inout_read_view.remove_prefix(decode.units); - return true; + return 0; } -template SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action DefaultActionF = fail_action> -constexpr syntax_tree_member make_tree_pair() { - return { InCodepointV, [](ContextT& inout_context, std::basic_string_view& inout_read_view) constexpr { - auto decode = decode_codepoint(inout_read_view); - if (decode.units == 0) { - return false; - } - - constexpr syntax_tree_member* SubTreeEnd = SubTreeBegin + SubTreeSize; - auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare); +template SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action DefaultActionF> +constexpr size_t tree_action(ContextT& inout_context, std::basic_string_view& inout_read_view) { + decode_result decode; + size_t break_stack_depth; + constexpr syntax_tree_member* SubTreeEnd = SubTreeBegin + SubTreeSize; + while ((decode = decode_codepoint(inout_read_view)).units != 0) { + auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare); if (parser == SubTreeEnd || parser->first != decode.codepoint) { - return DefaultActionF(decode, inout_context, inout_read_view); + break_stack_depth = DefaultActionF(decode, inout_context, inout_read_view); + if (break_stack_depth == 0) { + // Don't jump the stack; continue + continue; + } + + return break_stack_depth - 1; } // This is a parsed sequence; pass it to the parser inout_read_view.remove_prefix(decode.units); - return (parser->second)(inout_context, inout_read_view); - } }; + break_stack_depth = (parser->second)(inout_context, inout_read_view); + if (break_stack_depth != 0) { + return break_stack_depth - 1; + } + } + + // decode.units == 0; success if view empty, failure otherwise + if (inout_read_view.empty()) { + return 0; + } + + return std::numeric_limits::max(); +} + +template SubTreeBegin, size_t SubTreeSize, default_syntax_tree_action DefaultActionF = fail_action> +constexpr syntax_tree_member make_tree_pair() { + return { InCodepointV, tree_action }; } template SequenceTreeBegin, size_t SequenceTreeSize, @@ -111,29 +136,7 @@ constexpr bool apply_syntax_tree(ContextT& inout_context, std::basic_string_view return true; } - decode_result decode; - constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize; - while ((decode = decode_codepoint(inout_read_view)).units != 0) { - auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &syntax_tree_member_compare); - if (parser == SubTreeEnd || parser->first != decode.codepoint) { - // Just a normal character; pass it to the default handler - if (!DefaultActionF(decode, inout_context, inout_read_view)) { - return false; - } - - continue; - } - - // This is a parsed sequence; pass it to the parser instead - inout_read_view.remove_prefix(decode.units); - if (!(parser->second)(inout_context, inout_read_view)) { - // Bad input received; give up - return false; - } - } - - // We've finished parsing successfully - return true; + return tree_action(inout_context, inout_read_view) == 0; } } // namespace jessilib diff --git a/src/include/jessilib/util.hpp b/src/include/jessilib/util.hpp index 87cdde0..9979888 100644 --- a/src/include/jessilib/util.hpp +++ b/src/include/jessilib/util.hpp @@ -28,8 +28,8 @@ namespace jessilib { -template -const char* parse_decimal_part(const char* in_str, const char* in_str_end, T& out_value) { +template +const CharT* parse_decimal_part(const CharT* in_str, const CharT* in_str_end, NumberT& out_value) { int denominator = 10; while (in_str != in_str_end) { switch (*in_str) { @@ -44,10 +44,10 @@ const char* parse_decimal_part(const char* in_str, const char* in_str_end, T& ou case '8': case '9': if (out_value >= 0.0) { - out_value += (static_cast(*in_str - '0') / denominator); + out_value += (static_cast(*in_str - '0') / denominator); } else { - out_value -= (static_cast(*in_str - '0') / denominator); + out_value -= (static_cast(*in_str - '0') / denominator); } denominator *= 10; break; @@ -62,29 +62,66 @@ const char* parse_decimal_part(const char* in_str, const char* in_str_end, T& ou return in_str; } -template -std::from_chars_result from_chars(const char* in_str, const char* in_str_end, T& out_value) { +template +struct from_chars_result { + const CharT* ptr; + std::errc ec; +}; + +template* = nullptr> +from_chars_result from_chars(const CharT* in_str, const CharT* in_str_end, NumberT& out_value) { + std::from_chars_result std_result{}; + // TODO: use std::from_chars when available for floating point types - if constexpr (std::is_floating_point::value) { + if constexpr (std::is_floating_point::value) { // Read integer portion long long integer_value{}; - std::from_chars_result result{ std::from_chars(in_str, in_str_end, integer_value) }; + std_result = std::from_chars(reinterpret_cast(in_str), reinterpret_cast(in_str_end), integer_value); out_value = integer_value; // Read decimal portion (if one exists) - if (result.ptr != in_str_end && *result.ptr == '.') { - ++result.ptr; - result.ptr = parse_decimal_part(result.ptr, in_str_end, out_value); - result.ec = std::errc{}; + if (std_result.ptr != in_str_end && *std_result.ptr == '.') { + ++std_result.ptr; + std_result.ptr = parse_decimal_part(std_result.ptr, in_str_end, out_value); + std_result.ec = std::errc{}; } // TODO: Read exponents - - return result; } else { - return std::from_chars(in_str, in_str_end, out_value); + std_result = std::from_chars(reinterpret_cast(in_str), reinterpret_cast(in_str_end), out_value); + } + + return { reinterpret_cast(std_result.ptr), std_result.ec }; +} + +// All characters passed in MUST be in: [U'0', U'9'], '.' +template* = nullptr> +from_chars_result from_chars(CharT* in_str, CharT* in_str_end, NumberT& out_value) { + // Copy our string into a temporary buffer, then use from_chars on that + char buffer[256]; // TODO: get some sort of metrics on this to figure out if this is acceptable temporary approach + size_t distance = static_cast(in_str_end - in_str); + if (distance > sizeof(buffer)) { + // Way too much data; just slice it off, maybe add a debug assertion. This method's supposed to be temporary. + distance = sizeof(buffer); + in_str_end = in_str + distance; } + + // Copy the view into the buffer + char* itr = buffer; + while (in_str != in_str_end) { + *itr = static_cast(*in_str); + ++itr; + ++in_str; + } + + // leverage from_chars + char* buffer_end = buffer + distance; + auto char_result = from_chars(buffer, buffer_end, out_value); + distance = char_result.ptr - buffer; + return { in_str + distance, char_result.ec }; } template diff --git a/src/test/parsers/json.cpp b/src/test/parsers/json.cpp index d96a897..514c4d6 100644 --- a/src/test/parsers/json.cpp +++ b/src/test/parsers/json.cpp @@ -122,6 +122,22 @@ TEST(JsonParser, deserialize_string) { json_parser parser; EXPECT_EQ(parser.deserialize(R"json("text")json"sv), u8"text"); + + object obj; + std::u8string_view u8text = u8R"json("text")json"sv; + EXPECT_TRUE(deserialize_json(obj, u8text)); + EXPECT_EQ(obj, u8"text"sv); + EXPECT_TRUE(u8text.empty()); + + std::u16string_view u16text = uR"json("text")json"sv; + EXPECT_TRUE(deserialize_json(obj, u16text)); + EXPECT_EQ(obj, u8"text"sv); + EXPECT_TRUE(u8text.empty()); + + std::u32string_view u32text = UR"json("text")json"sv; + EXPECT_TRUE(deserialize_json(obj, u32text)); + EXPECT_EQ(obj, u8"text"sv); + EXPECT_TRUE(u8text.empty()); } TEST(JsonParser, deserialize_array) { @@ -157,7 +173,7 @@ TEST(JsonParser, deserialize_array_nested) { 1,2,3, null, "text", - [5,6,7], + [5,6,7] ] , [ ] , [ " text " ], 12.34, 0.1234,