diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index 34d45bb..de812f8 100644 --- a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -692,6 +692,52 @@ inline void find_if(std::basic_string_view& in_string, find_if_view_predic } } +namespace impl_join { + +constexpr size_t join_sizes() { + return 0; +} + +// Returns the sum of size() over all arguments; join() passes this to reserve() as a capacity estimate for the joined string +template +constexpr size_t join_sizes(const FirstArgT& in_arg, const ArgsT&... in_args) { + return in_arg.size() + join_sizes(in_args...); +} + +template +constexpr void join_append(T&){}; // noop + +template +constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args) { + using InCharT = typename std::remove_cvref_t::value_type; + if constexpr (std::is_same_v::value_type, typename std::remove_cvref_t::value_type>) { + // Join these straight together + out_string += std::forward(in_string); + } + else { + // Append over all the codepoints + get_endpoint_result decode; + std::basic_string_view in_view = in_string; + while ((decode = decode_codepoint(in_view)).units != 0) { + encode_codepoint(out_string, decode.codepoint); + in_view.remove_prefix(decode.units); + } + } + + join_append(out_string, std::forward(in_args)...); +} + +} // impl_join + +// Join any number of strings of any type +template +OutT join(ArgsT&&... 
args) { + OutT result; + result.reserve(impl_join::join_sizes(args...)); + impl_join::join_append(result, std::forward(args)...); + return result; +} + /** to_lower / to_upper */ //char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement diff --git a/src/include/jessilib/unicode_sequence.hpp b/src/include/jessilib/unicode_sequence.hpp index 935a132..b8c0678 100644 --- a/src/include/jessilib/unicode_sequence.hpp +++ b/src/include/jessilib/unicode_sequence.hpp @@ -35,14 +35,19 @@ template using shrink_sequence_tree_action = bool(*)(CharT*& in_write_head, std::basic_string_view& read_view); template -using shrink_sequence_tree = std::map>; +using shrink_sequence_tree = const std::pair>[]; template -using shrink_sequence_tree_member = std::pair>; +using shrink_sequence_tree_member = const std::pair>; + +template +bool shrink_tree_member_compare(const shrink_sequence_tree_member& in_lhs, const char32_t in_rhs) { + return in_lhs.first < in_rhs; +} // Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed -template -bool apply_shrink_sequence_tree(std::basic_string& inout_string, const SequenceTreeT& in_tree) { +template SequenceTreeBegin, size_t SequenceTreeSize> +bool apply_shrink_sequence_tree(std::basic_string& inout_string) { if (inout_string.empty()) { // Nothing to parse return true; @@ -52,9 +57,10 @@ bool apply_shrink_sequence_tree(std::basic_string& inout_string, const Se CharT* write_head = inout_string.data(); get_endpoint_result decode; - while ((decode = decode_codepoint(read_view)).units != 0) { - auto parser = in_tree.find(decode.codepoint); - if (parser == in_tree.end()) { + constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize; + while ((decode = decode_codepoint(read_view)).units != 0) { // TODO: make constexpr + auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare); + if 
(parser == SubTreeEnd || parser->first != decode.codepoint) { // Just a normal character; write it over while (decode.units != 0) { *write_head = read_view.front(); @@ -84,10 +90,10 @@ bool apply_shrink_sequence_tree(std::basic_string& inout_string, const Se // Only for codepoints representable w/ char8_t (i.e: \n) template -shrink_sequence_tree_member make_simple_sequence_pair() { +constexpr shrink_sequence_tree_member make_simple_sequence_pair() { return { InCodepointV, - [](CharT*& in_write_head, std::basic_string_view&) { + [](CharT*& in_write_head, std::basic_string_view&) constexpr { *in_write_head = static_cast(OutCodepointV); ++in_write_head; return true; @@ -97,10 +103,10 @@ shrink_sequence_tree_member make_simple_sequence_pair() { // Skips a character (i.e: skipping/ignoring newlines) template -shrink_sequence_tree_member make_noop_sequence_pair() { +constexpr shrink_sequence_tree_member make_noop_sequence_pair() { return { InCodepointV, - [](CharT*&, std::basic_string_view&) { + [](CharT*&, std::basic_string_view&) constexpr { return true; } }; @@ -108,10 +114,10 @@ shrink_sequence_tree_member make_noop_sequence_pair() { // Skips a character or two (i.e: skipping/ignoring newlines) template -shrink_sequence_tree_member make_noop_sequence_pair() { +constexpr shrink_sequence_tree_member make_noop_sequence_pair() { return { InCodepointV, - [](CharT*&, std::basic_string_view& read_view) { + [](CharT*&, std::basic_string_view& read_view) constexpr { // Strip trailing 'InTrailing', if it's present auto decode = decode_codepoint(read_view); if (decode.units != 0 @@ -125,7 +131,7 @@ shrink_sequence_tree_member make_noop_sequence_pair() { } template -shrink_sequence_tree_member make_octal_sequence_pair() { +constexpr shrink_sequence_tree_member make_octal_sequence_pair() { static_assert(MaxDigitsV > 0); // Use noop instead static_assert((MaxDigitsV == 2 && InCodepointV >= U'0' && InCodepointV <= U'7') || (MaxDigitsV == 3 && InCodepointV >= U'0' && InCodepointV <= 
U'3')); // Only currently support single-octet octal values @@ -133,7 +139,7 @@ shrink_sequence_tree_member make_octal_sequence_pair() { // Must have at least 1 octal digit (this one), but may not have more than 3 (2 more). return { InCodepointV, - [](CharT*& in_write_head, std::basic_string_view& read_view) { + [](CharT*& in_write_head, std::basic_string_view& read_view) constexpr { // Read in first octal character from InCodepointV unsigned int out_value = InCodepointV - U'0'; // Set initial value if (read_view.empty()) { @@ -198,12 +204,12 @@ shrink_sequence_tree_member make_octal_sequence_pair() { } template -shrink_sequence_tree_member make_hex_sequence_pair() { +constexpr shrink_sequence_tree_member make_hex_sequence_pair() { static_assert(MaxDigitsV > 0); return { InCodepointV, - [](CharT*& in_write_head, std::basic_string_view& read_view) { + [](CharT*& in_write_head, std::basic_string_view& read_view) constexpr { // Does not modify auto read_hex = [](uint32_t& out_value, std::basic_string_view in_view, size_t max_digits) { size_t result{}; @@ -263,12 +269,14 @@ shrink_sequence_tree_member make_hex_sequence_pair() { } // Calls into another tree with the next character -template& SubTreeR, bool FailNotFound = true> -shrink_sequence_tree_member make_tree_sequence_pair() { +template SubTreeBegin, size_t SubTreeSize, bool FailNotFound = true> +constexpr shrink_sequence_tree_member make_tree_sequence_pair() { return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view& read_view) { - auto decode = decode_codepoint(read_view); - auto parser = SubTreeR.find(decode.codepoint); - if (parser == SubTreeR.end()) { + auto decode = decode_codepoint(read_view); // TODO: make constexpr + + constexpr auto SubTreeEnd = SubTreeBegin + SubTreeSize; + auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare); + if (parser == SubTreeEnd || parser->first != decode.codepoint) { if constexpr (FailNotFound) { // Code not 
found; fail return false; @@ -291,53 +299,83 @@ shrink_sequence_tree_member make_tree_sequence_pair() { } }; } +// Returns true if the entries' keys (.first) are in non-decreasing order, as the std::lower_bound lookups require +template SubTreeBegin, size_t SubTreeSize> +constexpr bool is_sorted() { + auto head = SubTreeBegin; + constexpr auto end = SubTreeBegin + SubTreeSize; + + if (head == end) { + return true; + } + + while (head + 1 != end) { + const auto next = head + 1; + if (head->first > next->first) { + return false; + } + + ++head; + } + + return true; +} + +template +static constexpr shrink_sequence_tree cpp_escapes_main_tree{ + /** Newline skippers; not actually a C++ thing, but I want it */ + make_noop_sequence_pair(), + make_noop_sequence_pair(), + + /** Simple quote escape sequences */ + make_simple_sequence_pair(), + make_simple_sequence_pair(), + + // Octal (Single byte value only); should we support octal escapes in sequence? + make_octal_sequence_pair(), + make_octal_sequence_pair(), + make_octal_sequence_pair(), + make_octal_sequence_pair(), + make_octal_sequence_pair(), + make_octal_sequence_pair(), + make_octal_sequence_pair(), + make_octal_sequence_pair(), + + /** Simple escape sequence (question mark) */ + make_simple_sequence_pair(), + + /** Uppercase escapes */ + make_hex_sequence_pair(), + + /** Simple escape sequence (backslash) */ + make_simple_sequence_pair(), + + /** Lowercase escapes */ + make_simple_sequence_pair(), + make_simple_sequence_pair(), + make_simple_sequence_pair(), + make_simple_sequence_pair(), + make_simple_sequence_pair(), + make_simple_sequence_pair(), + make_hex_sequence_pair(), + make_simple_sequence_pair(), + + // Hexadecimal; should we support hex escapes in sequence? 
(i.e: \x00FF == \x00\xFF, which is only true for char/char8_t atm) + make_hex_sequence_pair(), +}; + +template +static constexpr shrink_sequence_tree cpp_escapes_root_tree{ + make_tree_sequence_pair, std::size(cpp_escapes_main_tree)>() +}; + // Return true for valid sequences, false otherwise template bool apply_cpp_escape_sequences(std::basic_string& inout_string) { - // Handles parsing first character of escape sequence - static const shrink_sequence_tree main_tree{ - /** Newline skippers; not actually a C++ thing, but I want it */ - make_noop_sequence_pair(), - make_noop_sequence_pair(), - - /** Simple escape sequences */ - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - make_simple_sequence_pair(), - - /** Numeric escape sequences */ - // Octal (Single byte value only); should we support octal escapes in sequence? - make_octal_sequence_pair(), - make_octal_sequence_pair(), - make_octal_sequence_pair(), - make_octal_sequence_pair(), - make_octal_sequence_pair(), - make_octal_sequence_pair(), - make_octal_sequence_pair(), - make_octal_sequence_pair(), - - // Hex; should we support hex escapes in sequence? 
(i.e: \x00FF == \x00\xFF, which is only true for char/char8_t atm) - make_hex_sequence_pair(), - - /** Unicode escape sequences */ - make_hex_sequence_pair(), - make_hex_sequence_pair(), - }; - - // Only checks for '\' - static const shrink_sequence_tree root_tree{ - make_tree_sequence_pair() - }; + static_assert(is_sorted, std::size(cpp_escapes_root_tree)>(), "Tree must be pre-sorted"); + static_assert(is_sorted, std::size(cpp_escapes_main_tree)>(), "Tree must be pre-sorted"); - return apply_shrink_sequence_tree(inout_string, root_tree); + return apply_shrink_sequence_tree, std::size(cpp_escapes_root_tree)>(inout_string); } } // namespace jessilib