Browse Source

Add join(), refactored sequence parser to use sorted constexpr array instead of std::map

master
Jessica James 3 years ago
parent
commit
bce3bfefc6
  1. 46
      src/include/jessilib/unicode.hpp
  2. 134
      src/include/jessilib/unicode_sequence.hpp

46
src/include/jessilib/unicode.hpp

@ -692,6 +692,52 @@ inline void find_if(std::basic_string_view<char>& in_string, find_if_view_predic
} }
} }
namespace impl_join {
/** Base case: joining nothing requires no storage. */
constexpr size_t join_sizes() {
	return 0;
}

/**
 * Sums the size() of every argument.
 *
 * The result is a count of code units in each argument's own encoding, not a byte count and not an
 * upper bound on the joined length: an argument that has to be re-encoded into a different
 * character type may need more (or fewer) units than its size() reports.
 *
 * @param in_arg First string-like argument (anything exposing size())
 * @param in_args Remaining string-like arguments
 * @return Sum of size() across all arguments
 */
template<typename FirstArgT, typename... ArgsT>
constexpr size_t join_sizes(const FirstArgT& in_arg, const ArgsT&... in_args) {
	size_t total = in_arg.size();
	((total += in_args.size()), ...); // accumulate the remaining sizes, left to right
	return total;
}
template<typename T>
constexpr void join_append(T&){}; // noop
template<typename OutT, typename InT, typename... ArgsT>
constexpr void join_append(OutT& out_string, InT&& in_string, ArgsT&&... in_args) {
using InCharT = typename std::remove_cvref_t<InT>::value_type;
if constexpr (std::is_same_v<typename std::remove_cvref_t<OutT>::value_type, typename std::remove_cvref_t<InT>::value_type>) {
// Join these straight together
out_string += std::forward<InT>(in_string);
}
else {
// Append over all the codepoints
get_endpoint_result decode;
std::basic_string_view<InCharT> in_view = in_string;
while ((decode = decode_codepoint(in_view)).units != 0) {
encode_codepoint(out_string, decode.codepoint);
in_view.remove_prefix(decode.units);
}
}
join_append(out_string, std::forward<ArgsT>(in_args)...);
}
} // impl_join
/**
 * Joins any number of strings, of any (possibly differing) character types, into one OutT.
 *
 * Storage is reserved up front using the summed size() of the arguments, then each argument is
 * appended in order through impl_join::join_append.
 *
 * @tparam OutT String type to build and return
 * @param args Strings to join, in order
 * @return The joined string
 */
template<typename OutT, typename... ArgsT>
OutT join(ArgsT&&... args) {
	const auto reserve_hint = impl_join::join_sizes(args...);

	OutT joined;
	joined.reserve(reserve_hint);
	impl_join::join_append<OutT, ArgsT...>(joined, std::forward<ArgsT>(args)...);
	return joined;
}
/** to_lower / to_upper */ /** to_lower / to_upper */
//char32_t to_lower(char32_t in_chr); // TODO: implement //char32_t to_lower(char32_t in_chr); // TODO: implement
//char32_t to_upper(char32_t in_chr); // TODO: implement //char32_t to_upper(char32_t in_chr); // TODO: implement

134
src/include/jessilib/unicode_sequence.hpp

@ -35,14 +35,19 @@ template<typename CharT>
using shrink_sequence_tree_action = bool(*)(CharT*& in_write_head, std::basic_string_view<CharT>& read_view); using shrink_sequence_tree_action = bool(*)(CharT*& in_write_head, std::basic_string_view<CharT>& read_view);
template<typename CharT> template<typename CharT>
using shrink_sequence_tree = std::map<char32_t, shrink_sequence_tree_action<CharT>>; using shrink_sequence_tree = const std::pair<char32_t, shrink_sequence_tree_action<CharT>>[];
template<typename CharT> template<typename CharT>
using shrink_sequence_tree_member = std::pair<char32_t, shrink_sequence_tree_action<CharT>>; using shrink_sequence_tree_member = const std::pair<char32_t, shrink_sequence_tree_action<CharT>>;
/**
 * Ordering predicate for searching a sorted sequence tree by codepoint (as passed to std::lower_bound).
 *
 * Declared constexpr so that it does not stand in the way of evaluating the lookup in a constant
 * expression.
 *
 * @param in_lhs Tree member whose key (first) is compared
 * @param in_rhs Codepoint being searched for
 * @return true if the member's key orders strictly before in_rhs, false otherwise
 */
template<typename CharT>
constexpr bool shrink_tree_member_compare(const shrink_sequence_tree_member<CharT>& in_lhs, const char32_t in_rhs) {
	return in_lhs.first < in_rhs;
}
// Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed // Only use for ASTs where each character process is guaranteed to write at most 1 character for each character consumed
template<typename CharT, typename SequenceTreeT> template<typename CharT, const shrink_sequence_tree<CharT> SequenceTreeBegin, size_t SequenceTreeSize>
bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string, const SequenceTreeT& in_tree) { bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string) {
if (inout_string.empty()) { if (inout_string.empty()) {
// Nothing to parse // Nothing to parse
return true; return true;
@ -52,9 +57,10 @@ bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string, const Se
CharT* write_head = inout_string.data(); CharT* write_head = inout_string.data();
get_endpoint_result decode; get_endpoint_result decode;
while ((decode = decode_codepoint(read_view)).units != 0) { constexpr auto SubTreeEnd = SequenceTreeBegin + SequenceTreeSize;
auto parser = in_tree.find(decode.codepoint); while ((decode = decode_codepoint(read_view)).units != 0) { // TODO: make constexpr
if (parser == in_tree.end()) { auto parser = std::lower_bound(SequenceTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
if (parser == SubTreeEnd || parser->first != decode.codepoint) {
// Just a normal character; write it over // Just a normal character; write it over
while (decode.units != 0) { while (decode.units != 0) {
*write_head = read_view.front(); *write_head = read_view.front();
@ -84,10 +90,10 @@ bool apply_shrink_sequence_tree(std::basic_string<CharT>& inout_string, const Se
// Only for codepoints representable w/ char8_t (i.e: \n) // Only for codepoints representable w/ char8_t (i.e: \n)
template<typename CharT, char32_t InCodepointV, char8_t OutCodepointV> template<typename CharT, char32_t InCodepointV, char8_t OutCodepointV>
shrink_sequence_tree_member<CharT> make_simple_sequence_pair() { constexpr shrink_sequence_tree_member<CharT> make_simple_sequence_pair() {
return { return {
InCodepointV, InCodepointV,
[](CharT*& in_write_head, std::basic_string_view<CharT>&) { [](CharT*& in_write_head, std::basic_string_view<CharT>&) constexpr {
*in_write_head = static_cast<CharT>(OutCodepointV); *in_write_head = static_cast<CharT>(OutCodepointV);
++in_write_head; ++in_write_head;
return true; return true;
@ -97,10 +103,10 @@ shrink_sequence_tree_member<CharT> make_simple_sequence_pair() {
// Skips a character (i.e: skipping/ignoring newlines) // Skips a character (i.e: skipping/ignoring newlines)
template<typename CharT, char32_t InCodepointV> template<typename CharT, char32_t InCodepointV>
shrink_sequence_tree_member<CharT> make_noop_sequence_pair() { constexpr shrink_sequence_tree_member<CharT> make_noop_sequence_pair() {
return { return {
InCodepointV, InCodepointV,
[](CharT*&, std::basic_string_view<CharT>&) { [](CharT*&, std::basic_string_view<CharT>&) constexpr {
return true; return true;
} }
}; };
@ -108,10 +114,10 @@ shrink_sequence_tree_member<CharT> make_noop_sequence_pair() {
// Skips a character or two (i.e: skipping/ignoring newlines) // Skips a character or two (i.e: skipping/ignoring newlines)
template<typename CharT, char32_t InCodepointV, char32_t InOptionalTrailing> template<typename CharT, char32_t InCodepointV, char32_t InOptionalTrailing>
shrink_sequence_tree_member<CharT> make_noop_sequence_pair() { constexpr shrink_sequence_tree_member<CharT> make_noop_sequence_pair() {
return { return {
InCodepointV, InCodepointV,
[](CharT*&, std::basic_string_view<CharT>& read_view) { [](CharT*&, std::basic_string_view<CharT>& read_view) constexpr {
// Strip trailing 'InTrailing', if it's present // Strip trailing 'InTrailing', if it's present
auto decode = decode_codepoint(read_view); auto decode = decode_codepoint(read_view);
if (decode.units != 0 if (decode.units != 0
@ -125,7 +131,7 @@ shrink_sequence_tree_member<CharT> make_noop_sequence_pair() {
} }
template<typename CharT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode> template<typename CharT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode>
shrink_sequence_tree_member<CharT> make_octal_sequence_pair() { constexpr shrink_sequence_tree_member<CharT> make_octal_sequence_pair() {
static_assert(MaxDigitsV > 0); // Use noop instead static_assert(MaxDigitsV > 0); // Use noop instead
static_assert((MaxDigitsV == 2 && InCodepointV >= U'0' && InCodepointV <= U'7') static_assert((MaxDigitsV == 2 && InCodepointV >= U'0' && InCodepointV <= U'7')
|| (MaxDigitsV == 3 && InCodepointV >= U'0' && InCodepointV <= U'3')); // Only currently support single-octet octal values || (MaxDigitsV == 3 && InCodepointV >= U'0' && InCodepointV <= U'3')); // Only currently support single-octet octal values
@ -133,7 +139,7 @@ shrink_sequence_tree_member<CharT> make_octal_sequence_pair() {
// Must have at least 1 octal digit (this one), but may not have more than 3 (2 more). // Must have at least 1 octal digit (this one), but may not have more than 3 (2 more).
return { return {
InCodepointV, InCodepointV,
[](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) { [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) constexpr {
// Read in first octal character from InCodepointV // Read in first octal character from InCodepointV
unsigned int out_value = InCodepointV - U'0'; // Set initial value unsigned int out_value = InCodepointV - U'0'; // Set initial value
if (read_view.empty()) { if (read_view.empty()) {
@ -198,12 +204,12 @@ shrink_sequence_tree_member<CharT> make_octal_sequence_pair() {
} }
template<typename CharT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode> template<typename CharT, char32_t InCodepointV, size_t MaxDigitsV, bool ExactDigitsV, bool IsUnicode>
shrink_sequence_tree_member<CharT> make_hex_sequence_pair() { constexpr shrink_sequence_tree_member<CharT> make_hex_sequence_pair() {
static_assert(MaxDigitsV > 0); static_assert(MaxDigitsV > 0);
return { return {
InCodepointV, InCodepointV,
[](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) { [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) constexpr {
// Does not modify // Does not modify
auto read_hex = [](uint32_t& out_value, std::basic_string_view<CharT> in_view, size_t max_digits) { auto read_hex = [](uint32_t& out_value, std::basic_string_view<CharT> in_view, size_t max_digits) {
size_t result{}; size_t result{};
@ -263,12 +269,14 @@ shrink_sequence_tree_member<CharT> make_hex_sequence_pair() {
} }
// Calls into another tree with the next character // Calls into another tree with the next character
template<typename CharT, char32_t InCodepointV, const shrink_sequence_tree<CharT>& SubTreeR, bool FailNotFound = true> template<typename CharT, char32_t InCodepointV, const shrink_sequence_tree<CharT> SubTreeBegin, size_t SubTreeSize, bool FailNotFound = true>
shrink_sequence_tree_member<CharT> make_tree_sequence_pair() { constexpr shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) { return { InCodepointV, [](CharT*& in_write_head, std::basic_string_view<CharT>& read_view) {
auto decode = decode_codepoint(read_view); auto decode = decode_codepoint(read_view); // TODO: make constexpr
auto parser = SubTreeR.find(decode.codepoint);
if (parser == SubTreeR.end()) { constexpr auto SubTreeEnd = SubTreeBegin + SubTreeSize;
auto parser = std::lower_bound(SubTreeBegin, SubTreeEnd, decode.codepoint, &shrink_tree_member_compare<CharT>);
if (parser == SubTreeEnd || parser->first != decode.codepoint) {
if constexpr (FailNotFound) { if constexpr (FailNotFound) {
// Code not found; fail // Code not found; fail
return false; return false;
@ -291,29 +299,38 @@ shrink_sequence_tree_member<CharT> make_tree_sequence_pair() {
} }; } };
} }
// Return true for valid sequences, false otherwise // Lessers on left
/**
 * Checks that the keys (first) of a sequence tree are in non-descending order.
 *
 * Equal neighbouring keys are accepted; only a key greater than its successor fails the check.
 * Trees with zero or one member are trivially sorted.
 *
 * @tparam SubTreeBegin First member of the tree
 * @tparam SubTreeSize Number of members in the tree
 * @return true if no member's key is greater than the key of the member after it
 */
template<typename CharT, const shrink_sequence_tree<CharT> SubTreeBegin, size_t SubTreeSize>
constexpr bool is_sorted() {
	// Compare each member against its predecessor
	for (size_t index = 1; index < SubTreeSize; ++index) {
		if (SubTreeBegin[index - 1].first > SubTreeBegin[index].first) {
			return false;
		}
	}

	return true;
}
template<typename CharT> template<typename CharT>
bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) { static constexpr shrink_sequence_tree<CharT> cpp_escapes_main_tree{
// Handles parsing first character of escape sequence
static const shrink_sequence_tree<CharT> main_tree{
/** Newline skippers; not actually a C++ thing, but I want it */ /** Newline skippers; not actually a C++ thing, but I want it */
make_noop_sequence_pair<CharT, U'\n', U'\r'>(), make_noop_sequence_pair<CharT, U'\n', U'\r'>(),
make_noop_sequence_pair<CharT, U'\r', U'\n'>(), make_noop_sequence_pair<CharT, U'\r', U'\n'>(),
/** Simple escape sequences */ /** Simple quote escape sequences */
make_simple_sequence_pair<CharT, U'\'', '\''>(),
make_simple_sequence_pair<CharT, U'\"', '\"'>(), make_simple_sequence_pair<CharT, U'\"', '\"'>(),
make_simple_sequence_pair<CharT, U'?', '\?'>(), make_simple_sequence_pair<CharT, U'\'', '\''>(),
make_simple_sequence_pair<CharT, U'\\', '\\'>(),
make_simple_sequence_pair<CharT, U'a', '\a'>(),
make_simple_sequence_pair<CharT, U'b', '\b'>(),
make_simple_sequence_pair<CharT, U'f', '\f'>(),
make_simple_sequence_pair<CharT, U'n', '\n'>(),
make_simple_sequence_pair<CharT, U'r', '\r'>(),
make_simple_sequence_pair<CharT, U't', '\t'>(),
make_simple_sequence_pair<CharT, U'v', '\v'>(),
/** Numeric escape sequences */
// Octal (Single byte value only); should we support octal escapes in sequence? // Octal (Single byte value only); should we support octal escapes in sequence?
make_octal_sequence_pair<CharT, U'0', 3, false, false>(), make_octal_sequence_pair<CharT, U'0', 3, false, false>(),
make_octal_sequence_pair<CharT, U'1', 3, false, false>(), make_octal_sequence_pair<CharT, U'1', 3, false, false>(),
@ -324,20 +341,41 @@ bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
make_octal_sequence_pair<CharT, U'6', 2, false, false>(), make_octal_sequence_pair<CharT, U'6', 2, false, false>(),
make_octal_sequence_pair<CharT, U'7', 2, false, false>(), make_octal_sequence_pair<CharT, U'7', 2, false, false>(),
// Hex; should we support hex escapes in sequence? (i.e: \x00FF == \x00\xFF, which is only true for char/char8_t atm) /** Simple escape sequence (question mark) */
make_hex_sequence_pair<CharT, U'x', sizeof(CharT) * 2, false, false>(), make_simple_sequence_pair<CharT, U'?', '\?'>(),
/** Unicode escape sequences */ /** Uppercase escapes */
make_hex_sequence_pair<CharT, U'u', 4, true, true>(),
make_hex_sequence_pair<CharT, U'U', 8, true, true>(), make_hex_sequence_pair<CharT, U'U', 8, true, true>(),
};
// Only checks for '\' /** Simple escape sequence (backslash) */
static const shrink_sequence_tree<CharT> root_tree{ make_simple_sequence_pair<CharT, U'\\', '\\'>(),
make_tree_sequence_pair<CharT, U'\\', main_tree>()
}; /** Lowercase escapes */
make_simple_sequence_pair<CharT, U'a', '\a'>(),
make_simple_sequence_pair<CharT, U'b', '\b'>(),
make_simple_sequence_pair<CharT, U'f', '\f'>(),
make_simple_sequence_pair<CharT, U'n', '\n'>(),
make_simple_sequence_pair<CharT, U'r', '\r'>(),
make_simple_sequence_pair<CharT, U't', '\t'>(),
make_hex_sequence_pair<CharT, U'u', 4, true, true>(),
make_simple_sequence_pair<CharT, U'v', '\v'>(),
// Hexadecimal; should we support hex escapes in sequence? (i.e: \x00FF == \x00\xFF, which is only true for char/char8_t atm)
make_hex_sequence_pair<CharT, U'x', sizeof(CharT) * 2, false, false>(),
};
// Root tree for C++ escape parsing: its single entry matches '\\' and looks up the codepoint that
// follows in cpp_escapes_main_tree. FailNotFound is left at make_tree_sequence_pair's default (true),
// so a '\\' followed by a codepoint that is not in the main tree makes the action return false.
template<typename CharT>
static constexpr shrink_sequence_tree<CharT> cpp_escapes_root_tree{
make_tree_sequence_pair<CharT, U'\\', cpp_escapes_main_tree<CharT>, std::size(cpp_escapes_main_tree<CharT>)>()
};
// Return true for valid sequences, false otherwise
template<typename CharT>
bool apply_cpp_escape_sequences(std::basic_string<CharT>& inout_string) {
static_assert(is_sorted<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(), "Tree must be pre-sorted");
static_assert(is_sorted<CharT, cpp_escapes_main_tree<CharT>, std::size(cpp_escapes_main_tree<CharT>)>(), "Tree must be pre-sorted");
return apply_shrink_sequence_tree(inout_string, root_tree); return apply_shrink_sequence_tree<CharT, cpp_escapes_root_tree<CharT>, std::size(cpp_escapes_root_tree<CharT>)>(inout_string);
} }
} // namespace jessilib } // namespace jessilib

Loading…
Cancel
Save