Browse Source

Add various string helpers to unicode.hpp & appropriate tests; fix a bug with word_split

master
Jessica James 3 years ago
parent
commit
8e59b16eb2
  1. 253
      src/common/unicode.cpp
  2. 374
      src/include/jessilib/unicode.hpp
  3. 2
      src/include/jessilib/word_split.hpp
  4. 5
      src/test/CMakeLists.txt
  5. 1624
      src/test/data/CaseFolding.txt
  6. 351
      src/test/unicode.cpp

253
src/common/unicode.cpp

@ -266,4 +266,257 @@ get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t i
return { 0, 0 }; return { 0, 0 };
} }
/**
* Codepoint folding (case-insensitive character comparisons)
*/
/**
 * Describes how one contiguous, inclusive range of codepoints is case-folded.
 *
 * Kept as a plain aggregate (mode, range_start, range_end, diff) so that tables of
 * folding_set can be brace-initialized.
 */
struct folding_set {
	/** Selects which codepoints within the range have `diff` applied */
	enum class mode_type {
		constant, // diff is applied to every codepoint passed to fold()
		alternating, // diff is applied only to codepoints with the same parity as range_start
		single // handled exactly like constant by fold()
	} mode = mode_type::single;

	uint32_t range_start; // first codepoint of the range (inclusive)
	uint32_t range_end; // last codepoint of the range (inclusive)
	int64_t diff; // signed offset added to a codepoint to produce its folded value

	/**
	 * Folds a codepoint according to this set; does not check that the codepoint is within the range
	 *
	 * @param in_codepoint Codepoint to fold
	 * @return in_codepoint + diff when diff applies, in_codepoint unchanged otherwise
	 */
	uint32_t fold(uint32_t in_codepoint) const {
		if (mode == mode_type::alternating) {
			// Only every other codepoint (the ones lined up with range_start) gets folded
			const bool lines_up_with_start = (in_codepoint % 2) == (range_start % 2);
			if (!lines_up_with_start) {
				return in_codepoint;
			}
		}

		return static_cast<uint32_t>(in_codepoint + diff);
	}
};
// A folding_set sorts before a codepoint when its entire range lies below that codepoint
bool operator<(const folding_set& in_lhs, const uint32_t in_rhs) {
	return in_lhs.range_end < in_rhs;
}
// A codepoint sorts before a folding_set when it lies below the start of the set's range
bool operator<(const uint32_t in_lhs, const folding_set& in_rhs) {
	return in_rhs.range_start > in_lhs;
}
/**
 * Folds a single codepoint for case-insensitive comparison.
 *
 * Looks the codepoint up in a static table of folding_set ranges and applies the matching set's offset.
 *
 * @param in_codepoint Codepoint to fold
 * @return Folded codepoint, or in_codepoint unchanged if no range in the table contains it
 */
char32_t fold(char32_t in_codepoint) {
// Break this up? The most common case is going to be points < 0x80, but even without breaking this up we'll only
// perform at most like 8+1 integer comparisons, which is hardly a major constraint
// NOTE: entries must stay sorted by range with no overlaps; the lookup below is a binary search (std::lower_bound)
// Each entry is { mode, range_start (inclusive), range_end (inclusive), diff }
static constexpr folding_set s_folding_sets[]{
{ folding_set::mode_type::constant, 0x0041, 0x005A, 32 },
{ folding_set::mode_type::single, 0x00B5, 0x00B5, 775 },
{ folding_set::mode_type::constant, 0x00C0, 0x00D6, 32 },
{ folding_set::mode_type::constant, 0x00D8, 0x00DE, 32 },
{ folding_set::mode_type::alternating, 0x0100, 0x012E, 1 },
{ folding_set::mode_type::alternating, 0x0132, 0x0136, 1 },
{ folding_set::mode_type::alternating, 0x0139, 0x0147, 1 },
{ folding_set::mode_type::alternating, 0x014A, 0x0176, 1 },
{ folding_set::mode_type::single, 0x0178, 0x0178, -121 },
{ folding_set::mode_type::alternating, 0x0179, 0x017D, 1 },
{ folding_set::mode_type::single, 0x017F, 0x017F, -268 },
{ folding_set::mode_type::single, 0x0181, 0x0181, 210 },
{ folding_set::mode_type::alternating, 0x0182, 0x0184, 1 },
{ folding_set::mode_type::single, 0x0186, 0x0186, 206 },
{ folding_set::mode_type::single, 0x0187, 0x0187, 1 },
{ folding_set::mode_type::constant, 0x0189, 0x018A, 205 },
{ folding_set::mode_type::single, 0x018B, 0x018B, 1 },
{ folding_set::mode_type::single, 0x018E, 0x018E, 79 },
{ folding_set::mode_type::single, 0x018F, 0x018F, 202 },
{ folding_set::mode_type::single, 0x0190, 0x0190, 203 },
{ folding_set::mode_type::single, 0x0191, 0x0191, 1 },
{ folding_set::mode_type::single, 0x0193, 0x0193, 205 },
{ folding_set::mode_type::single, 0x0194, 0x0194, 207 },
{ folding_set::mode_type::single, 0x0196, 0x0196, 211 },
{ folding_set::mode_type::single, 0x0197, 0x0197, 209 },
{ folding_set::mode_type::single, 0x0198, 0x0198, 1 },
{ folding_set::mode_type::single, 0x019C, 0x019C, 211 },
{ folding_set::mode_type::single, 0x019D, 0x019D, 213 },
{ folding_set::mode_type::single, 0x019F, 0x019F, 214 },
{ folding_set::mode_type::alternating, 0x01A0, 0x01A4, 1 },
{ folding_set::mode_type::single, 0x01A6, 0x01A6, 218 },
{ folding_set::mode_type::single, 0x01A7, 0x01A7, 1 },
{ folding_set::mode_type::single, 0x01A9, 0x01A9, 218 },
{ folding_set::mode_type::single, 0x01AC, 0x01AC, 1 },
{ folding_set::mode_type::single, 0x01AE, 0x01AE, 218 },
{ folding_set::mode_type::single, 0x01AF, 0x01AF, 1 },
{ folding_set::mode_type::constant, 0x01B1, 0x01B2, 217 },
{ folding_set::mode_type::alternating, 0x01B3, 0x01B5, 1 },
{ folding_set::mode_type::single, 0x01B7, 0x01B7, 219 },
{ folding_set::mode_type::single, 0x01B8, 0x01B8, 1 },
{ folding_set::mode_type::single, 0x01BC, 0x01BC, 1 },
{ folding_set::mode_type::single, 0x01C4, 0x01C4, 2 },
{ folding_set::mode_type::single, 0x01C5, 0x01C5, 1 },
{ folding_set::mode_type::single, 0x01C7, 0x01C7, 2 },
{ folding_set::mode_type::single, 0x01C8, 0x01C8, 1 },
{ folding_set::mode_type::single, 0x01CA, 0x01CA, 2 },
{ folding_set::mode_type::alternating, 0x01CB, 0x01DB, 1 },
{ folding_set::mode_type::alternating, 0x01DE, 0x01EE, 1 },
{ folding_set::mode_type::single, 0x01F1, 0x01F1, 2 },
{ folding_set::mode_type::alternating, 0x01F2, 0x01F4, 1 },
{ folding_set::mode_type::single, 0x01F6, 0x01F6, -97 },
{ folding_set::mode_type::single, 0x01F7, 0x01F7, -56 },
{ folding_set::mode_type::alternating, 0x01F8, 0x021E, 1 },
{ folding_set::mode_type::single, 0x0220, 0x0220, -130 },
{ folding_set::mode_type::alternating, 0x0222, 0x0232, 1 },
{ folding_set::mode_type::single, 0x023A, 0x023A, 10795 },
{ folding_set::mode_type::single, 0x023B, 0x023B, 1 },
{ folding_set::mode_type::single, 0x023D, 0x023D, -163 },
{ folding_set::mode_type::single, 0x023E, 0x023E, 10792 },
{ folding_set::mode_type::single, 0x0241, 0x0241, 1 },
{ folding_set::mode_type::single, 0x0243, 0x0243, -195 },
{ folding_set::mode_type::single, 0x0244, 0x0244, 69 },
{ folding_set::mode_type::single, 0x0245, 0x0245, 71 },
{ folding_set::mode_type::alternating, 0x0246, 0x024E, 1 },
{ folding_set::mode_type::single, 0x0345, 0x0345, 116 },
{ folding_set::mode_type::alternating, 0x0370, 0x0372, 1 },
{ folding_set::mode_type::single, 0x0376, 0x0376, 1 },
{ folding_set::mode_type::single, 0x037F, 0x037F, 116 },
{ folding_set::mode_type::single, 0x0386, 0x0386, 38 },
{ folding_set::mode_type::constant, 0x0388, 0x038A, 37 },
{ folding_set::mode_type::single, 0x038C, 0x038C, 64 },
{ folding_set::mode_type::constant, 0x038E, 0x038F, 63 },
{ folding_set::mode_type::constant, 0x0391, 0x03A1, 32 },
{ folding_set::mode_type::constant, 0x03A3, 0x03AB, 32 },
{ folding_set::mode_type::single, 0x03C2, 0x03C2, 1 },
{ folding_set::mode_type::single, 0x03CF, 0x03CF, 8 },
{ folding_set::mode_type::single, 0x03D0, 0x03D0, -30 },
{ folding_set::mode_type::single, 0x03D1, 0x03D1, -25 },
{ folding_set::mode_type::single, 0x03D5, 0x03D5, -15 },
{ folding_set::mode_type::single, 0x03D6, 0x03D6, -22 },
{ folding_set::mode_type::alternating, 0x03D8, 0x03EE, 1 },
{ folding_set::mode_type::single, 0x03F0, 0x03F0, -54 },
{ folding_set::mode_type::single, 0x03F1, 0x03F1, -48 },
{ folding_set::mode_type::single, 0x03F4, 0x03F4, -60 },
{ folding_set::mode_type::single, 0x03F5, 0x03F5, -64 },
{ folding_set::mode_type::single, 0x03F7, 0x03F7, 1 },
{ folding_set::mode_type::single, 0x03F9, 0x03F9, -7 },
{ folding_set::mode_type::single, 0x03FA, 0x03FA, 1 },
{ folding_set::mode_type::constant, 0x03FD, 0x03FF, -130 },
{ folding_set::mode_type::constant, 0x0400, 0x040F, 80 },
{ folding_set::mode_type::constant, 0x0410, 0x042F, 32 },
{ folding_set::mode_type::alternating, 0x0460, 0x0480, 1 },
{ folding_set::mode_type::alternating, 0x048A, 0x04BE, 1 },
{ folding_set::mode_type::single, 0x04C0, 0x04C0, 15 },
{ folding_set::mode_type::alternating, 0x04C1, 0x04CD, 1 },
{ folding_set::mode_type::alternating, 0x04D0, 0x052E, 1 },
{ folding_set::mode_type::constant, 0x0531, 0x0556, 48 },
{ folding_set::mode_type::constant, 0x10A0, 0x10C5, 7264 },
{ folding_set::mode_type::single, 0x10C7, 0x10C7, 7264 },
{ folding_set::mode_type::single, 0x10CD, 0x10CD, 7264 },
{ folding_set::mode_type::constant, 0x13F8, 0x13FD, -8 },
{ folding_set::mode_type::single, 0x1C80, 0x1C80, -6222 },
{ folding_set::mode_type::single, 0x1C81, 0x1C81, -6221 },
{ folding_set::mode_type::single, 0x1C82, 0x1C82, -6212 },
{ folding_set::mode_type::constant, 0x1C83, 0x1C84, -6210 },
{ folding_set::mode_type::single, 0x1C85, 0x1C85, -6211 },
{ folding_set::mode_type::single, 0x1C86, 0x1C86, -6204 },
{ folding_set::mode_type::single, 0x1C87, 0x1C87, -6180 },
{ folding_set::mode_type::single, 0x1C88, 0x1C88, 35267 },
{ folding_set::mode_type::constant, 0x1C90, 0x1CBA, -3008 },
{ folding_set::mode_type::constant, 0x1CBD, 0x1CBF, -3008 },
{ folding_set::mode_type::alternating, 0x1E00, 0x1E94, 1 },
{ folding_set::mode_type::single, 0x1E9B, 0x1E9B, -58 },
{ folding_set::mode_type::single, 0x1E9E, 0x1E9E, -7615 },
{ folding_set::mode_type::alternating, 0x1EA0, 0x1EFE, 1 },
{ folding_set::mode_type::constant, 0x1F08, 0x1F0F, -8 },
{ folding_set::mode_type::constant, 0x1F18, 0x1F1D, -8 },
{ folding_set::mode_type::constant, 0x1F28, 0x1F2F, -8 },
{ folding_set::mode_type::constant, 0x1F38, 0x1F3F, -8 },
{ folding_set::mode_type::constant, 0x1F48, 0x1F4D, -8 },
{ folding_set::mode_type::alternating, 0x1F59, 0x1F5F, -8 },
{ folding_set::mode_type::constant, 0x1F68, 0x1F6F, -8 },
{ folding_set::mode_type::constant, 0x1F88, 0x1F8F, -8 },
{ folding_set::mode_type::constant, 0x1F98, 0x1F9F, -8 },
{ folding_set::mode_type::constant, 0x1FA8, 0x1FAF, -8 },
{ folding_set::mode_type::constant, 0x1FB8, 0x1FB9, -8 },
{ folding_set::mode_type::constant, 0x1FBA, 0x1FBB, -74 },
{ folding_set::mode_type::single, 0x1FBC, 0x1FBC, -9 },
{ folding_set::mode_type::single, 0x1FBE, 0x1FBE, -7173 },
{ folding_set::mode_type::constant, 0x1FC8, 0x1FCB, -86 },
{ folding_set::mode_type::single, 0x1FCC, 0x1FCC, -9 },
{ folding_set::mode_type::constant, 0x1FD8, 0x1FD9, -8 },
{ folding_set::mode_type::constant, 0x1FDA, 0x1FDB, -100 },
{ folding_set::mode_type::constant, 0x1FE8, 0x1FE9, -8 },
{ folding_set::mode_type::constant, 0x1FEA, 0x1FEB, -112 },
{ folding_set::mode_type::single, 0x1FEC, 0x1FEC, -7 },
{ folding_set::mode_type::constant, 0x1FF8, 0x1FF9, -128 },
{ folding_set::mode_type::constant, 0x1FFA, 0x1FFB, -126 },
{ folding_set::mode_type::single, 0x1FFC, 0x1FFC, -9 },
{ folding_set::mode_type::single, 0x2126, 0x2126, -7517 },
{ folding_set::mode_type::single, 0x212A, 0x212A, -8383 },
{ folding_set::mode_type::single, 0x212B, 0x212B, -8262 },
{ folding_set::mode_type::single, 0x2132, 0x2132, 28 },
{ folding_set::mode_type::constant, 0x2160, 0x216F, 16 },
{ folding_set::mode_type::single, 0x2183, 0x2183, 1 },
{ folding_set::mode_type::constant, 0x24B6, 0x24CF, 26 },
{ folding_set::mode_type::constant, 0x2C00, 0x2C2F, 48 },
{ folding_set::mode_type::single, 0x2C60, 0x2C60, 1 },
{ folding_set::mode_type::single, 0x2C62, 0x2C62, -10743 },
{ folding_set::mode_type::single, 0x2C63, 0x2C63, -3814 },
{ folding_set::mode_type::single, 0x2C64, 0x2C64, -10727 },
{ folding_set::mode_type::alternating, 0x2C67, 0x2C6B, 1 },
{ folding_set::mode_type::single, 0x2C6D, 0x2C6D, -10780 },
{ folding_set::mode_type::single, 0x2C6E, 0x2C6E, -10749 },
{ folding_set::mode_type::single, 0x2C6F, 0x2C6F, -10783 },
{ folding_set::mode_type::single, 0x2C70, 0x2C70, -10782 },
{ folding_set::mode_type::single, 0x2C72, 0x2C72, 1 },
{ folding_set::mode_type::single, 0x2C75, 0x2C75, 1 },
{ folding_set::mode_type::constant, 0x2C7E, 0x2C7F, -10815 },
{ folding_set::mode_type::alternating, 0x2C80, 0x2CE2, 1 },
{ folding_set::mode_type::alternating, 0x2CEB, 0x2CED, 1 },
{ folding_set::mode_type::single, 0x2CF2, 0x2CF2, 1 },
{ folding_set::mode_type::alternating, 0xA640, 0xA66C, 1 },
{ folding_set::mode_type::alternating, 0xA680, 0xA69A, 1 },
{ folding_set::mode_type::alternating, 0xA722, 0xA72E, 1 },
{ folding_set::mode_type::alternating, 0xA732, 0xA76E, 1 },
{ folding_set::mode_type::alternating, 0xA779, 0xA77B, 1 },
{ folding_set::mode_type::single, 0xA77D, 0xA77D, -35332 },
{ folding_set::mode_type::alternating, 0xA77E, 0xA786, 1 },
{ folding_set::mode_type::single, 0xA78B, 0xA78B, 1 },
{ folding_set::mode_type::single, 0xA78D, 0xA78D, -42280 },
{ folding_set::mode_type::alternating, 0xA790, 0xA792, 1 },
{ folding_set::mode_type::alternating, 0xA796, 0xA7A8, 1 },
{ folding_set::mode_type::single, 0xA7AA, 0xA7AA, -42308 },
{ folding_set::mode_type::single, 0xA7AB, 0xA7AB, -42319 },
{ folding_set::mode_type::single, 0xA7AC, 0xA7AC, -42315 },
{ folding_set::mode_type::single, 0xA7AD, 0xA7AD, -42305 },
{ folding_set::mode_type::single, 0xA7AE, 0xA7AE, -42308 },
{ folding_set::mode_type::single, 0xA7B0, 0xA7B0, -42258 },
{ folding_set::mode_type::single, 0xA7B1, 0xA7B1, -42282 },
{ folding_set::mode_type::single, 0xA7B2, 0xA7B2, -42261 },
{ folding_set::mode_type::single, 0xA7B3, 0xA7B3, 928 },
{ folding_set::mode_type::alternating, 0xA7B4, 0xA7C2, 1 },
{ folding_set::mode_type::single, 0xA7C4, 0xA7C4, -48 },
{ folding_set::mode_type::single, 0xA7C5, 0xA7C5, -42307 },
{ folding_set::mode_type::single, 0xA7C6, 0xA7C6, -35384 },
{ folding_set::mode_type::alternating, 0xA7C7, 0xA7C9, 1 },
{ folding_set::mode_type::single, 0xA7D0, 0xA7D0, 1 },
{ folding_set::mode_type::alternating, 0xA7D6, 0xA7D8, 1 },
{ folding_set::mode_type::single, 0xA7F5, 0xA7F5, 1 },
{ folding_set::mode_type::constant, 0xAB70, 0xABBF, -38864 },
{ folding_set::mode_type::constant, 0xFF21, 0xFF3A, 32 },
{ folding_set::mode_type::constant, 0x10400, 0x10427, 40 },
{ folding_set::mode_type::constant, 0x104B0, 0x104D3, 40 },
{ folding_set::mode_type::constant, 0x10570, 0x1057A, 39 },
{ folding_set::mode_type::constant, 0x1057C, 0x1058A, 39 },
{ folding_set::mode_type::constant, 0x1058C, 0x10592, 39 },
{ folding_set::mode_type::constant, 0x10594, 0x10595, 39 },
{ folding_set::mode_type::constant, 0x10C80, 0x10CB2, 64 },
{ folding_set::mode_type::constant, 0x118A0, 0x118BF, 32 },
{ folding_set::mode_type::constant, 0x16E40, 0x16E5F, 32 },
{ folding_set::mode_type::constant, 0x1E900, 0x1E921, 34 },
};
const folding_set* folding_sets_end = std::end(s_folding_sets);
// Binary search for the first set whose range_end is >= in_codepoint (see operator<(folding_set, uint32_t))
auto match = std::lower_bound(s_folding_sets, folding_sets_end, in_codepoint);
// The candidate only contains in_codepoint if in_codepoint is not below its range_start
if (match == folding_sets_end
|| in_codepoint < *match) {
// in_codepoint doesn't fit into any set;
return in_codepoint;
}
return match->fold(in_codepoint);
}
} // namespace jessilib } // namespace jessilib

374
src/include/jessilib/unicode.hpp

@ -26,14 +26,36 @@ namespace jessilib {
/** encode_codepoint */ /** encode_codepoint */
size_t encode_codepoint(std::string& out_string, char32_t in_codepoint); // ASSUMES UTF-8 /**
* Encodes a codepoint, and appends it to an output string
*
* @param out_string String to append
* @param in_codepoint Codepoint to encode
* @return Number of data elements appended to out_string
*/
size_t encode_codepoint(std::string& out_string, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8
size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint); size_t encode_codepoint(std::u8string& out_string, char32_t in_codepoint);
size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint); size_t encode_codepoint(std::u16string& out_string, char32_t in_codepoint);
size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint); size_t encode_codepoint(std::u32string& out_string, char32_t in_codepoint);
size_t encode_codepoint(std::basic_ostream<char>& out_stream, char32_t in_codepoint); // ASSUMES UTF-8
/**
* Encodes a codepoint to an output stream
*
* @param out_stream Stream to write codepoint to
* @param in_codepoint Codepoint to encode
* @return Number of data elements appended to out_stream
*/
size_t encode_codepoint(std::basic_ostream<char>& out_stream, char32_t in_codepoint); // DEPRECATED: ASSUMES UTF-8
size_t encode_codepoint(std::basic_ostream<char8_t>& out_stream, char32_t in_codepoint); size_t encode_codepoint(std::basic_ostream<char8_t>& out_stream, char32_t in_codepoint);
size_t encode_codepoint(std::basic_ostream<char16_t>& out_stream, char32_t in_codepoint); size_t encode_codepoint(std::basic_ostream<char16_t>& out_stream, char32_t in_codepoint);
size_t encode_codepoint(std::basic_ostream<char32_t>& out_stream, char32_t in_codepoint); size_t encode_codepoint(std::basic_ostream<char32_t>& out_stream, char32_t in_codepoint);
/**
* Encodes a codepoint and returns it as a string
*
* @param in_codepoint Codepoint to encode
* @return A string containing the codepoint encoded to the appropriate underlying CharT type
*/
std::u8string encode_codepoint_u8(char32_t in_codepoint); std::u8string encode_codepoint_u8(char32_t in_codepoint);
std::u16string encode_codepoint_u16(char32_t in_codepoint); std::u16string encode_codepoint_u16(char32_t in_codepoint);
std::u32string encode_codepoint_u32(char32_t in_codepoint); std::u32string encode_codepoint_u32(char32_t in_codepoint);
@ -41,11 +63,17 @@ std::u32string encode_codepoint_u32(char32_t in_codepoint);
/** decode_codepoint */ /** decode_codepoint */
struct get_endpoint_result { struct get_endpoint_result {
char32_t codepoint{}; char32_t codepoint{}; // Codepoint
size_t units{}; size_t units{}; // Number of data units codepoint was represented by, or 0
}; };
get_endpoint_result decode_codepoint(const std::string_view& in_string); // ASSUMES UTF-8 /**
* Decodes the front codepoint in a string
*
* @param in_string String to decode a codepoint from
* @return A struct containing a valid codepoint and the number of representative data units on success, zero otherwise.
*/
get_endpoint_result decode_codepoint(const std::string_view& in_string); // DEPRECATED: ASSUMES UTF-8
get_endpoint_result decode_codepoint(const std::u8string_view& in_string); // UTF-8 get_endpoint_result decode_codepoint(const std::u8string_view& in_string); // UTF-8
get_endpoint_result decode_codepoint(const std::u16string_view& in_string); // UTF-16 get_endpoint_result decode_codepoint(const std::u16string_view& in_string); // UTF-16
get_endpoint_result decode_codepoint(const std::u32string_view& in_string); // UTF-32 get_endpoint_result decode_codepoint(const std::u32string_view& in_string); // UTF-32
@ -79,4 +107,340 @@ bool is_high_surrogate(char32_t in_codepoint);
bool is_low_surrogate(char32_t in_codepoint); bool is_low_surrogate(char32_t in_codepoint);
get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate); get_endpoint_result decode_surrogate_pair(char16_t in_high_surrogate, char16_t in_low_surrogate);
/** Utilities */
template<typename InT>
bool is_valid(const InT& in_string) {
using InCharT = typename InT::value_type;
using InViewT = std::basic_string_view<InCharT>;
InViewT in_string_view = static_cast<InViewT>(in_string);
while (!in_string_view.empty()) {
get_endpoint_result string_front = decode_codepoint(in_string_view);
if (string_front.units == 0) {
return false;
}
in_string_view.remove_prefix(string_front.units);
}
return true;
}
/**
 * Reinterprets the storage of one string type as a view of another character type
 * (i.e: data read from an I/O stream or buffer, char -> whatever)
 * NOTE: THIS DOES NOT GUARANTEE THE RESULTING VIEW IS SANE, ONLY THAT IT'S A TECHNICALLY VALID SEQUENCE
 * NOTE(review): the data pointer is reinterpret_cast as-is; presumably callers guarantee it is suitably
 * aligned for OutCharT -- confirm
 *
 * @tparam OutCharT Output view character type
 * @tparam InT Input string type
 * @param in_string Container holding string data
 * @return A valid view into in_string on success, an empty string_view otherwise
 */
template<typename OutCharT, typename InT>
std::basic_string_view<OutCharT> string_view_cast(const InT& in_string) {
	using InCharT = typename InT::value_type;
	using OutViewT = std::basic_string_view<OutCharT>;

	const size_t total_bytes = in_string.size() * sizeof(InCharT);

	if constexpr (sizeof(OutCharT) > sizeof(InCharT)) {
		// Widening the unit type: the byte count has to divide evenly into whole output units
		if (total_bytes % sizeof(OutCharT) != 0) {
			return OutViewT{};
		}
	}

	const OutViewT candidate{
		reinterpret_cast<const OutCharT*>(in_string.data()),
		total_bytes / sizeof(OutCharT) };

	// Only hand back the view if it decodes cleanly from start to finish
	return is_valid(candidate) ? candidate : OutViewT{};
}
/**
 * Converts a string from one Unicode encoding to another by decoding and re-encoding each codepoint
 *
 * @tparam OutCharT Character type of the resulting string
 * @tparam InT Input string or string view type
 * @param in_string String to convert
 * @return in_string re-encoded as a basic_string<OutCharT>, or an empty string if in_string fails to decode
 */
template<typename OutCharT, typename InT>
std::basic_string<OutCharT> string_cast(const InT& in_string) {
	using InCharT = typename InT::value_type;
	using InViewT = std::basic_string_view<InCharT>;
	std::basic_string<OutCharT> result;

	// Just do a dumb copy when same type & valid; should be slightly faster than re-encoding
	if constexpr (std::is_same_v<OutCharT, InCharT>) {
		if (is_valid(in_string)) {
			result = in_string;
		}

		return result;
	}

	InViewT in_string_view = static_cast<InViewT>(in_string);
	if constexpr (sizeof(InCharT) <= sizeof(OutCharT)) {
		// When copying to a larger type, we will need _at most_ as many elements as the smaller storage type
		result.reserve(in_string_view.size());
	}
	else {
		// When copying to a smaller type, scale the estimate up by the unit size ratio. The ratio must be
		// InCharT / OutCharT here; the inverse is always 0 in integer math and would reserve nothing.
		result.reserve(in_string_view.size() * (sizeof(InCharT) / sizeof(OutCharT)));
	}

	while (!in_string_view.empty()) {
		get_endpoint_result string_front = decode_codepoint(in_string_view);
		if (string_front.units == 0) {
			// Bad unicode sequence; discard any partial result
			return {};
		}
		in_string_view.remove_prefix(string_front.units);
		encode_codepoint(result, string_front.codepoint);
	}

	return result;
}
/** single-unit case folding utilities */
char32_t fold(char32_t in_codepoint); // Folds codepoint for case insensitive checks (not for human output)
/**
 * Checks if two codepoints are equal to each other (case insensitive)
 *
 * @param lhs First codepoint to compare
 * @param rhs Second codepoint to compare
 * @return True if the codepoints are identical or fold to the same value, false otherwise
 */
inline bool equalsi(char32_t lhs, char32_t rhs) {
	// Identical codepoints never need a folding lookup
	if (lhs == rhs) {
		return true;
	}

	return fold(lhs) == fold(rhs);
}
// Should just make these methods container-type agnostic rather than this mess...
// Generates the overloads of `method` taking std::basic_string on either or both sides, forwarding to the
// std::basic_string_view overload. Every adapter deduces its return type so it returns exactly what `method`
// returns (i.e: size_t for starts_with_length, rather than collapsing the length to a bool).
#define ADAPT_BASIC_STRING(method) \
	template<typename LhsCharT, typename RhsCharT> \
	auto method(const std::basic_string<LhsCharT>& lhs, std::basic_string_view<RhsCharT> rhs) { \
		return method(static_cast<std::basic_string_view<LhsCharT>>(lhs), rhs); } \
	template<typename LhsCharT, typename RhsCharT> \
	auto method(std::basic_string_view<LhsCharT> lhs, const std::basic_string<RhsCharT>& rhs) { \
		return method(lhs, static_cast<std::basic_string_view<RhsCharT>>(rhs)); } \
	template<typename LhsCharT, typename RhsCharT> \
	auto method(const std::basic_string<LhsCharT>& lhs, const std::basic_string<RhsCharT>& rhs) { \
		return method(static_cast<std::basic_string_view<LhsCharT>>(lhs), static_cast<std::basic_string_view<RhsCharT>>(rhs)); }
/**
 * Checks if two strings are equal
 *
 * @tparam LhsCharT Unicode codepoint container type for left-hand parameter
 * @tparam RhsCharT Unicode codepoint container type for right-hand parameter
 * @param lhs First string to compare
 * @param rhs Second string to compare against
 * @return True if the strings are equal, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool equals(std::basic_string_view<LhsCharT> lhs, std::basic_string_view<RhsCharT> rhs) {
	// Same character type on both sides: compare the views directly, no decoding involved
	if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
		return lhs == rhs;
	}

	// Differing character types: compare one decoded codepoint at a time
	while (true) {
		if (lhs.empty() || rhs.empty()) {
			// Equal only when both sides ran out together
			return lhs.empty() && rhs.empty();
		}

		const auto left = decode_codepoint(lhs);
		const auto right = decode_codepoint(rhs);
		const bool both_decoded = left.units != 0 && right.units != 0;
		if (!both_decoded) {
			// Bad unicode sequence on at least one side
			return false;
		}

		if (left.codepoint != right.codepoint) {
			return false;
		}

		// Matching codepoints; drop them from the front of both views
		lhs.remove_prefix(left.units);
		rhs.remove_prefix(right.units);
	}
}
ADAPT_BASIC_STRING(equals)
/**
 * Checks if two strings are equal (case insensitive)
 *
 * @tparam LhsCharT Unicode codepoint container type for left-hand parameter
 * @tparam RhsCharT Unicode codepoint container type for right-hand parameter
 * @param lhs First string to compare
 * @param rhs Second string to compare against
 * @return True if both strings decode successfully and are equal ignoring case, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool equalsi(std::basic_string_view<LhsCharT> lhs, std::basic_string_view<RhsCharT> rhs) {
	// A size mismatch only proves inequality when every codepoint occupies exactly one unit (UTF-32).
	// In variable-width encodings, codepoints that fold to the same value can have different encoded
	// lengths (i.e: U+212A KELVIN SIGN is 3 UTF-8 units but folds to 'k', which is 1), so no shortcut there.
	if constexpr (std::is_same_v<LhsCharT, RhsCharT> && std::is_same_v<LhsCharT, char32_t>) {
		if (lhs.size() != rhs.size()) {
			return false;
		}
	}

	while (!lhs.empty() && !rhs.empty()) {
		auto lhs_front = decode_codepoint(lhs);
		auto rhs_front = decode_codepoint(rhs);

		if (lhs_front.units == 0
			|| rhs_front.units == 0) {
			// Failed to decode front codepoint; bad unicode sequence
			return false;
		}

		if (!equalsi(lhs_front.codepoint, rhs_front.codepoint)) {
			// Codepoints don't fold to same value
			return false;
		}

		// Codepoints are equal; trim off the fronts and continue
		lhs.remove_prefix(lhs_front.units);
		rhs.remove_prefix(rhs_front.units);
	}

	// Equal only if both strings were fully consumed
	return lhs.empty() && rhs.empty();
}
ADAPT_BASIC_STRING(equalsi)
/**
* Checks if a string starts with a substring
*
* @tparam LhsCharT Unicode codepoint container type for underlying string
* @tparam RhsCharT Unicode codepoint container type for prefix string
* @param in_string String to check for prefix
* @param in_prefix Substring prefix to check for
* @return Data length of in_prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise
*/
template<typename LhsCharT, typename RhsCharT>
size_t starts_with_length(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
// If in_string and in_prefix are the same type, compare their sizes and quickly return if in_string is too small
if constexpr (std::is_same_v<LhsCharT, RhsCharT>) {
if (in_string.size() < in_prefix.size()) {
return 0;
}
}
size_t codepoints_removed{};
while (!in_string.empty() && !in_prefix.empty()) {
get_endpoint_result string_front = decode_codepoint(in_string);
get_endpoint_result prefix_front = decode_codepoint(in_prefix);
if (string_front.units == 0
|| prefix_front.units == 0) {
// Failed to decode front codepoint; bad unicode sequence
return 0;
}
if (string_front.codepoint != prefix_front.codepoint) {
// Codepoints aren't the same
return 0;
}
// Codepoints are equal; trim off the fronts and continue
in_string.remove_prefix(string_front.units);
in_prefix.remove_prefix(prefix_front.units);
codepoints_removed += string_front.units;
}
if (!in_prefix.empty()) {
// We reached end of in_string before end of prefix
return 0;
}
return codepoints_removed;
}
ADAPT_BASIC_STRING(starts_with_length)
/**
 * Checks if a string starts with a substring (case insensitive)
 *
 * @tparam LhsCharT Unicode codepoint container type for underlying string
 * @tparam RhsCharT Unicode codepoint container type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Substring prefix to check for
 * @return Data length of the matched prefix in terms of LhsCharT if in_string starts with in_prefix, 0 otherwise
 *         (an empty in_prefix also yields 0)
 */
template<typename LhsCharT, typename RhsCharT>
size_t starts_with_lengthi(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	// Comparing unit counts is only meaningful when every codepoint occupies exactly one unit (UTF-32).
	// In variable-width encodings a prefix can hold more units than the string and still match once folded
	// (i.e: U+212A KELVIN SIGN is 3 UTF-8 units but folds to 'k', which is 1), so no shortcut there.
	if constexpr (std::is_same_v<LhsCharT, RhsCharT> && std::is_same_v<LhsCharT, char32_t>) {
		if (in_string.size() < in_prefix.size()) {
			return 0;
		}
	}

	size_t units_removed{}; // number of LhsCharT units of in_string matched so far
	while (!in_string.empty() && !in_prefix.empty()) {
		get_endpoint_result string_front = decode_codepoint(in_string);
		get_endpoint_result prefix_front = decode_codepoint(in_prefix);

		if (string_front.units == 0
			|| prefix_front.units == 0) {
			// Failed to decode front codepoint; bad unicode sequence
			return 0;
		}

		if (!equalsi(string_front.codepoint, prefix_front.codepoint)) {
			// Codepoints don't fold to same value
			return 0;
		}

		// Codepoints are equal; trim off the fronts and continue
		in_string.remove_prefix(string_front.units);
		in_prefix.remove_prefix(prefix_front.units);
		units_removed += string_front.units;
	}

	if (!in_prefix.empty()) {
		// We reached end of in_string before end of prefix
		return 0;
	}

	return units_removed;
}
ADAPT_BASIC_STRING(starts_with_lengthi)
/**
 * Checks if a string starts with a substring
 *
 * @tparam LhsCharT Unicode codepoint container type for underlying string
 * @tparam RhsCharT Unicode codepoint container type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Prefix to check for
 * @return True if starts_with_length reports a non-zero match length for in_prefix, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool starts_with(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	const size_t matched_units = starts_with_length<LhsCharT, RhsCharT>(in_string, in_prefix);
	return matched_units != 0;
}
ADAPT_BASIC_STRING(starts_with)
/**
 * Checks if a string starts with a substring (case insensitive)
 *
 * @tparam LhsCharT Unicode codepoint container type for underlying string
 * @tparam RhsCharT Unicode codepoint container type for prefix string
 * @param in_string String to check for prefix
 * @param in_prefix Prefix to check for
 * @return True if starts_with_lengthi reports a non-zero match length for in_prefix, false otherwise
 */
template<typename LhsCharT, typename RhsCharT>
bool starts_withi(std::basic_string_view<LhsCharT> in_string, std::basic_string_view<RhsCharT> in_prefix) {
	const size_t matched_units = starts_with_lengthi<LhsCharT, RhsCharT>(in_string, in_prefix);
	return matched_units != 0;
}
ADAPT_BASIC_STRING(starts_withi)
/** to_lower / to_upper */
//char32_t to_lower(char32_t in_chr); // TODO: implement
//char32_t to_upper(char32_t in_chr); // TODO: implement
} // namespace jessilib } // namespace jessilib

2
src/include/jessilib/word_split.hpp

@ -301,7 +301,7 @@ constexpr auto word_split_once(ItrT begin, EndT end, SpaceItrT in_whitespace_beg
} }
for (auto itr = begin; itr < end;) { for (auto itr = begin; itr < end;) {
if (is_whitespace(itr)) { if (is_whitespace(*itr)) {
// in_whitespace found; word_split upon it // in_whitespace found; word_split upon it
result.first = make_word_split_member<MemberT>(begin, itr); result.first = make_word_split_member<MemberT>(begin, itr);

5
src/test/CMakeLists.txt

@ -16,3 +16,8 @@ target_include_directories(jessilib_tests PRIVATE .)
# Link with gtest # Link with gtest
target_link_libraries(jessilib_tests gtest gtest_main jessilib) target_link_libraries(jessilib_tests gtest gtest_main jessilib)
# Define JESSITEST_SRC_DIR and JESSITEST_DATA_DIR for the jessilib_tests target, pointing at this
# directory and its data/ subdirectory respectively (both with a trailing slash)
target_compile_definitions(jessilib_tests PRIVATE
JESSITEST_SRC_DIR="${CMAKE_CURRENT_SOURCE_DIR}/"
JESSITEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/data/")

1624
src/test/data/CaseFolding.txt

File diff suppressed because it is too large

351
src/test/unicode.cpp

@ -17,6 +17,9 @@
*/ */
#include "jessilib/unicode.hpp" #include "jessilib/unicode.hpp"
#include <fstream>
#include <charconv>
#include "jessilib/split.hpp"
#include "test.hpp" #include "test.hpp"
using namespace jessilib; using namespace jessilib;
@ -118,3 +121,351 @@ TEST(UTF32Test, decode_codepoint) {
DECODE_CODEPOINT_TEST(U"\U0010FFFF"sv, U'\U0010FFFF', 1U); DECODE_CODEPOINT_TEST(U"\U0010FFFF"sv, U'\U0010FFFF', 1U);
DECODE_CODEPOINT_TEST(U"\U0001F604"sv, U'\U0001F604', 1U); DECODE_CODEPOINT_TEST(U"\U0001F604"sv, U'\U0001F604', 1U);
} }
// Every character type the typed tests are run against
using char_types = ::testing::Types<char, char8_t, char16_t, char32_t>;
// Every ordered (first_type, second_type) pairing of the character types above, including same-type pairs
using char_type_combos = ::testing::Types<
std::pair<char, char>, std::pair<char, char8_t>, std::pair<char, char16_t>, std::pair<char, char32_t>,
std::pair<char8_t, char>, std::pair<char8_t, char8_t>, std::pair<char8_t, char16_t>, std::pair<char8_t, char32_t>,
std::pair<char16_t, char>, std::pair<char16_t, char8_t>, std::pair<char16_t, char16_t>, std::pair<char16_t, char32_t>,
std::pair<char32_t, char>, std::pair<char32_t, char8_t>, std::pair<char32_t, char16_t>, std::pair<char32_t, char32_t>>;
// Typed fixture instantiated once per character type pairing in char_type_combos
template<typename T>
class UnicodeAbcdTest : public ::testing::Test {};

TYPED_TEST_SUITE(UnicodeAbcdTest, char_type_combos);
/**
 * Builds a string of the requested character type from a UTF-32 literal
 *
 * @tparam CharT Character type of the string to build
 * @tparam InLength Length of the literal's array, including its null terminator
 * @param in_str UTF-32 literal to encode
 * @return in_str (minus its null terminator) encoded into a basic_string<CharT>
 */
template<typename CharT, size_t InLength>
std::basic_string<CharT> make_str(const char32_t (&in_str)[InLength]) {
	std::basic_string<CharT> encoded;

	// Stop one short of the array length so the null terminator is not encoded
	for (size_t index = 0; index + 1 < InLength; ++index) {
		jessilib::encode_codepoint(encoded, in_str[index]);
	}

	return encoded;
}
/** string_cast */

// Round-trips "ABCD" from first_type to second_type, from both an owning string and a view of it
TYPED_TEST(UnicodeAbcdTest, string_cast) {
	using SourceCharT = typename TypeParam::first_type;
	using TargetCharT = typename TypeParam::second_type;

	const std::basic_string<SourceCharT> abcd_str = make_str<SourceCharT>(U"ABCD");
	const std::basic_string_view<SourceCharT> abcd_string_view = abcd_str;

	// Owning string as input
	EXPECT_TRUE(equals(abcd_str, string_cast<TargetCharT>(abcd_str)));
	EXPECT_TRUE(is_valid(abcd_str));

	// View as input
	EXPECT_TRUE(equals(abcd_string_view, string_cast<TargetCharT>(abcd_string_view)));
	EXPECT_TRUE(is_valid(abcd_string_view));
}
// Viewing char8_t storage as char must compare equal to the original string
TEST(UTF8Test, string_view_cast) {
	const std::u8string source = make_str<char8_t>(U"ABCD");
	const std::string_view char_view = string_view_cast<char>(source);

	EXPECT_TRUE(equals(char_view, source));
}
/** equals */
// equals() is case-sensitive: across every pairing of character types, only identical text compares equal
TYPED_TEST(UnicodeAbcdTest, equals) {
// Both sides hold four-letter text; the left side is encoded as first_type, the right side as second_type
// Identical text: equal
EXPECT_TRUE(equals(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(equals(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"abcd")));
// Text differing only by letter case: not equal
EXPECT_FALSE(equals(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_FALSE(equals(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_FALSE(equals(make_str<typename TypeParam::first_type>(U"ABcd"),
make_str<typename TypeParam::second_type>(U"abCD")));
}
/** equalsi */
// equalsi() is case-insensitive: across every pairing of character types, text differing only by letter case
// compares equal
TYPED_TEST(UnicodeAbcdTest, equalsi) {
// Both sides hold four-letter text; the left side is encoded as first_type, the right side as second_type
// Identical text: equal
EXPECT_TRUE(equalsi(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(equalsi(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"abcd")));
// Text differing only by letter case: still equal
EXPECT_TRUE(equalsi(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_TRUE(equalsi(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(equalsi(make_str<typename TypeParam::first_type>(U"ABcd"),
make_str<typename TypeParam::second_type>(U"abCD")));
}
/** starts_with */
// starts_with(string, prefix) is case-sensitive; the string is encoded as first_type and the prefix as second_type
TYPED_TEST(UnicodeAbcdTest, starts_with) {
// String and prefix of equal length: a match only when the text is identical, including letter case
EXPECT_TRUE(starts_with(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(starts_with(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABcd"),
make_str<typename TypeParam::second_type>(U"abCD")));
// Prefix longer than the string: never a match
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"ABCDzz")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"abcdzz")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"abcdzz")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"ABCDzz")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABcd"),
make_str<typename TypeParam::second_type>(U"abCDzz")));
// String longer than the prefix: same outcomes as the equal-length cases above
EXPECT_TRUE(starts_with(make_str<typename TypeParam::first_type>(U"ABCDzz"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(starts_with(make_str<typename TypeParam::first_type>(U"abcdzz"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABCDzz"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"abcdzz"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_FALSE(starts_with(make_str<typename TypeParam::first_type>(U"ABcdzz"),
make_str<typename TypeParam::second_type>(U"abCD")));
}
/** starts_withi */
// starts_withi(string, prefix) is the case-insensitive counterpart of starts_with; the string is encoded as
// first_type and the prefix as second_type
TYPED_TEST(UnicodeAbcdTest, starts_withi) {
// String and prefix of equal length: a match regardless of letter case
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"ABcd"),
make_str<typename TypeParam::second_type>(U"abCD")));
// Prefix longer than the string: never a match, whatever the letter case
EXPECT_FALSE(starts_withi(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"ABCDzz")));
EXPECT_FALSE(starts_withi(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"abcdzz")));
EXPECT_FALSE(starts_withi(make_str<typename TypeParam::first_type>(U"ABCD"),
make_str<typename TypeParam::second_type>(U"abcdzz")));
EXPECT_FALSE(starts_withi(make_str<typename TypeParam::first_type>(U"abcd"),
make_str<typename TypeParam::second_type>(U"ABCDzz")));
EXPECT_FALSE(starts_withi(make_str<typename TypeParam::first_type>(U"ABcd"),
make_str<typename TypeParam::second_type>(U"abCDzz")));
// String longer than the prefix: same outcomes as the equal-length cases above (always a match)
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"ABCDzz"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"abcdzz"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"ABCDzz"),
make_str<typename TypeParam::second_type>(U"abcd")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"abcdzz"),
make_str<typename TypeParam::second_type>(U"ABCD")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"ABcdzz"),
make_str<typename TypeParam::second_type>(U"abCD")));
// Multi-word text: a prefix matches only at the very beginning of the string, not further in
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"Les Bean del Dallas"),
make_str<typename TypeParam::second_type>(U"les")));
EXPECT_TRUE(starts_withi(make_str<typename TypeParam::first_type>(U"Les Bean del Dallas"),
make_str<typename TypeParam::second_type>(U"les Bean")));
EXPECT_FALSE(starts_withi(make_str<typename TypeParam::first_type>(U"Les Bean del Dallas"),
make_str<typename TypeParam::second_type>(U"del")));
}
/**
* Folding test
*/
/** A single case-folding mapping: in_codepoint folds to out_codepoint */
struct folding_info {
	uint32_t in_codepoint;
	uint32_t out_codepoint;
};

// Orderings between a mapping and a bare codepoint, keyed on in_codepoint only. Having both directions lets a range
// of folding_info sorted by in_codepoint be searched by codepoint (e.g. with std::binary_search).
constexpr bool operator<(const folding_info& lhs, uint32_t rhs) {
	return rhs > lhs.in_codepoint;
}

constexpr bool operator<(uint32_t lhs, const folding_info& rhs) {
	return rhs.in_codepoint > lhs;
}
/**
 * Reads the simple case folding mappings out of a CaseFolding.txt data file
 *
 * Only mappings with status C or S are kept; per the usage notes in CaseFolding.txt:
 * "A. To do a simple case folding, use the mappings with status C + S."
 * "B. To do a full case folding, use the mappings with status C + F."
 *
 * @param filename Path of the CaseFolding.txt file to read
 * @return One folding_info per C/S mapping whose codepoints parsed, in file order; empty if nothing could be read
 */
// TODO: make this just download from unicode.org (https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt)
std::vector<folding_info> read_CaseFolding_txt(const char* filename = JESSITEST_DATA_DIR "CaseFolding.txt") {
	// Parses a hexadecimal codepoint field into out_value; returns false if the field does not hold a usable number
	auto parse_codepoint = [](const auto& in_field, uint32_t& out_value) {
		const char* field_begin = in_field.data();
		return std::from_chars(field_begin, field_begin + in_field.size(), out_value, 16).ec == std::errc{};
	};

	std::vector<folding_info> result;
	std::ifstream file{ filename };
	std::string line;

	// Looping on getline itself stops as soon as a read fails, so no stale/empty line is processed at end of file
	while (std::getline(file, line)) {
		auto split_line = jessilib::split(line, "; "sv);
		if (split_line.size() < 3) {
			// Comment, blank line, or anything else lacking the "code; status; mapping" fields
			continue;
		}

		if (split_line[1] == "C" || split_line[1] == "S") {
			folding_info info{};

			// Never record a mapping whose codepoints failed to parse; it would only poison the results
			if (parse_codepoint(split_line[0], info.in_codepoint)
				&& parse_codepoint(split_line[2], info.out_codepoint)) {
				result.push_back(info);
			}
		}
		//else {
		//	std::cout << "ignoring: " << line << std::endl;
		//}
	}

	return result;
}
/**
 * Describes a contiguous range of codepoints that all fold by the same difference
 */
struct folding_set {
	enum class mode_type {
		constant,
		alternating,
		single
	};

	mode_type mode = mode_type::single;
	uint32_t range_start; // inclusive
	uint32_t range_end; // inclusive
	int64_t diff; // difference between two codepoints

	/**
	 * Spells out the mode as the name of its enumerator
	 *
	 * @return "constant", "alternating" or "single"; nullptr if mode holds no known enumerator
	 */
	const char* type() const {
		if (mode == mode_type::constant) {
			return "constant";
		}

		if (mode == mode_type::alternating) {
			return "alternating";
		}

		if (mode == mode_type::single) {
			return "single";
		}

		return nullptr;
	}
};
std::vector<folding_set> folding_sets_from_folding_info(const std::vector<folding_info>& in_info) {
std::vector<folding_set> result;
folding_set current{};
uint32_t last_match;
for (auto& info : in_info) {
int64_t diff = static_cast<int64_t>(info.out_codepoint) - static_cast<int64_t>(info.in_codepoint);
if (current.diff == 0) {
// this is a new set; set range_start and diff
current.range_start = info.in_codepoint;
current.diff = diff;
last_match = info.in_codepoint;
continue;
}
if (current.mode == folding_set::mode_type::single) {
// We don't know what mode we are yet; check if this is constant
if (diff == current.diff
&& info.in_codepoint == last_match + 1) {
// diff is same for range_start + 1; must be a constant-diff continuous set
current.mode = folding_set::mode_type::constant;
last_match = info.in_codepoint;
continue;
}
// Either one-off fold, or alternating set. Check if in_codepoint is last_match + 2
if (diff == current.diff
&& info.in_codepoint == last_match + 2) {
// Looks like an alternating set
current.mode = folding_set::mode_type::alternating;
last_match = info.in_codepoint;
continue;
}
}
if (current.mode == folding_set::mode_type::constant // For constant offset diffs to continue...
&& current.diff == diff // Diff must be the same
&& last_match + 1 == info.in_codepoint) { // And current codepoint must be 1 higher than previous
// This matches the current range pattern; just update last_match and carry on
last_match = info.in_codepoint;
continue;
}
if (current.mode == folding_set::mode_type::alternating // For alternating sets to continue...
&& current.diff == diff // Diff must be same
&& last_match + 2 == info.in_codepoint) { // And current codepoint must be 2 higher than previous
last_match = info.in_codepoint;
continue;
}
// This isn't a continuation of the current set; push current and start a new set
current.range_end = last_match;
result.push_back(current);
current = folding_set{};
current.range_start = info.in_codepoint;
current.diff = diff;
last_match = info.in_codepoint;
}
// Push final range
if (current.diff != 0) {
current.range_end = last_match;
result.push_back(current);
}
return result;
}
// Checks jessilib::fold and jessilib::equalsi against the simple case folding mappings read from CaseFolding.txt.
// If any mapping disagrees, the test also prints a replacement fold table generated from the data file.
TEST(jess, unicode_case_folding_txt) {
auto folded_codepoints = read_CaseFolding_txt();
// An empty result means the data file was missing or unreadable; nothing below would be meaningful
ASSERT_FALSE(folded_codepoints.empty());
// Test that every folded codepoint is equal to its folded equivalent
for (auto& folded_codepoint : folded_codepoints) {
EXPECT_EQ(jessilib::fold(folded_codepoint.in_codepoint), folded_codepoint.out_codepoint)
<< std::hex << std::uppercase << "lhs: " << std::setw(4) << folded_codepoint.in_codepoint << "; rhs: " << std::setw(8) << folded_codepoint.out_codepoint;
EXPECT_TRUE(jessilib::equalsi(folded_codepoint.in_codepoint, folded_codepoint.out_codepoint));
}
// If the above failed, print out what the fold table needs to be replaced with
if (::testing::Test::HasFailure()) {
auto folding_sets = folding_sets_from_folding_info(folded_codepoints);
ASSERT_FALSE(folding_sets.empty());
for (auto& set : folding_sets) {
// Generate code in the format: { folding_set::mode_type::constant, 0x1234, 0x2345, 1 },
// Range bounds are printed as zero-padded uppercase hex; the diff is switched back to decimal
std::cout << std::hex << std::uppercase << std::setfill('0') << "{ "
<< "folding_set::mode_type::" << set.type()
<< ", "
<< "0x" << std::setw(4) << set.range_start << ", "
<< "0x" << std::setw(4) << set.range_end << ", "
<< std::dec << set.diff
<< " }," << std::endl;
}
std::cout << "folded_codepoints.size(): " << folded_codepoints.size() << "; folding_sets.size(): " << folding_sets.size() << std::endl;
}
// Test that every non-folded codepoint in [0, 0xFFFF] is equal to itself; we already tested folded ones above
for (char32_t codepoint = 0; codepoint != 0x10000; ++codepoint) {
// NOTE(review): binary_search needs folded_codepoints ordered by in_codepoint; this relies on the data file
// listing its mappings in ascending codepoint order -- confirm if the data source ever changes
bool is_folded = std::binary_search(folded_codepoints.begin(), folded_codepoints.end(), codepoint);
if (!is_folded) {
EXPECT_EQ(codepoint, jessilib::fold(codepoint));
EXPECT_TRUE(jessilib::equalsi(codepoint, codepoint));
}
}
}

Loading…
Cancel
Save