Browse Source

Replace jessilib::encoding with jessilib::text_encoding

master
Jessica James 3 years ago
parent
commit
fdc9b2bc28
  1. 2
      src/bot/console/console.cpp
  2. 23
      src/common/config.cpp
  3. 7
      src/common/parser/parser.cpp
  4. 22
      src/common/parsers/json.cpp
  5. 10
      src/common/serialize.cpp
  6. 11
      src/include/jessilib/config.hpp
  7. 14
      src/include/jessilib/object.hpp
  8. 12
      src/include/jessilib/parser.hpp
  9. 4
      src/include/jessilib/parsers/json.hpp
  10. 5
      src/include/jessilib/serialize.hpp
  11. 224
      src/include/jessilib/text_encoding.hpp
  12. 61
      src/include/jessilib/unicode_base.hpp
  13. 28
      src/test/parser.cpp

2
src/bot/console/console.cpp

@ -33,7 +33,7 @@ void console_input_loop() {
std::wstring input; std::wstring input;
auto shutdown_future = get_shutdown_future(); auto shutdown_future = get_shutdown_future();
while (shutdown_future.wait_for(std::chrono::milliseconds(10)) != std::future_status::ready) { while (shutdown_future.wait_for(std::chrono::milliseconds(10)) != std::future_status::ready) {
std::getline(std::wcin, input); // TODO: use a non-bloicking call and poll running periodically? std::getline(std::wcin, input); // TODO: use a non-blocking call and poll running periodically?
jessibot::io::console_command_context context{ jessilib::string_cast<char8_t>(input) }; jessibot::io::console_command_context context{ jessilib::string_cast<char8_t>(input) };
if (!command_manager::instance().execute_command(context)) { if (!command_manager::instance().execute_command(context)) {
text error_text{ u8"ERROR", text::property::bold, color{ 0xFF0000 }}; text error_text{ u8"ERROR", text::property::bold, color{ 0xFF0000 }};

23
src/common/config.cpp

@ -53,6 +53,11 @@ std::string config::format() const {
return m_format; return m_format;
} }
// Returns the text encoding currently stored in m_encoding.
// The read is performed while holding a shared lock on m_mutex.
text_encoding config::encoding() const {
	std::shared_lock read_guard{ m_mutex };
	return m_encoding;
}
/** Modifiers */ /** Modifiers */
void config::set_data(const object& in_data) { void config::set_data(const object& in_data) {
std::lock_guard<std::shared_mutex> guard{ m_mutex }; std::lock_guard<std::shared_mutex> guard{ m_mutex };
@ -60,16 +65,17 @@ void config::set_data(const object& in_data) {
} }
/** File I/O */ /** File I/O */
void config::load(const std::filesystem::path& in_filename, const std::string& in_format) { void config::load(const std::filesystem::path& in_filename, const std::string& in_format, text_encoding in_encoding) {
jessilib_assert(!in_filename.empty()); jessilib_assert(!in_filename.empty());
std::lock_guard<std::shared_mutex> guard{ m_mutex }; std::lock_guard<std::shared_mutex> guard{ m_mutex };
// Determine format // Determine format
m_filename = in_filename; m_filename = in_filename;
m_format = get_format(m_filename, in_format); m_format = get_format(m_filename, in_format);
m_encoding = in_encoding;
// Load // Load
m_data = read_object(m_filename, m_format); m_data = read_object(m_filename, m_format, m_encoding);
} }
void config::reload() { void config::reload() {
@ -92,20 +98,21 @@ void config::write() const {
} }
} }
void config::write(const std::filesystem::path& in_filename , const std::string& in_format) { void config::write(const std::filesystem::path& in_filename , const std::string& in_format, text_encoding in_encoding) {
jessilib_assert(!in_filename.empty()); jessilib_assert(!in_filename.empty());
std::lock_guard<std::shared_mutex> guard{ m_mutex }; std::lock_guard<std::shared_mutex> guard{ m_mutex };
// Setup // Setup
m_filename = in_filename; m_filename = in_filename;
m_format = get_format(m_filename, in_format); m_format = get_format(m_filename, in_format);
m_encoding = in_encoding;
// Write // Write
write_object(m_data, m_filename, m_format); write_object(m_data, m_filename, m_format, m_encoding);
} }
/** Static File I/O */ /** Static File I/O */
object config::read_object(const std::filesystem::path& in_filename, const std::string& in_format) { object config::read_object(const std::filesystem::path& in_filename, const std::string& in_format, text_encoding in_encoding) {
// Open up file for reading // Open up file for reading
std::ifstream file{ in_filename, std::ios::in | std::ios::binary }; std::ifstream file{ in_filename, std::ios::in | std::ios::binary };
if (!file) { if (!file) {
@ -114,10 +121,10 @@ object config::read_object(const std::filesystem::path& in_filename, const std::
} }
// Deserialize1 // Deserialize1
return deserialize_object(file, get_format(in_filename, in_format)); return deserialize_object(file, get_format(in_filename, in_format), in_encoding);
} }
void config::write_object(const object& in_object, const std::filesystem::path& in_filename, const std::string& in_format) { void config::write_object(const object& in_object, const std::filesystem::path& in_filename, const std::string& in_format, text_encoding in_encoding) {
// Open up file for writing // Open up file for writing
std::ofstream file{ in_filename, std::ios::out | std::ios::binary }; std::ofstream file{ in_filename, std::ios::out | std::ios::binary };
if (!file) { if (!file) {
@ -126,7 +133,7 @@ void config::write_object(const object& in_object, const std::filesystem::path&
} }
// Deserialize1 // Deserialize1
return serialize_object(file, in_object, get_format(in_filename, in_format)); return serialize_object(file, in_object, get_format(in_filename, in_format), in_encoding);
} }
std::string config::get_format(const std::filesystem::path& in_filename, const std::string& in_format) { std::string config::get_format(const std::filesystem::path& in_filename, const std::string& in_format) {

7
src/common/parser/parser.cpp

@ -17,12 +17,11 @@
*/ */
#include "parser.hpp" #include "parser.hpp"
#include "unicode.hpp"
#include <istream> #include <istream>
namespace jessilib { namespace jessilib {
object parser::deserialize_bytes(std::istream& in_stream, encoding in_read_encoding) { object parser::deserialize_bytes(std::istream& in_stream, text_encoding in_read_encoding) {
std::vector<byte_type> data; std::vector<byte_type> data;
// Read entire stream into data // Read entire stream into data
@ -36,10 +35,10 @@ object parser::deserialize_bytes(std::istream& in_stream, encoding in_read_encod
return deserialize_bytes(bytes_view_type{ &data.front(), data.size() }, in_read_encoding); return deserialize_bytes(bytes_view_type{ &data.front(), data.size() }, in_read_encoding);
} }
void parser::serialize_bytes(std::ostream& in_stream, const object& in_object, encoding in_write_encoding) { void parser::serialize_bytes(std::ostream& in_stream, const object& in_object, text_encoding in_write_encoding) {
// TODO: replace this method // TODO: replace this method
auto bytes = serialize_bytes(in_object, in_write_encoding); auto bytes = serialize_bytes(in_object, in_write_encoding);
in_stream << bytes; in_stream.write(bytes.data(), bytes.size());
} }
} // namespace jessilib } // namespace jessilib

22
src/common/parsers/json.cpp

@ -20,26 +20,26 @@
namespace jessilib { namespace jessilib {
object json_parser::deserialize_bytes(bytes_view_type in_data, encoding in_write_encoding) { object json_parser::deserialize_bytes(bytes_view_type in_data, text_encoding in_write_encoding) {
object result; object result;
if (in_write_encoding == encoding::utf_8) { if (in_write_encoding == text_encoding::utf_8) {
std::u8string_view data_view = jessilib::string_view_cast<char8_t>(in_data); std::u8string_view data_view = jessilib::string_view_cast<char8_t>(in_data);
deserialize_json<char8_t, true>(result, data_view); deserialize_json<char8_t, true>(result, data_view);
} }
else if (in_write_encoding == encoding::utf_16) { else if (in_write_encoding == text_encoding::utf_16) {
std::u16string_view data_view = jessilib::string_view_cast<char16_t>(in_data); std::u16string_view data_view = jessilib::string_view_cast<char16_t>(in_data);
deserialize_json<char16_t, true>(result, data_view); deserialize_json<char16_t, true>(result, data_view);
} }
else if (in_write_encoding == encoding::utf_32) { else if (in_write_encoding == text_encoding::utf_32) {
std::u32string_view data_view = jessilib::string_view_cast<char32_t>(in_data); std::u32string_view data_view = jessilib::string_view_cast<char32_t>(in_data);
deserialize_json<char32_t, true>(result, data_view); deserialize_json<char32_t, true>(result, data_view);
} }
else if (in_write_encoding == encoding::wchar) { else if (in_write_encoding == text_encoding::wchar) {
std::wstring_view data_view = jessilib::string_view_cast<wchar_t>(in_data); std::wstring_view data_view = jessilib::string_view_cast<wchar_t>(in_data);
deserialize_json<wchar_t, true>(result, data_view); deserialize_json<wchar_t, true>(result, data_view);
} }
else if (in_write_encoding == encoding::multibyte) { else if (in_write_encoding == text_encoding::multibyte) {
// TODO: support without copying... somehow // TODO: support without copying... somehow
auto u8_data = mbstring_to_ustring<char8_t>(jessilib::string_view_cast<char>(in_data)); auto u8_data = mbstring_to_ustring<char8_t>(jessilib::string_view_cast<char>(in_data));
std::u8string_view data_view = u8_data.second; std::u8string_view data_view = u8_data.second;
@ -49,15 +49,15 @@ object json_parser::deserialize_bytes(bytes_view_type in_data, encoding in_write
return result; return result;
} }
std::string json_parser::serialize_bytes(const object& in_object, encoding in_write_encoding) { std::string json_parser::serialize_bytes(const object& in_object, text_encoding in_write_encoding) {
switch (in_write_encoding) { switch (in_write_encoding) {
case encoding::utf_8: case text_encoding::utf_8:
return serialize_impl<char8_t, char>(in_object); return serialize_impl<char8_t, char>(in_object);
case encoding::utf_16: case text_encoding::utf_16:
return serialize_impl<char16_t, char>(in_object); return serialize_impl<char16_t, char>(in_object);
case encoding::utf_32: case text_encoding::utf_32:
return serialize_impl<char16_t, char>(in_object); return serialize_impl<char16_t, char>(in_object);
case encoding::wchar: case text_encoding::wchar:
return serialize_impl<char16_t, char>(in_object); return serialize_impl<char16_t, char>(in_object);
default: default:
break; break;

10
src/common/serialize.cpp

@ -53,8 +53,8 @@ object deserialize_object(std::u8string_view in_data, const std::string& in_form
return get_parser(in_format)->deserialize(in_data); return get_parser(in_format)->deserialize(in_data);
} }
object deserialize_object(std::istream& in_stream, const std::string& in_format) { object deserialize_object(std::istream& in_stream, const std::string& in_format, text_encoding in_encoding) {
return get_parser(in_format)->deserialize_bytes(in_stream, encoding::utf_8); return get_parser(in_format)->deserialize_bytes(in_stream, in_encoding);
} }
/** Serialization */ /** Serialization */
@ -62,8 +62,10 @@ std::u8string serialize_object(const object& in_object, const std::string& in_fo
return get_parser(in_format)->serialize<char8_t>(in_object); return get_parser(in_format)->serialize<char8_t>(in_object);
} }
void serialize_object(std::ostream& in_stream, const object& in_object, const std::string& in_format) { void serialize_object(std::ostream& in_stream, const object& in_object, const std::string& in_format, text_encoding in_encoding) {
get_parser(in_format)->serialize_bytes(in_stream, in_object, encoding::utf_8); in_object.get<object::string_view_type>(object::string_view_type{});
get_parser(in_format)->serialize_bytes(in_stream, in_object, in_encoding);
} }
} // namespace jessilib } // namespace jessilib

11
src/include/jessilib/config.hpp

@ -24,6 +24,7 @@
#include <filesystem> #include <filesystem>
#include <shared_mutex> #include <shared_mutex>
#include "object.hpp" #include "object.hpp"
#include "text_encoding.hpp"
namespace jessilib { namespace jessilib {
@ -41,19 +42,20 @@ public:
object data() const; object data() const;
std::filesystem::path filename() const; std::filesystem::path filename() const;
std::string format() const; std::string format() const;
text_encoding encoding() const;
/** Modifiers */ /** Modifiers */
void set_data(const object& in_data); void set_data(const object& in_data);
/** File I/O */ /** File I/O */
void load(const std::filesystem::path& in_filename, const std::string& in_format = {}); void load(const std::filesystem::path& in_filename, const std::string& in_format = {}, text_encoding in_encoding = text_encoding::utf_8);
void reload(); void reload();
void write() const; void write() const;
void write(const std::filesystem::path& in_filename , const std::string& in_format = {}); void write(const std::filesystem::path& in_filename, const std::string& in_format = {}, text_encoding in_encoding = text_encoding::utf_8);
/** Static File I/O */ /** Static File I/O */
static object read_object(const std::filesystem::path& in_filename, const std::string& in_format = {}); static object read_object(const std::filesystem::path& in_filename, const std::string& in_format = {}, text_encoding in_encoding = text_encoding::utf_8);
static void write_object(const object& in_object, const std::filesystem::path& in_filename, const std::string& in_format = {}); static void write_object(const object& in_object, const std::filesystem::path& in_filename, const std::string& in_format = {}, text_encoding in_encoding = text_encoding::utf_8);
/** Static helpers */ /** Static helpers */
static std::string get_format(const std::filesystem::path& in_filename, const std::string& in_format = {}); static std::string get_format(const std::filesystem::path& in_filename, const std::string& in_format = {});
@ -62,6 +64,7 @@ private:
mutable std::shared_mutex m_mutex; mutable std::shared_mutex m_mutex;
object m_data; object m_data;
std::string m_format; std::string m_format;
// Encoding used when reading/writing the backing file. Initialized so that encoding()
// never reads an indeterminate value; utf_8 matches the default argument of load()/write().
text_encoding m_encoding{ text_encoding::utf_8 };
std::filesystem::path m_filename; std::filesystem::path m_filename;
}; };

14
src/include/jessilib/object.hpp

@ -252,12 +252,24 @@ public:
T get(DefaultT&& in_default_value) const { T get(DefaultT&& in_default_value) const {
const string_type* result = std::get_if<string_type>(&m_value); const string_type* result = std::get_if<string_type>(&m_value);
if (result != nullptr) { if (result != nullptr) {
return *result; return T{ *result };
} }
return { in_default_value.begin(), in_default_value.end() }; return { in_default_value.begin(), in_default_value.end() };
} }
// TODO: support other basic_string_view types
template<typename T, typename DefaultT = T,
typename std::enable_if<std::is_same<T, string_view_type>::value && std::is_same<typename std::decay<DefaultT>::type, string_view_type>::value>::type* = nullptr>
T get(DefaultT&& in_default_value) const {
const string_type* result = std::get_if<string_type>(&m_value);
if (result != nullptr) {
return T{ *result };
}
return in_default_value;
}
/** arrays */ /** arrays */
// reference getter (array_type) // reference getter (array_type)

12
src/include/jessilib/parser.hpp

@ -20,7 +20,7 @@
#include <memory> #include <memory>
#include "object.hpp" #include "object.hpp"
#include "unicode_base.hpp" #include "text_encoding.hpp"
#include "impl/parser_manager.hpp" #include "impl/parser_manager.hpp"
namespace jessilib { namespace jessilib {
@ -40,15 +40,15 @@ public:
* @param in_stream Stream to deserialize object from * @param in_stream Stream to deserialize object from
* @return A valid (possibly null) object * @return A valid (possibly null) object
*/ */
virtual object deserialize_bytes(std::istream& in_stream, encoding in_read_encoding); virtual object deserialize_bytes(std::istream& in_stream, text_encoding in_read_encoding);
virtual object deserialize_bytes(bytes_view_type in_data, encoding in_read_encoding) = 0; virtual object deserialize_bytes(bytes_view_type in_data, text_encoding in_read_encoding) = 0;
virtual void serialize_bytes(std::ostream& in_stream, const object& in_object, encoding in_write_encoding); virtual void serialize_bytes(std::ostream& in_stream, const object& in_object, text_encoding in_write_encoding);
virtual std::string serialize_bytes(const object& in_object, encoding in_write_encoding) = 0; virtual std::string serialize_bytes(const object& in_object, text_encoding in_write_encoding) = 0;
template<typename CharT> template<typename CharT>
object deserialize(std::basic_string_view<CharT> in_text) { object deserialize(std::basic_string_view<CharT> in_text) {
bytes_view_type byte_view{ reinterpret_cast<const byte_type*>(in_text.data()), in_text.size() * sizeof(CharT) }; bytes_view_type byte_view{ reinterpret_cast<const byte_type*>(in_text.data()), in_text.size() * sizeof(CharT) };
return deserialize_bytes(byte_view, default_encoding_info<CharT>::text_encoding); return deserialize_bytes(byte_view, default_encoding_info<CharT>::encoding);
} }
// Perhaps this could be condensed down to a simple method such that: serialize(out_variant, in_object, encoding)? // Perhaps this could be condensed down to a simple method such that: serialize(out_variant, in_object, encoding)?

4
src/include/jessilib/parsers/json.hpp

@ -30,8 +30,8 @@ namespace jessilib {
class json_parser : public parser { class json_parser : public parser {
public: public:
/** deserialize/serialize overrides */ /** deserialize/serialize overrides */
object deserialize_bytes(bytes_view_type in_data, encoding in_write_encoding) override; object deserialize_bytes(bytes_view_type in_data, text_encoding in_write_encoding) override;
std::string serialize_bytes(const object& in_object, encoding in_write_encoding) override; std::string serialize_bytes(const object& in_object, text_encoding in_write_encoding) override;
std::u8string serialize_u8(const object& in_object) override { return serialize_impl<char8_t>(in_object); } std::u8string serialize_u8(const object& in_object) override { return serialize_impl<char8_t>(in_object); }
std::u16string serialize_u16(const object& in_object) override { return serialize_impl<char16_t>(in_object); } std::u16string serialize_u16(const object& in_object) override { return serialize_impl<char16_t>(in_object); }

5
src/include/jessilib/serialize.hpp

@ -20,6 +20,7 @@
#include <istream> #include <istream>
#include "object.hpp" #include "object.hpp"
#include "text_encoding.hpp"
namespace jessilib { namespace jessilib {
@ -34,10 +35,10 @@ object deserialize_object(std::u16string_view in_data, const std::string& in_for
object deserialize_object(std::u32string_view in_data, const std::string& in_format); object deserialize_object(std::u32string_view in_data, const std::string& in_format);
object deserialize_object(const std::vector<char8_t>& in_data, const std::string& in_format); object deserialize_object(const std::vector<char8_t>& in_data, const std::string& in_format);
//object deserialize_object(std::u8string_view in_data, const std::string& in_format); //object deserialize_object(std::u8string_view in_data, const std::string& in_format);
object deserialize_object(std::istream& in_stream, const std::string& in_format); // TODO: add encoding param object deserialize_object(std::istream& in_stream, const std::string& in_format, text_encoding in_encoding = text_encoding::unknown);
/** Serialization */ /** Serialization */
std::u8string serialize_object(const object& in_object, const std::string& in_format); // TODO: templatize? std::u8string serialize_object(const object& in_object, const std::string& in_format); // TODO: templatize?
void serialize_object(std::ostream& in_stream, const object& in_object, const std::string& in_format); // TODO: add encoding param void serialize_object(std::ostream& in_stream, const object& in_object, const std::string& in_format, text_encoding in_encoding = text_encoding::unknown);
} // namespace jessilib } // namespace jessilib

224
src/include/jessilib/text_encoding.hpp

@ -0,0 +1,224 @@
/**
* Copyright (C) 2021 Jessica James.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Written by Jessica James <jessica.aj@outlook.com>
*/
#pragma once
#include <bit>
#include <string_view>
namespace jessilib {
// Code point U+FEFF, used as the byte-order mark (BOM)
static constexpr char32_t byte_order_mark_codepoint = 0xFEFF;
// The encodings below treat wchar_t as either a 16-bit or a 32-bit unit; reject any other size at compile time
static_assert(sizeof(wchar_t) == sizeof(char16_t) || sizeof(wchar_t) == sizeof(char32_t),
"Unexpected wchar_t size; neither char16 nor char32");
// Text encodings that can be named when reading or writing text.
// The *_native / *_foreign enumerators and the bare utf_16 / utf_32 aliases are resolved at
// compile time from std::endian::native, so they always compare equal to one of the explicit
// *_little / *_big enumerators.
enum class text_encoding {
utf_8 = 0,
// UTF-16, explicit byte order
utf_16_little = 1,
utf_16_big = 2,
// UTF-16, byte order relative to the build target
utf_16_native = (std::endian::native == std::endian::little ? utf_16_little : utf_16_big),
utf_16_foreign = (std::endian::native == std::endian::little ? utf_16_big : utf_16_little),
utf_16 = utf_16_native, // Alias for native
// UTF-32, explicit byte order
utf_32_little = 3,
utf_32_big = 4,
// UTF-32, byte order relative to the build target
utf_32_native = (std::endian::native == std::endian::little ? utf_32_little : utf_32_big),
utf_32_foreign = (std::endian::native == std::endian::little ? utf_32_big : utf_32_little),
utf_32 = utf_32_native, // Alias for native
wchar = 5, // essentially only really for std::wcout / std::wcin
multibyte = 6, // essentially only really for std::cout / std::cin
//wchar = (sizeof(wchar_t) == sizeof(char16_t) ? utf_16 : utf_32),
unknown // no explicit value: takes the next value after multibyte
};
// Compile-time traits for a text_encoding value; only the explicit specializations are defined
template<text_encoding EncodingV>
struct encoding_info;

// Traits for UTF-8
template<>
struct encoding_info<text_encoding::utf_8> {
	// Identity
	static constexpr text_encoding encoding = text_encoding::utf_8;
	using data_type = char8_t;
	static constexpr size_t bom_byte_size = 3; // bytes occupied by the encoded byte-order mark

	// Byte-order traits: flagged agnostic, so neither little nor big, and always native
	static constexpr bool is_agnostic = true;
	static constexpr bool is_native = true;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_little = false;
	static constexpr bool is_big = false;
};
// Traits for UTF-16 with little-endian code units
template<>
struct encoding_info<text_encoding::utf_16_little> {
	using data_type = char16_t;
	static constexpr bool is_little = true;
	static constexpr bool is_big = false;
	// Native only when the build target itself is little-endian
	static constexpr bool is_native = std::endian::native == std::endian::little;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_agnostic = false;
	static constexpr size_t bom_byte_size = 2; // bytes occupied by the encoded byte-order mark
	// Names this exact byte order. text_encoding::utf_16 is only an alias for the native
	// variant, so using it here would report utf_16_big on big-endian targets.
	static constexpr text_encoding encoding = text_encoding::utf_16_little;
};
// Traits for UTF-16 with big-endian code units
template<>
struct encoding_info<text_encoding::utf_16_big> {
	using data_type = char16_t;
	static constexpr bool is_little = false;
	static constexpr bool is_big = true;
	// Native only when the build target itself is big-endian
	static constexpr bool is_native = std::endian::native == std::endian::big;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_agnostic = false;
	static constexpr size_t bom_byte_size = 2; // bytes occupied by the encoded byte-order mark
	// Names this exact byte order. text_encoding::utf_16 is only an alias for the native
	// variant, so using it here would report utf_16_little on little-endian targets.
	static constexpr text_encoding encoding = text_encoding::utf_16_big;
};
// Traits for UTF-32 with little-endian code units
template<>
struct encoding_info<text_encoding::utf_32_little> {
	using data_type = char32_t;
	static constexpr bool is_little = true;
	static constexpr bool is_big = false;
	// Native only when the build target itself is little-endian
	static constexpr bool is_native = std::endian::native == std::endian::little;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_agnostic = false;
	static constexpr size_t bom_byte_size = 4; // bytes occupied by the encoded byte-order mark
	// Names this exact byte order. text_encoding::utf_32 is only an alias for the native
	// variant, so using it here would report utf_32_big on big-endian targets.
	static constexpr text_encoding encoding = text_encoding::utf_32_little;
};
// Traits for UTF-32 with big-endian code units
template<>
struct encoding_info<text_encoding::utf_32_big> {
	using data_type = char32_t;
	static constexpr bool is_little = false;
	static constexpr bool is_big = true;
	// Native only when the build target itself is big-endian
	static constexpr bool is_native = std::endian::native == std::endian::big;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_agnostic = false;
	static constexpr size_t bom_byte_size = 4; // bytes occupied by the encoded byte-order mark
	// Names this exact byte order. text_encoding::utf_32 is only an alias for the native
	// variant, so using it here would report utf_32_little on little-endian targets.
	static constexpr text_encoding encoding = text_encoding::utf_32_big;
};
// Traits for the platform wide-character encoding (wchar_t)
template<>
struct encoding_info<text_encoding::wchar> {
	// Identity
	static constexpr text_encoding encoding = text_encoding::wchar;
	using data_type = wchar_t;
	static constexpr size_t bom_byte_size = 0; // BOMs are not supported for wchar at this time

	// Byte-order traits: always the build target's own order
	static constexpr bool is_native = true;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_little = std::endian::native == std::endian::little;
	static constexpr bool is_big = std::endian::native == std::endian::big;
	// Not truly agnostic, but wchar_t is for system-local use only... probably.
	static constexpr bool is_agnostic = true;
};
// Traits for the multibyte (char-based) encoding
template<>
struct encoding_info<text_encoding::multibyte> {
	// Identity
	static constexpr text_encoding encoding = text_encoding::multibyte;
	using data_type = char;
	static constexpr size_t bom_byte_size = 0; // no byte-order mark

	// Byte-order traits: flagged agnostic, so neither little nor big, and always native
	static constexpr bool is_agnostic = true;
	static constexpr bool is_native = true;
	static constexpr bool is_foreign = !is_native;
	static constexpr bool is_little = false;
	static constexpr bool is_big = false;
};
// Maps a character type to the encoding_info of its default encoding.
// Only the character types specialized below are supported.
template<typename CharT>
struct default_encoding_info;

// char8_t -> UTF-8
template<>
struct default_encoding_info<char8_t> : encoding_info<text_encoding::utf_8> {};

// char16_t -> UTF-16 (native byte order)
template<>
struct default_encoding_info<char16_t> : encoding_info<text_encoding::utf_16> {};

// char32_t -> UTF-32 (native byte order)
template<>
struct default_encoding_info<char32_t> : encoding_info<text_encoding::utf_32> {};

// wchar_t -> wchar
template<>
struct default_encoding_info<wchar_t> : encoding_info<text_encoding::wchar> {};
/**
* Unicode byte-order markers
*
* Encodings that peek_bom() can report from a leading byte-order mark. The numeric values are
* independent of text_encoding (note utf_32_little / utf_32_big are 4 / 5 here), so the two
* enums must not be converted by value.
*/
enum class bom_encoding {
utf_8 = 0,
utf_16_little = 1,
utf_16_big = 2,
// Resolved at compile time from the build target's byte order
utf_16_native = (std::endian::native == std::endian::little ? utf_16_little : utf_16_big),
utf_16 = utf_16_native,
utf_32_little = 4,
utf_32_big = 5,
// Resolved at compile time from the build target's byte order
utf_32_native = (std::endian::native == std::endian::little ? utf_32_little : utf_32_big),
utf_32 = utf_32_native,
unknown = 0xFF // no recognized byte-order mark
};
/**
 * Inspects the start of in_data for a Unicode byte-order mark without consuming anything.
 *
 * If this results in a non-native encoding, the swaps have to be done passing to decode_character
 *
 * @param in_data Raw bytes to inspect
 * @return The encoding indicated by the leading BOM, or bom_encoding::unknown when in_data is
 * too short or does not begin with a recognized BOM
 */
constexpr bom_encoding peek_bom(std::string_view in_data) {
	// Byte sequences of each recognized BOM; lengths are explicit because of embedded NULs
	constexpr std::string_view utf_16_big_bom{ "\xFE\xFF", 2 };
	constexpr std::string_view utf_16_little_bom{ "\xFF\xFE", 2 };
	constexpr std::string_view utf_32_little_bom{ "\xFF\xFE\0\0", 4 };
	constexpr std::string_view utf_8_bom{ "\xEF\xBB\xBF", 3 };
	constexpr std::string_view utf_32_big_bom{ "\0\0\xFE\xFF", 4 };

	// True when in_data is at least as long as in_bom and starts with exactly those bytes
	const auto begins_with = [in_data](std::string_view in_bom) {
		return in_data.substr(0, in_bom.size()) == in_bom;
	};

	if (begins_with(utf_16_big_bom)) {
		return bom_encoding::utf_16_big;
	}

	if (begins_with(utf_16_little_bom)) {
		// The UTF-32 LE BOM starts with the UTF-16 LE BOM, so the longer match takes priority
		return begins_with(utf_32_little_bom)
			? bom_encoding::utf_32_little
			: bom_encoding::utf_16_little;
	}

	if (begins_with(utf_8_bom)) {
		return bom_encoding::utf_8;
	}

	if (begins_with(utf_32_big_bom)) {
		return bom_encoding::utf_32_big;
	}

	// No matches; return unknown (most likely encoded as UTF-8)
	return bom_encoding::unknown;
}
} // namespace jessilib

61
src/include/jessilib/unicode_base.hpp

@ -169,67 +169,6 @@ struct unicode_traits<wchar_t> : std::true_type {
template<typename CharT> template<typename CharT>
using encode_buffer_type = CharT[unicode_traits<CharT>::max_units_per_codepoint]; using encode_buffer_type = CharT[unicode_traits<CharT>::max_units_per_codepoint];
// enum representing the character encodings I intend to support
// (enumerators have no explicit values; they are numbered in declaration order starting at 0)
enum class encoding {
utf_8, // The most common and arguably superior encoding for files and networking protocols not in straight ASCII
utf_16,
utf_32,
wchar, // essentially only really for std::wcout / std::wcin
multibyte // essentially only really for std::cout / std::cin
};
// Compile-time traits for an encoding value: the code-unit type (data_type) and the
// encoding value itself (text_encoding). Only the explicit specializations are defined.
template<encoding EncodingV>
struct encoding_info;

// UTF-8 -> char8_t
template<>
struct encoding_info<encoding::utf_8> {
	static constexpr encoding text_encoding = encoding::utf_8;
	using data_type = char8_t;
};

// UTF-16 -> char16_t
template<>
struct encoding_info<encoding::utf_16> {
	static constexpr encoding text_encoding = encoding::utf_16;
	using data_type = char16_t;
};

// UTF-32 -> char32_t
template<>
struct encoding_info<encoding::utf_32> {
	static constexpr encoding text_encoding = encoding::utf_32;
	using data_type = char32_t;
};

// wchar -> wchar_t
template<>
struct encoding_info<encoding::wchar> {
	static constexpr encoding text_encoding = encoding::wchar;
	using data_type = wchar_t;
};

// multibyte -> char
template<>
struct encoding_info<encoding::multibyte> {
	static constexpr encoding text_encoding = encoding::multibyte;
	using data_type = char;
};
// Maps a character type to the encoding_info of its default encoding.
// Only the character types specialized below are supported.
template<typename CharT>
struct default_encoding_info;

// char8_t -> UTF-8
template<>
struct default_encoding_info<char8_t> : encoding_info<encoding::utf_8> {};

// char16_t -> UTF-16
template<>
struct default_encoding_info<char16_t> : encoding_info<encoding::utf_16> {};

// char32_t -> UTF-32
template<>
struct default_encoding_info<char32_t> : encoding_info<encoding::utf_32> {};

// wchar_t -> wchar
template<>
struct default_encoding_info<wchar_t> : encoding_info<encoding::wchar> {};
/** single-unit helper utilities */ /** single-unit helper utilities */
char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output) char32_t fold(char32_t in_codepoint); // Folds codepoint for case-insensitive checks (not for human output)
constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise constexpr int as_base(char32_t in_character, unsigned int base); // The value represented by in_character in terms of base if valid, -1 otherwise

28
src/test/parser.cpp

@ -30,50 +30,54 @@ using namespace std::literals;
class test_parser : public parser { class test_parser : public parser {
public: public:
/** deserialize/serialize overrides */ /** deserialize/serialize overrides */
object deserialize_bytes(bytes_view_type in_data, encoding in_write_encoding) override { object deserialize_bytes(bytes_view_type in_data, text_encoding in_write_encoding) override {
std::u8string u8_string; std::u8string u8_string;
switch (in_write_encoding) { switch (in_write_encoding) {
case encoding::utf_8: case text_encoding::utf_8:
u8_string = string_view_cast<char8_t>(in_data); u8_string = string_view_cast<char8_t>(in_data);
break; break;
case encoding::utf_16: case text_encoding::utf_16:
u8_string = jessilib::string_cast<char8_t>(string_view_cast<char16_t>(in_data)); u8_string = jessilib::string_cast<char8_t>(string_view_cast<char16_t>(in_data));
break; break;
case encoding::utf_32: case text_encoding::utf_32:
u8_string = jessilib::string_cast<char8_t>(string_view_cast<char32_t>(in_data)); u8_string = jessilib::string_cast<char8_t>(string_view_cast<char32_t>(in_data));
break; break;
case encoding::wchar: case text_encoding::wchar:
u8_string = jessilib::string_cast<char8_t>(string_view_cast<wchar_t>(in_data)); u8_string = jessilib::string_cast<char8_t>(string_view_cast<wchar_t>(in_data));
break; break;
case encoding::multibyte: case text_encoding::multibyte:
u8_string = mbstring_to_ustring<char8_t>(string_view_cast<char>(in_data)).second; u8_string = mbstring_to_ustring<char8_t>(string_view_cast<char>(in_data)).second;
break; break;
default:
break;
} }
return deserialize_impl(std::u8string_view{ u8_string }); return deserialize_impl(std::u8string_view{ u8_string });
} }
std::string serialize_bytes(const object& in_object, encoding in_write_encoding) override { std::string serialize_bytes(const object& in_object, text_encoding in_write_encoding) override {
std::u8string u8_serialized = serialize_impl(in_object); std::u8string u8_serialized = serialize_impl(in_object);
switch (in_write_encoding) { switch (in_write_encoding) {
case encoding::utf_8: case text_encoding::utf_8:
return { u8_serialized.begin(), u8_serialized.end() }; return { u8_serialized.begin(), u8_serialized.end() };
case encoding::utf_16: { case text_encoding::utf_16: {
auto casted = string_cast<char16_t>(u8_serialized); auto casted = string_cast<char16_t>(u8_serialized);
return { reinterpret_cast<const char*>(casted.data()), casted.size() * sizeof(char16_t) }; return { reinterpret_cast<const char*>(casted.data()), casted.size() * sizeof(char16_t) };
} }
case encoding::utf_32: { case text_encoding::utf_32: {
auto casted = string_cast<char32_t>(u8_serialized); auto casted = string_cast<char32_t>(u8_serialized);
return { reinterpret_cast<const char*>(casted.data()), casted.size() * sizeof(char32_t) }; return { reinterpret_cast<const char*>(casted.data()), casted.size() * sizeof(char32_t) };
} }
case encoding::wchar: { case text_encoding::wchar: {
auto casted = string_cast<wchar_t>(u8_serialized); auto casted = string_cast<wchar_t>(u8_serialized);
return { reinterpret_cast<const char*>(casted.data()), casted.size() * sizeof(wchar_t) }; return { reinterpret_cast<const char*>(casted.data()), casted.size() * sizeof(wchar_t) };
} }
case encoding::multibyte: case text_encoding::multibyte:
return ustring_to_mbstring(u8_serialized).second; return ustring_to_mbstring(u8_serialized).second;
default:
break;
} }
return {}; return {};

Loading…
Cancel
Save