From f57fde422bb409ba0fedb042e870ce970d4afc6b Mon Sep 17 00:00:00 2001 From: Jessica James Date: Tue, 7 Dec 2021 00:45:24 -0600 Subject: [PATCH] Replace ntmbs_to_u8string with mbstring_to_ustring; add ustring_to_mbstring for completeness --- src/common/app_parameters.cpp | 136 ++++++++++++------------ src/include/jessilib/app_parameters.hpp | 39 ++++--- src/include/jessilib/unicode.hpp | 60 +++++++++++ 3 files changed, 149 insertions(+), 86 deletions(-) diff --git a/src/common/app_parameters.cpp b/src/common/app_parameters.cpp index 3e49bf8..ed9ccae 100644 --- a/src/common/app_parameters.cpp +++ b/src/common/app_parameters.cpp @@ -17,80 +17,77 @@ */ #include "app_parameters.hpp" -#include #include "jessilib/unicode.hpp" #include "jessilib/split.hpp" namespace jessilib { -// Convert null-terminated multi-byte string to UTF-8 -std::u8string ntmbs_to_u8string(const char* in_ntmbs) { - std::u8string result; - - std::string_view ntmbs_view = in_ntmbs; - std::mbstate_t mbstate{}; - char32_t codepoint{}; - while (!ntmbs_view.empty()) { - size_t bytes_read = std::mbrtoc32(&codepoint, ntmbs_view.data(), ntmbs_view.size(), &mbstate); - if (bytes_read > ntmbs_view.size()) { - // Some sort of error; just break - break; - } - // bytes_read will never be 0 except for null characters, which are excluded from our view; here for future reuse - bytes_read = std::max(size_t{1}, bytes_read); - ntmbs_view.remove_prefix(bytes_read); - encode_codepoint(result, codepoint); - } - - return result; -} - -std::vector vectorize(const char** in_argv) { - std::vector result; - if (in_argv == nullptr) { +template, char>>* = nullptr> +std::vector vectorize(CharT** in_ntarg_array) { + std::vector result; + if (in_ntarg_array == nullptr) { return result; } - for (auto argv = in_argv; *argv != nullptr; ++argv) { - result.emplace_back(ntmbs_to_u8string(*argv)); + for (auto argv = in_ntarg_array; *argv != nullptr; ++argv) { + result.emplace_back(mbstring_to_ustring(*argv).second); } + return result; } 
-std::vector vectorize(const wchar_t** in_argv) { - std::vector result; - if (in_argv == nullptr) { +template, wchar_t>>* = nullptr> +std::vector vectorize(CharT** in_ntarg_array) { + std::vector result; + if (in_ntarg_array == nullptr) { return result; } - for (auto argv = in_argv; *argv != nullptr; ++argv) { + for (auto argv = in_ntarg_array; *argv != nullptr; ++argv) { result.emplace_back(jessilib::string_cast(std::wstring_view{ *argv })); } + return result; } -app_parameters::app_parameters(int in_argc, char** in_argv, char** in_envp) - : app_parameters{ in_argc, const_cast(in_argv), const_cast(in_envp) } { +app_parameters::app_parameters(int, char** in_argv, char** in_envp) + : app_parameters{ vectorize(in_argv), vectorize(in_envp) } { // Empty ctor body } app_parameters::app_parameters(int, const char** in_argv, const char** in_envp) : app_parameters{ vectorize(in_argv), vectorize(in_envp) } { + // Empty ctor body } -app_parameters::app_parameters(int in_argc, wchar_t** in_argv, wchar_t** in_envp) - : app_parameters{ in_argc, const_cast(in_argv), const_cast(in_envp) } { +app_parameters::app_parameters(int, wchar_t** in_argv, wchar_t** in_envp) + : app_parameters{ vectorize(in_argv), vectorize(in_envp) } { + // Empty ctor body } app_parameters::app_parameters(int, const wchar_t** in_argv, const wchar_t** in_envp) : app_parameters{ vectorize(in_argv), vectorize(in_envp) } { + // Empty ctor body +} + +#ifdef __cpp_lib_generic_unordered_lookup +#define WRAP_MAP_KEY(in_key) in_key +#else // We can't use std::string_view for InKeyType until GCC 11 & clang 12, and I still want to support GCC 9 +#define WRAP_MAP_KEY(in_key) static_cast(in_key) +#endif // __cpp_lib_generic_unordered_lookup + +bool app_parameters::has_switch(string_view_type in_switch) const { + return m_switches_set.find(WRAP_MAP_KEY(in_switch)) != m_switches_set.end(); } -app_parameters::app_parameters(std::vector in_args, std::vector in_env) { +app_parameters::app_parameters(std::vector in_args, 
std::vector in_env) { // Parse in environment variables first to ensure they're parsed before the early-out for (const auto& env : in_env) { auto split = jessilib::split_once(env, u8'='); - m_env_values[split.first] = split.second; + m_values[split.first] = split.second; + m_env_values[split.first] = std::move(split.second); } // Sanity safety check; should never happen @@ -102,8 +99,8 @@ app_parameters::app_parameters(std::vector in_args, std::vector in_args, std::vector in_args, std::vector in_args, std::vector& app_parameters::arguments() const { +const std::vector& app_parameters::arguments() const { return m_args; } -const std::vector& app_parameters::switches() const { +const std::vector& app_parameters::switches() const { return m_switches; } @@ -178,10 +177,18 @@ const app_parameters::set_type& app_parameters::switches_set() const { return m_switches_set; } -const app_parameters::map_type& app_parameters::values() const { +const app_parameters::map_type& app_parameters::arg_values() const { return m_arg_values; } +const app_parameters::map_type& app_parameters::env_values() const { + return m_env_values; +} + +const app_parameters::map_type& app_parameters::values() const { + return m_values; +} + object app_parameters::as_object() const { using namespace std::literals; @@ -192,26 +199,17 @@ object app_parameters::as_object() const { return object{}; } - return std::map{ + return std::map{ { u8"Path"s, m_path }, { u8"Args"s, m_args }, { u8"Switches"s, m_switches }, - { u8"Values"s, m_arg_values }, - { u8"Env"s, m_env_values } + { u8"ArgValues"s, m_arg_values }, + { u8"EnvValues"s, m_env_values }, + { u8"Values"s, m_values } }; } -#ifdef __cpp_lib_generic_unordered_lookup -#define WRAP_MAP_KEY(in_key) in_key -#else // We can't use std::string_view for InKeyType until GCC 11 & clang 12, and I still want to support GCC 9 -#define WRAP_MAP_KEY(in_key) static_cast(in_key) -#endif // __cpp_lib_generic_unordered_lookup - -bool 
app_parameters::has_switch(std::u8string_view in_switch) const { - return m_switches_set.find(WRAP_MAP_KEY(in_switch)) != m_switches_set.end(); -} - -std::u8string_view app_parameters::get_arg_value(std::u8string_view in_key, std::u8string_view in_default) const { +app_parameters::string_view_type app_parameters::get_arg_value(string_view_type in_key, string_view_type in_default) const { auto result = m_arg_values.find(WRAP_MAP_KEY(in_key)); // Safety check @@ -222,7 +220,7 @@ std::u8string_view app_parameters::get_arg_value(std::u8string_view in_key, std: return result->second; } -std::u8string_view app_parameters::get_env_value(std::u8string_view in_key, std::u8string_view in_default) const { +app_parameters::string_view_type app_parameters::get_env_value(string_view_type in_key, string_view_type in_default) const { auto result = m_env_values.find(WRAP_MAP_KEY(in_key)); // Safety check @@ -233,15 +231,15 @@ std::u8string_view app_parameters::get_env_value(std::u8string_view in_key, std: return result->second; } -std::u8string_view app_parameters::get_value(std::u8string_view in_key, std::u8string_view in_default) const { - // Explicit args take priority - auto result = m_arg_values.find(WRAP_MAP_KEY(in_key)); - if (result != m_arg_values.end()) { - return result->second; +app_parameters::string_view_type app_parameters::get_value(string_view_type in_key, string_view_type in_default) const { + auto result = m_values.find(WRAP_MAP_KEY(in_key)); + + // Safety check + if (result == m_values.end()) { + return in_default; } - // Fallback to env - return get_env_value(in_key, in_default); + return result->second; } } // namespace jessilib diff --git a/src/include/jessilib/app_parameters.hpp b/src/include/jessilib/app_parameters.hpp index 82306bc..f3b4ed3 100644 --- a/src/include/jessilib/app_parameters.hpp +++ b/src/include/jessilib/app_parameters.hpp @@ -23,37 +23,42 @@ namespace jessilib { class app_parameters { public: - using set_type = std::unordered_set; - using 
map_type = std::unordered_map; + using string_type = std::u8string; + using string_view_type = std::u8string_view; + using set_type = std::unordered_set; + using map_type = std::unordered_map; app_parameters(int in_argc, char** in_argv, char** in_envp = nullptr); app_parameters(int in_argc, const char** in_argv, const char** in_envp = nullptr); app_parameters(int in_argc, wchar_t** in_argv, wchar_t** in_envp = nullptr); app_parameters(int in_argc, const wchar_t** in_argv, const wchar_t** in_envp = nullptr); - app_parameters(std::vector in_args, std::vector in_env = {}); + app_parameters(std::vector in_args, std::vector in_env = {}); - std::u8string_view path() const; - const std::vector& arguments() const; + [[nodiscard]] string_view_type path() const; + [[nodiscard]] const std::vector& arguments() const; - const std::vector& switches() const; - const set_type& switches_set() const; - const map_type& values() const; - jessilib::object as_object() const; + [[nodiscard]] const std::vector& switches() const; + [[nodiscard]] const set_type& switches_set() const; + [[nodiscard]] const map_type& arg_values() const; + [[nodiscard]] const map_type& env_values() const; + [[nodiscard]] const map_type& values() const; + [[nodiscard]] jessilib::object as_object() const; - bool has_switch(std::u8string_view in_switch) const; - std::u8string_view get_arg_value(std::u8string_view in_key, std::u8string_view in_default = {}) const; - std::u8string_view get_env_value(std::u8string_view in_key, std::u8string_view in_default = {}) const; - std::u8string_view get_value(std::u8string_view in_key, std::u8string_view in_default = {}) const; + [[nodiscard]] bool has_switch(string_view_type in_switch) const; + [[nodiscard]] string_view_type get_arg_value(string_view_type in_key, string_view_type in_default = {}) const; + [[nodiscard]] string_view_type get_env_value(string_view_type in_key, string_view_type in_default = {}) const; + [[nodiscard]] string_view_type get_value(string_view_type 
in_key, string_view_type in_default = {}) const; - operator jessilib::object() const { return as_object(); } + [[nodiscard]] inline operator jessilib::object() const { return as_object(); } private: - std::u8string m_path; - std::vector m_args; - std::vector m_switches; + string_type m_path; + std::vector m_args; + std::vector m_switches; set_type m_switches_set; map_type m_arg_values; map_type m_env_values; + map_type m_values; }; } // namespace jessilib diff --git a/src/include/jessilib/unicode.hpp b/src/include/jessilib/unicode.hpp index 78c0787..ec6f6e9 100644 --- a/src/include/jessilib/unicode.hpp +++ b/src/include/jessilib/unicode.hpp @@ -18,6 +18,7 @@ #pragma once +#include #include "unicode_compare.hpp" namespace jessilib { @@ -176,6 +177,65 @@ std::basic_string string_cast(const InT& in_string) { } } +/** + * Recodes a multi-byte string into a unicode-encoded string + * + * @tparam CharT Character type for resulting unicode string + * @param in_mbs Multibyte string to recode + * @return A pair containing a boolean which is true on success, and a unicode string + */ +template +std::pair> mbstring_to_ustring(std::string_view in_mbstring) { + std::pair> result; + + std::mbstate_t mbstate{}; + while (!in_mbstring.empty()) { + char32_t codepoint{}; + size_t bytes_read = std::mbrtoc32(&codepoint, in_mbstring.data(), in_mbstring.size(), &mbstate); + if (bytes_read > in_mbstring.size()) { + // Some sort of error; return + result.first = false; + return result; + } + + // bytes_read will never be 0 except for null characters, which are excluded from our view; here for future reuse + bytes_read = std::max(size_t{1}, bytes_read); + in_mbstring.remove_prefix(bytes_read); + encode_codepoint(result.second, codepoint); + } + + result.first = true; + return result; +} + +/** + * Recodes a unicode string into a multi-byte string + * @tparam CharT Character type for input unicode string + * @param in_string Unicode string to recode + * @return A pair containing a boolean 
which is true on success, and a multi-byte string
+ */
+template<typename CharT>
+std::pair<bool, std::string> ustring_to_mbstring(std::basic_string_view<CharT> in_string) {
+	std::pair<bool, std::string> result;
+
+	std::mbstate_t mbstate{}; // conversion state carried across successive c32rtomb calls
+	decode_result decode;
+	while ((decode = decode_codepoint(in_string)).units != 0) { // assign the decode result first, then test .units
+		char buffer[MB_CUR_MAX]; // NOTE(review): MB_CUR_MAX is not a constant expression, so this relies on a VLA extension; MB_LEN_MAX (<climits>) is the portable bound
+		size_t bytes_written = std::c32rtomb(buffer, decode.codepoint, &mbstate);
+		if (bytes_written > MB_CUR_MAX) {
+			// Invalid codepoint (c32rtomb reports failure as static_cast<size_t>(-1)); return
+			result.first = false;
+			return result;
+		}
+		in_string.remove_prefix(decode.units); // consume the decoded units; nothing else shortens the view, so the loop would otherwise never advance
+		result.second.append(buffer, bytes_written);
+	}
+
+	result.first = true;
+	return result;
+}
+
 /**
  * Searches a string for a specified substring
  *