mirror of https://github.com/JAJames/jessilib.git
Jessica James
3 years ago
13 changed files with 308 additions and 117 deletions
@ -0,0 +1,224 @@ |
|||
/**
|
|||
* Copyright (C) 2021 Jessica James. |
|||
* |
|||
* Permission to use, copy, modify, and/or distribute this software for any |
|||
* purpose with or without fee is hereby granted, provided that the above |
|||
* copyright notice and this permission notice appear in all copies. |
|||
* |
|||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
|||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
|||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|||
* |
|||
* Written by Jessica James <jessica.aj@outlook.com> |
|||
*/ |
|||
|
|||
#pragma once |
|||
|
|||
#include <bit> |
|||
#include <string_view> |
|||
|
|||
namespace jessilib { |
|||
|
|||
static constexpr char32_t byte_order_mark_codepoint = 0xFEFF; |
|||
|
|||
static_assert(sizeof(wchar_t) == sizeof(char16_t) || sizeof(wchar_t) == sizeof(char32_t), |
|||
"Unexpected wchar_t size; neither char16 nor char32"); |
|||
|
|||
enum class text_encoding { |
|||
utf_8 = 0, |
|||
|
|||
utf_16_little = 1, |
|||
utf_16_big = 2, |
|||
utf_16_native = (std::endian::native == std::endian::little ? utf_16_little : utf_16_big), |
|||
utf_16_foreign = (std::endian::native == std::endian::little ? utf_16_big : utf_16_little), |
|||
utf_16 = utf_16_native, // Alias for native
|
|||
|
|||
utf_32_little = 3, |
|||
utf_32_big = 4, |
|||
utf_32_native = (std::endian::native == std::endian::little ? utf_32_little : utf_32_big), |
|||
utf_32_foreign = (std::endian::native == std::endian::little ? utf_32_big : utf_32_little), |
|||
utf_32 = utf_32_native, // Alias for native
|
|||
|
|||
wchar = 5, // essentially only really for std::wcout / std::wcout
|
|||
multibyte = 6, // essentially only really for std::cout / std::cin
|
|||
//wchar = (sizeof(wchar_t) == sizeof(char16_t) ? utf_16 : utf_32),
|
|||
|
|||
unknown |
|||
}; |
|||
|
|||
template<text_encoding EncodingV> |
|||
struct encoding_info; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::utf_8> { |
|||
using data_type = char8_t; |
|||
static constexpr bool is_little = false; |
|||
static constexpr bool is_big = false; |
|||
static constexpr bool is_native = true; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = true; |
|||
static constexpr size_t bom_byte_size = 3; |
|||
static constexpr text_encoding encoding = text_encoding::utf_8; |
|||
}; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::utf_16_little> { |
|||
using data_type = char16_t; |
|||
static constexpr bool is_little = true; |
|||
static constexpr bool is_big = false; |
|||
static constexpr bool is_native = std::endian::native == std::endian::little; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = false; |
|||
static constexpr size_t bom_byte_size = 2; |
|||
static constexpr text_encoding encoding = text_encoding::utf_16; |
|||
}; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::utf_16_big> { |
|||
using data_type = char16_t; |
|||
static constexpr bool is_little = false; |
|||
static constexpr bool is_big = true; |
|||
static constexpr bool is_native = std::endian::native == std::endian::big; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = false; |
|||
static constexpr size_t bom_byte_size = 2; |
|||
static constexpr text_encoding encoding = text_encoding::utf_16; |
|||
}; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::utf_32_little> { |
|||
using data_type = char32_t; |
|||
static constexpr bool is_little = true; |
|||
static constexpr bool is_big = false; |
|||
static constexpr bool is_native = std::endian::native == std::endian::little; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = false; |
|||
static constexpr size_t bom_byte_size = 4; |
|||
static constexpr text_encoding encoding = text_encoding::utf_32; |
|||
}; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::utf_32_big> { |
|||
using data_type = char32_t; |
|||
static constexpr bool is_little = false; |
|||
static constexpr bool is_big = true; |
|||
static constexpr bool is_native = std::endian::native == std::endian::big; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = false; |
|||
static constexpr size_t bom_byte_size = 4; |
|||
static constexpr text_encoding encoding = text_encoding::utf_32; |
|||
}; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::wchar> { |
|||
using data_type = wchar_t; |
|||
static constexpr bool is_little = std::endian::native == std::endian::little; |
|||
static constexpr bool is_big = std::endian::native == std::endian::big; |
|||
static constexpr bool is_native = true; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = true; // Not truly agnostic, but wchar_t is for system-local use only... probably.
|
|||
static constexpr size_t bom_byte_size = 0; // Not supporting for wchar at this time
|
|||
static constexpr text_encoding encoding = text_encoding::wchar; |
|||
}; |
|||
|
|||
template<> |
|||
struct encoding_info<text_encoding::multibyte> { |
|||
using data_type = char; |
|||
static constexpr bool is_little = false; |
|||
static constexpr bool is_big = false; |
|||
static constexpr bool is_native = true; |
|||
static constexpr bool is_foreign = !is_native; |
|||
static constexpr bool is_agnostic = true; |
|||
static constexpr size_t bom_byte_size = 0; |
|||
static constexpr text_encoding encoding = text_encoding::multibyte; |
|||
}; |
|||
|
|||
template<typename CharT> |
|||
struct default_encoding_info; |
|||
|
|||
template<> |
|||
struct default_encoding_info<char8_t> : public encoding_info<text_encoding::utf_8> { |
|||
}; |
|||
|
|||
template<> |
|||
struct default_encoding_info<char16_t> : public encoding_info<text_encoding::utf_16> { |
|||
}; |
|||
|
|||
template<> |
|||
struct default_encoding_info<char32_t> : public encoding_info<text_encoding::utf_32> { |
|||
}; |
|||
|
|||
template<> |
|||
struct default_encoding_info<wchar_t> : public encoding_info<text_encoding::wchar> { |
|||
}; |
|||
|
|||
/** Unicode byte-order markers */ |
|||
|
|||
enum class bom_encoding { |
|||
utf_8 = 0, |
|||
|
|||
utf_16_little = 1, |
|||
utf_16_big = 2, |
|||
utf_16_native = (std::endian::native == std::endian::little ? utf_16_little : utf_16_big), |
|||
utf_16 = utf_16_native, |
|||
|
|||
utf_32_little = 4, |
|||
utf_32_big = 5, |
|||
utf_32_native = (std::endian::native == std::endian::little ? utf_32_little : utf_32_big), |
|||
utf_32 = utf_32_native, |
|||
|
|||
unknown = 0xFF |
|||
}; |
|||
|
|||
// If this results in a non-native encoding, the swaps have to be done passing to decode_character
|
|||
constexpr bom_encoding peek_bom(std::string_view in_data) { |
|||
if (in_data.size() < 2) { |
|||
// Not enough space for any BOM
|
|||
return bom_encoding::unknown; |
|||
} |
|||
|
|||
// Try UTF-16 BE
|
|||
if (in_data[0] == '\xFE' && in_data[1] == '\xFF') { |
|||
return bom_encoding::utf_16_big; |
|||
} |
|||
|
|||
// Try UTF-16 LE
|
|||
if (in_data[0] == '\xFF' && in_data[1] == '\xFE') { |
|||
// Check UTF-32 LE
|
|||
if (in_data.size() >= 4 |
|||
&& in_data[2] == 0 && in_data[3] == 0) { |
|||
return bom_encoding::utf_32_little; |
|||
} |
|||
|
|||
return bom_encoding::utf_16_little; |
|||
} |
|||
|
|||
if (in_data.size() < 3) { |
|||
// Not enough space for any other BOMs
|
|||
return bom_encoding::unknown; |
|||
} |
|||
|
|||
// Try UTF-8
|
|||
if (in_data[0] == '\xEF' && in_data[1] == '\xBB' && in_data[2] == '\xBF') { |
|||
return bom_encoding::utf_8; |
|||
} |
|||
|
|||
if (in_data.size() < 4) { |
|||
// Not enough space for any other BOMs
|
|||
return bom_encoding::unknown; |
|||
} |
|||
|
|||
// Try UTF-32 BE
|
|||
if (in_data[0] == 0 && in_data[1] == 0 && in_data[2] == '\xFE' && in_data[3] == '\xFF') { |
|||
return bom_encoding::utf_32_big; |
|||
} |
|||
|
|||
// No matches; return unknown (mostly likely encoded as UTF-8)
|
|||
return bom_encoding::unknown; |
|||
} |
|||
|
|||
} // namespace jessilib
|
Loading…
Reference in new issue