diff options
author | Dominick Allen <djallen@librehumanitas.org> | 2024-09-22 12:41:28 -0500 |
---|---|---|
committer | Dominick Allen <djallen@librehumanitas.org> | 2024-09-22 12:41:28 -0500 |
commit | 7da829d48f9059c83ab9cada2c850621e8bbd3f3 (patch) | |
tree | 314e7a5b645e910d4997e3bee980bd2024f3087d /include/utf8.hpp | |
parent | bf81e34921e3e30b05313efbcf5c9fa839cb7c05 (diff) |
Basics of library.
Diffstat (limited to 'include/utf8.hpp')
-rw-r--r-- | include/utf8.hpp | 557 |
1 files changed, 0 insertions, 557 deletions
diff --git a/include/utf8.hpp b/include/utf8.hpp deleted file mode 100644 index c66d93c..0000000 --- a/include/utf8.hpp +++ /dev/null @@ -1,557 +0,0 @@ -/* - * libfud - * Copyright 2024 Dominick Allen - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FUD_UTF8_HPP -#define FUD_UTF8_HPP - -#include "array.hpp" -#include "memory.hpp" -#include "status.hpp" -#include "unique_array.hpp" - -/* -#include "ext_hash.hpp" -#include "ext_set.hpp" -*/ - -#include <cstdint> -#include <optional> -#include <type_traits> - -namespace fud { - -using utf8 = unsigned char; - -struct StringView; - -constexpr uint8_t ASCII_MASK = 0x7F; - -constexpr uint8_t UTF8_MB_PATTERN_MASK = 0xC0; -constexpr uint8_t UTF8_MB_PATTERN = 0x80; -constexpr uint8_t UTF8_MB_MASK = static_cast<uint8_t>(~UTF8_MB_PATTERN_MASK); - -constexpr uint8_t UTF8_2B_PATTERN_MASK = 0xE0; -constexpr uint8_t UTF8_2B_PATTERN = 0xC0; -constexpr uint8_t UTF8_2B_MASK = static_cast<uint8_t>(~UTF8_2B_PATTERN_MASK); - -constexpr uint8_t UTF8_3B_PATTERN_MASK = 0xF0; -constexpr uint8_t UTF8_3B_PATTERN = 0xE0; -constexpr uint8_t UTF8_3B_MASK = static_cast<uint8_t>(~UTF8_3B_PATTERN_MASK); - -constexpr uint8_t UTF8_4B_PATTERN_MASK = 0xF8; -constexpr uint8_t UTF8_4B_PATTERN = 0xF0; -constexpr uint8_t UTF8_4B_MASK = static_cast<uint8_t>(~UTF8_4B_PATTERN_MASK); - -namespace privateImpl { -constexpr bool validUtf8MB(uint8_t code) noexcept -{ - return (code & UTF8_MB_PATTERN_MASK) == UTF8_MB_PATTERN; -} -} // namespace privateImpl - -struct Ascii { - Array<uint8_t, 1> characters; - - constexpr Ascii() noexcept = default; - - constexpr explicit Ascii(uint8_t chr) noexcept : characters{{chr}} - { - } - - [[nodiscard]] constexpr uint8_t character() const noexcept - { - return characters[0]; - } - - [[nodiscard]] constexpr char asChar() const noexcept - { - return static_cast<char>(characters[0]); - } - - static constexpr size_t size() noexcept - { - return 1; - } - - [[nodiscard]] constexpr bool valid() const noexcept - { - return valid(characters[0]); - } - - static constexpr bool valid(uint8_t character) noexcept - { - return static_cast<uint8_t>(character & ~ASCII_MASK) == 0; - } - - auto operator<=>(const Ascii& other) const noexcept = default; -}; - -static_assert(std::is_trivial_v<Ascii>); -static_assert(std::is_standard_layout_v<Ascii>); - -/* -| B | E | Byte 1 | Byte 2 | Byte 3 | Byte 4 -| U+0000 | U+007F | 0xxxxxxx | | | -| U+0080 | U+07FF | 110xxxxx | 10xxxxxx | | -| U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | -| U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx -*/ - -struct Utf82Byte { - constexpr Utf82Byte(uint8_t first, uint8_t second) noexcept : characters{{first, second}} - { - } - Array<uint8_t, 2> characters; - static constexpr size_t size() noexcept - { - return 2; - } - - [[nodiscard]] constexpr bool valid() const noexcept - { - return valid(first(), second()); - } - - static constexpr bool valid(uint8_t first, uint8_t second) noexcept - { - using privateImpl::validUtf8MB; - return ((first & UTF8_2B_PATTERN_MASK) == UTF8_2B_PATTERN) && validUtf8MB(second); - } - - [[nodiscard]] constexpr uint8_t first() const noexcept - { - return characters[0]; - } - - [[nodiscard]] constexpr uint8_t second() const noexcept - { - return characters[1]; - } - - auto operator<=>(const Utf82Byte& other) const noexcept = default; -}; - -struct Utf83Byte { - constexpr Utf83Byte(uint8_t first, uint8_t second, uint8_t third) noexcept : characters{{first, second, third}} - { - } - - Array<uint8_t, 3> characters; - - static constexpr size_t size() noexcept - { - return 3; - } - - [[nodiscard]] constexpr bool valid() const noexcept - { - return valid(first(), second(), third()); - } - - static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third) noexcept - { - using privateImpl::validUtf8MB; - return ((first & UTF8_3B_PATTERN_MASK) == UTF8_3B_PATTERN) && validUtf8MB(second) && validUtf8MB(third); - } - - [[nodiscard]] constexpr uint8_t first() const noexcept - { - return characters[0]; - } - - [[nodiscard]] constexpr uint8_t second() const noexcept - { - return characters[1]; - } - - [[nodiscard]] constexpr uint8_t third() const noexcept - { - return characters[2]; - } - - auto operator<=>(const Utf83Byte& other) const noexcept = default; -}; - -struct Utf84Byte { - constexpr Utf84Byte(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept : - characters{{first, second, third, fourth}} - { - } - - Array<uint8_t, 4> characters; - - static constexpr size_t size() noexcept - { - return 4; - } - - [[nodiscard]] constexpr bool valid() const noexcept - { - return valid(first(), second(), third(), fourth()); - } - - static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept - { - using privateImpl::validUtf8MB; - if ((first & UTF8_4B_PATTERN_MASK) != UTF8_4B_PATTERN) { - return false; - } - return validUtf8MB(second) && validUtf8MB(third) && validUtf8MB(fourth); - } - - [[nodiscard]] constexpr uint8_t first() const noexcept - { - return characters[0]; - } - - [[nodiscard]] constexpr uint8_t second() const noexcept - { - return characters[1]; - } - - [[nodiscard]] constexpr uint8_t third() const noexcept - { - return characters[2]; - } - - [[nodiscard]] constexpr uint8_t fourth() const noexcept - { - return characters[3]; - } - - auto operator<=>(const Utf84Byte& other) const noexcept = default; -}; - -using Utf8Variant = std::variant<Ascii, Utf82Byte, Utf83Byte, Utf84Byte>; - -constexpr auto ExtUtf8TypeSet{UniqueArray<size_t, 0, 1, 2, 3>{}}; -enum class ExtUtf8Type : uint8_t -{ - Ascii, - Utf82Byte, - Utf83Byte, - Utf84Byte, -}; -static_assert(ExtUtf8TypeSet.m_values[0] == static_cast<uint8_t>(ExtUtf8Type::Ascii)); -static_assert(ExtUtf8TypeSet.m_values[1] == static_cast<uint8_t>(ExtUtf8Type::Utf82Byte)); -static_assert(ExtUtf8TypeSet.m_values[2] == static_cast<uint8_t>(ExtUtf8Type::Utf83Byte)); -static_assert(ExtUtf8TypeSet.m_values[3] == static_cast<uint8_t>(ExtUtf8Type::Utf84Byte)); - -class String; -class StringView; - -struct ExtUtf8 { - Utf8Variant m_variant{Utf8Variant{Ascii{}}}; - - static constexpr Ascii invalidAsciiCode{Ascii{0xFF}}; - static ExtUtf8 fromString(const String& fudString, size_t index) noexcept; - static ExtUtf8 fromStringView(StringView&& fudView, size_t index) noexcept; - static ExtUtf8 fromStringView(const StringView& fudView, size_t index) noexcept; - - static constexpr ExtUtf8 makeUtf8(Array<utf8, 4>& data) - { - ExtUtf8 unicode{}; - if (Ascii::valid(data[0])) { - unicode.m_variant = Ascii{data[0]}; - } else if (Utf82Byte::valid(data[0], data[1])) { - unicode.m_variant = Utf82Byte{data[0], data[1]}; - } else if (Utf83Byte::valid(data[0], data[1], data[2])) { - unicode.m_variant = Utf83Byte{data[0], data[1], data[2]}; - } else if (Utf84Byte::valid(data[0], data[1], data[2], data[3])) { - unicode.m_variant = Utf84Byte{data[0], data[1], data[2], data[3]}; - } else { - unicode.m_variant = invalidAsciiCode; - } - return unicode; - } - - static constexpr ExtUtf8 makeUtf8(const Ascii& utf8Char) - { - ExtUtf8 unicode{{Utf8Variant{Ascii{}}}}; - if (utf8Char.valid()) { - unicode.m_variant = utf8Char; - } else { - unicode.m_variant = invalidAsciiCode; - } - return unicode; - } - - static constexpr ExtUtf8 invalidAscii() - { - ExtUtf8 utf8{}; - utf8.m_variant = Ascii{invalidAsciiCode}; - return utf8; - } - - [[nodiscard]] constexpr ExtUtf8Type getType() const - { - return static_cast<ExtUtf8Type>(m_variant.index()); - } - - [[nodiscard]] constexpr bool isAscii() const - { - return getType() == ExtUtf8Type::Ascii; - } - - [[nodiscard]] constexpr bool valid() const noexcept - { - switch (m_variant.index()) { - case static_cast<size_t>(ExtUtf8Type::Ascii): - return std::get<Ascii>(m_variant).valid(); - case static_cast<size_t>(ExtUtf8Type::Utf82Byte): - return std::get<Utf82Byte>(m_variant).valid(); - case static_cast<size_t>(ExtUtf8Type::Utf83Byte): - return std::get<Utf83Byte>(m_variant).valid(); - case static_cast<size_t>(ExtUtf8Type::Utf84Byte): - return std::get<Utf84Byte>(m_variant).valid(); - default: // unlikely - return false; - } - } - - [[nodiscard]] constexpr size_t size() const noexcept - { - if (!valid()) { - return 0; - } - switch (m_variant.index()) { - case static_cast<size_t>(ExtUtf8Type::Ascii): - return Ascii::size(); - case static_cast<size_t>(ExtUtf8Type::Utf82Byte): - return Utf82Byte::size(); - case static_cast<size_t>(ExtUtf8Type::Utf83Byte): - return Utf83Byte::size(); - case static_cast<size_t>(ExtUtf8Type::Utf84Byte): - return Utf84Byte::size(); - default: // unlikely - return 0; - } - } - - [[nodiscard]] constexpr const uint8_t* data() const noexcept - { - if (!valid()) { - return nullptr; - } - - switch (m_variant.index()) { - case static_cast<size_t>(ExtUtf8Type::Ascii): - return std::get<Ascii>(m_variant).characters.data(); - case static_cast<size_t>(ExtUtf8Type::Utf82Byte): - return std::get<Utf82Byte>(m_variant).characters.data(); - case static_cast<size_t>(ExtUtf8Type::Utf83Byte): - return std::get<Utf83Byte>(m_variant).characters.data(); - case static_cast<size_t>(ExtUtf8Type::Utf84Byte): - return std::get<Utf84Byte>(m_variant).characters.data(); - default: // unlikely - return nullptr; - } - } - - template <typename Func> - [[nodiscard]] bool transformAscii(Func&& transform) - { - if (isAscii()) { - std::forward<Func>(transform)(std::get<Ascii>(m_variant)); - return true; - } - return false; - } - - [[nodiscard]] constexpr int64_t hash() const noexcept - { - using fud::ExtUtf8Type; - using fud::Utf82Byte; - using fud::Utf83Byte; - using fud::Utf84Byte; - - if (!valid()) { - return -1; - } - - constexpr uint8_t OneByteShift = 8; - constexpr uint8_t TwoByteShift = 2 * OneByteShift; - constexpr uint8_t ThreeByteShift = 3 * OneByteShift; - - switch (static_cast<ExtUtf8Type>(m_variant.index())) { - case ExtUtf8Type::Ascii: - return std::get<Ascii>(m_variant).characters[0]; - case ExtUtf8Type::Utf82Byte: - return static_cast<int64_t>(std::get<Utf82Byte>(m_variant).characters[0]) << OneByteShift | - static_cast<int64_t>(std::get<Utf82Byte>(m_variant).characters[1]); - case ExtUtf8Type::Utf83Byte: - return static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[0]) << TwoByteShift | - static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[1]) << OneByteShift | - static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[2]); - case ExtUtf8Type::Utf84Byte: - return static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[0]) << ThreeByteShift | - static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[1]) << TwoByteShift | - static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[2]) << OneByteShift | - static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[3]); - default: // unlikely - return -1; - } - } - - constexpr bool operator==(const ExtUtf8& other) const noexcept = default; - - constexpr auto operator<=>(const ExtUtf8& other) const noexcept - { - auto hasSameAlternative = []<typename T>(const ExtUtf8& lhs, const ExtUtf8& rhs) noexcept { - return std::holds_alternative<T>(lhs.m_variant) && std::holds_alternative<T>(rhs.m_variant); - }; - - auto getSameAlternative = []<typename T>(const ExtUtf8& lhs, const ExtUtf8& rhs) noexcept { - return std::get<T>(lhs.m_variant).operator<=>(std::get<T>(rhs.m_variant)); - }; - - if (hasSameAlternative.template operator()<Ascii>(*this, other)) { - return getSameAlternative.template operator()<Ascii>(*this, other); - } - - if (hasSameAlternative.template operator()<Utf82Byte>(*this, other)) { - return getSameAlternative.template operator()<Utf82Byte>(*this, other); - } - - if (hasSameAlternative.template operator()<Utf83Byte>(*this, other)) { - return getSameAlternative.template operator()<Utf83Byte>(*this, other); - } - - if (hasSameAlternative.template operator()<Utf84Byte>(*this, other)) { - return getSameAlternative.template operator()<Utf84Byte>(*this, other); - } - - if (std::holds_alternative<Ascii>(m_variant)) { - return std::strong_ordering::less; - } - - if (std::holds_alternative<Ascii>(other.m_variant)) { - return std::strong_ordering::greater; - } - - if (std::holds_alternative<Utf82Byte>(m_variant)) { - return std::strong_ordering::less; - } - - if (std::holds_alternative<Utf82Byte>(other.m_variant)) { - return std::strong_ordering::greater; - } - - if (std::holds_alternative<Utf83Byte>(m_variant)) { - return std::strong_ordering::less; - } - - return std::strong_ordering::greater; - } - - std::optional<Ascii> getAscii() const - { - if (m_variant.index() == static_cast<size_t>(ExtUtf8Type::Ascii)) { - return std::get<Ascii>(m_variant); - } - return std::nullopt; - } -}; - -/** \brief Checks if a character is ascii. */ -bool ext_lib_char_is_ascii(char character); - -FudStatus ext_lib_utf8_is_ascii(ExtUtf8& character, bool& isAscii); - -/** \brief Checks if a character is alphanumeric. */ -bool ext_lib_char_is_alphanumeric(char character); - -/** \brief Checks if a character is alphanumeric. */ -FudStatus ext_lib_utf8_is_alphanumeric(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is alphabetic. */ -bool ext_lib_char_is_alpha(char character); - -/** \brief Checks if a character is alphabetic. */ -FudStatus ext_lib_utf8_is_alpha(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is lowercase. */ -bool ext_lib_char_is_lowercase(char character); - -/** \brief Checks if a character is lowercase. */ -FudStatus ext_lib_utf8_is_lowercase(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is an uppercase character. */ -bool ext_lib_char_is_uppercase(char character); - -/** \brief Checks if a character is uppercase. */ -FudStatus ext_lib_utf8_is_uppercase(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a digit. */ -bool ext_lib_char_is_digit(char character); - -/** \brief Checks if a character is a digit. */ -FudStatus ext_lib_utf8_is_digit(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a hexadecimal character. */ -bool ext_lib_char_is_hex_digit(char character); - -/** \brief Checks if a character is a hexadecimal digit. */ -FudStatus ext_lib_utf8_is_hex_digit(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a control character. */ -bool ext_lib_char_is_control(char character); - -/** \brief Checks if a character is a control character. */ -FudStatus ext_lib_utf8_is_control(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a graphical character. */ -bool ext_lib_char_is_graphical(char character); - -/** \brief Checks if a character is a graphical character. */ -FudStatus ext_lib_utf8_is_graphical(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a space character. */ -bool ext_lib_char_is_space(char character); - -/** \brief Checks if a character is a space character. */ -FudStatus ext_lib_utf8_is_space(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a blank character. */ -bool ext_lib_char_is_blank(char character); - -/** \brief Checks if a character is a blank character. */ -FudStatus ext_lib_utf8_is_blank(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a printable character. */ -bool ext_lib_char_is_printable(char character); - -/** \brief Checks if a character is a printable character. */ -FudStatus ext_lib_utf8_is_printable(ExtUtf8* character, bool* pred); - -/** \brief Checks if a character is a punctuation character. */ -bool ext_lib_char_is_punctuation(char character); - -/** \brief Checks if a character is a punctuation character. */ -FudStatus ext_lib_utf8_is_punctuation(ExtUtf8* character, bool* pred); - -uint8_t ext_lib_char_to_lower(uint8_t character); - -ExtUtf8* ext_lib_utf8_to_lower(ExtUtf8* character); - -uint8_t ext_lib_char_to_upper(uint8_t character); - -ExtUtf8* ext_lib_utf8_to_upper(ExtUtf8* character); - -} // namespace fud - -#endif |