/* * libfud * Copyright 2024 Dominick Allen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef FUD_UTF8_HPP #define FUD_UTF8_HPP #include "fud_array.hpp" #include "fud_memory.hpp" #include "fud_status.hpp" #include "fud_unique_array.hpp" #include #include #include namespace fud { using utf8 = unsigned char; struct StringView; constexpr uint8_t ASCII_MASK = 0x7F; constexpr uint8_t UTF8_MB_PATTERN_MASK = 0xC0; constexpr uint8_t UTF8_MB_PATTERN = 0x80; constexpr uint8_t UTF8_MB_MASK = static_cast(~UTF8_MB_PATTERN_MASK); constexpr uint8_t UTF8_2B_PATTERN_MASK = 0xE0; constexpr uint8_t UTF8_2B_PATTERN = 0xC0; constexpr uint8_t UTF8_2B_MASK = static_cast(~UTF8_2B_PATTERN_MASK); constexpr uint8_t UTF8_3B_PATTERN_MASK = 0xF0; constexpr uint8_t UTF8_3B_PATTERN = 0xE0; constexpr uint8_t UTF8_3B_MASK = static_cast(~UTF8_3B_PATTERN_MASK); constexpr uint8_t UTF8_4B_PATTERN_MASK = 0xF8; constexpr uint8_t UTF8_4B_PATTERN = 0xF0; constexpr uint8_t UTF8_4B_MASK = static_cast(~UTF8_4B_PATTERN_MASK); namespace privateImpl { constexpr bool validUtf8MB(uint8_t code) noexcept { return (code & UTF8_MB_PATTERN_MASK) == UTF8_MB_PATTERN; } } // namespace privateImpl struct Ascii { Array characters; constexpr Ascii() noexcept = default; constexpr explicit Ascii(uint8_t chr) noexcept : characters{{chr}} { } [[nodiscard]] constexpr uint8_t character() const noexcept { return characters[0]; } [[nodiscard]] constexpr char asChar() const noexcept { return static_cast(characters[0]); } static constexpr size_t size() noexcept { return 1; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(characters[0]); } static constexpr bool valid(uint8_t character) noexcept { return static_cast(character & ~ASCII_MASK) == 0; } auto operator<=>(const Ascii& other) const noexcept = default; }; static_assert(std::is_trivial_v); static_assert(std::is_standard_layout_v); /* | B | E | Byte 1 | Byte 2 | Byte 3 | Byte 4 | U+0000 | U+007F | 0xxxxxxx | | | | U+0080 | U+07FF | 110xxxxx | 10xxxxxx | | | U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | | U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx */ struct Utf82Byte { constexpr Utf82Byte(uint8_t first, uint8_t second) noexcept : characters{{first, second}} { } Array characters; static constexpr size_t size() noexcept { return 2; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(first(), second()); } static constexpr bool valid(uint8_t first, uint8_t second) noexcept { using privateImpl::validUtf8MB; return ((first & UTF8_2B_PATTERN_MASK) == UTF8_2B_PATTERN) && validUtf8MB(second); } [[nodiscard]] constexpr uint8_t first() const noexcept { return characters[0]; } [[nodiscard]] constexpr uint8_t second() const noexcept { return characters[1]; } auto operator<=>(const Utf82Byte& other) const noexcept = default; }; struct Utf83Byte { constexpr Utf83Byte(uint8_t first, uint8_t second, uint8_t third) noexcept : characters{{first, second, third}} { } Array characters; static constexpr size_t size() noexcept { return 3; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(first(), second(), third()); } static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third) noexcept { using privateImpl::validUtf8MB; return ((first & UTF8_3B_PATTERN_MASK) == UTF8_3B_PATTERN) && validUtf8MB(second) && validUtf8MB(third); } [[nodiscard]] constexpr uint8_t first() const noexcept { return characters[0]; } [[nodiscard]] constexpr uint8_t second() const noexcept { return characters[1]; } [[nodiscard]] constexpr uint8_t third() const noexcept { return characters[2]; } auto operator<=>(const Utf83Byte& other) const noexcept = default; }; struct Utf84Byte { constexpr Utf84Byte(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept : characters{{first, second, third, fourth}} { } Array characters; static constexpr size_t size() noexcept { return 4; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(first(), second(), third(), fourth()); } static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept { using privateImpl::validUtf8MB; if ((first & UTF8_4B_PATTERN_MASK) != UTF8_4B_PATTERN) { return false; } return validUtf8MB(second) && validUtf8MB(third) && validUtf8MB(fourth); } [[nodiscard]] constexpr uint8_t first() const noexcept { return characters[0]; } [[nodiscard]] constexpr uint8_t second() const noexcept { return characters[1]; } [[nodiscard]] constexpr uint8_t third() const noexcept { return characters[2]; } [[nodiscard]] constexpr uint8_t fourth() const noexcept { return characters[3]; } auto operator<=>(const Utf84Byte& other) const noexcept = default; }; using Utf8Variant = std::variant; constexpr auto Utf8TypeSet{UniqueArray{}}; enum class Utf8Type : uint8_t { Ascii, Utf82Byte, Utf83Byte, Utf84Byte, }; static_assert(Utf8TypeSet.m_values[0] == static_cast(Utf8Type::Ascii)); static_assert(Utf8TypeSet.m_values[1] == static_cast(Utf8Type::Utf82Byte)); static_assert(Utf8TypeSet.m_values[2] == static_cast(Utf8Type::Utf83Byte)); static_assert(Utf8TypeSet.m_values[3] == static_cast(Utf8Type::Utf84Byte)); class String; class StringView; struct FudUtf8 { Utf8Variant m_variant{Utf8Variant{Ascii{}}}; static constexpr Ascii invalidAsciiCode{Ascii{0xFF}}; static FudUtf8 fromString(const String& fudString, size_t index) noexcept; static FudUtf8 fromStringView(StringView&& fudView, size_t index) noexcept; static FudUtf8 fromStringView(const StringView& fudView, size_t index) noexcept; static constexpr FudUtf8 makeUtf8(Array& data) { FudUtf8 unicode{}; if (Ascii::valid(data[0])) { unicode.m_variant = Ascii{data[0]}; } else if (Utf82Byte::valid(data[0], data[1])) { unicode.m_variant = Utf82Byte{data[0], data[1]}; } else if (Utf83Byte::valid(data[0], data[1], data[2])) { unicode.m_variant = Utf83Byte{data[0], data[1], data[2]}; } else if (Utf84Byte::valid(data[0], data[1], data[2], data[3])) { unicode.m_variant = Utf84Byte{data[0], data[1], data[2], data[3]}; } else { unicode.m_variant = invalidAsciiCode; } return unicode; } static constexpr FudUtf8 makeUtf8(const Ascii& utf8Char) { FudUtf8 unicode{{Utf8Variant{Ascii{}}}}; if (utf8Char.valid()) { unicode.m_variant = utf8Char; } else { unicode.m_variant = invalidAsciiCode; } return unicode; } static constexpr FudUtf8 invalidAscii() { FudUtf8 utf8{}; utf8.m_variant = Ascii{invalidAsciiCode}; return utf8; } [[nodiscard]] constexpr Utf8Type getType() const { return static_cast(m_variant.index()); } [[nodiscard]] constexpr bool isAscii() const { return getType() == Utf8Type::Ascii; } [[nodiscard]] constexpr bool valid() const noexcept { switch (m_variant.index()) { case static_cast(Utf8Type::Ascii): return std::get(m_variant).valid(); case static_cast(Utf8Type::Utf82Byte): return std::get(m_variant).valid(); case static_cast(Utf8Type::Utf83Byte): return std::get(m_variant).valid(); case static_cast(Utf8Type::Utf84Byte): return std::get(m_variant).valid(); default: // unlikely return false; } } [[nodiscard]] constexpr size_t size() const noexcept { if (!valid()) { return 0; } switch (m_variant.index()) { case static_cast(Utf8Type::Ascii): return Ascii::size(); case static_cast(Utf8Type::Utf82Byte): return Utf82Byte::size(); case static_cast(Utf8Type::Utf83Byte): return Utf83Byte::size(); case static_cast(Utf8Type::Utf84Byte): return Utf84Byte::size(); default: // unlikely return 0; } } [[nodiscard]] constexpr const uint8_t* data() const noexcept { if (!valid()) { return nullptr; } switch (m_variant.index()) { case static_cast(Utf8Type::Ascii): return std::get(m_variant).characters.data(); case static_cast(Utf8Type::Utf82Byte): return std::get(m_variant).characters.data(); case static_cast(Utf8Type::Utf83Byte): return std::get(m_variant).characters.data(); case static_cast(Utf8Type::Utf84Byte): return std::get(m_variant).characters.data(); default: // unlikely return nullptr; } } template [[nodiscard]] bool transformAscii(Func&& transform) { if (isAscii()) { std::forward(transform)(std::get(m_variant)); return true; } return false; } [[nodiscard]] constexpr int64_t hash() const noexcept { using fud::Utf8Type; using fud::Utf82Byte; using fud::Utf83Byte; using fud::Utf84Byte; if (!valid()) { return -1; } constexpr uint8_t OneByteShift = 8; constexpr uint8_t TwoByteShift = 2 * OneByteShift; constexpr uint8_t ThreeByteShift = 3 * OneByteShift; switch (static_cast(m_variant.index())) { case Utf8Type::Ascii: return std::get(m_variant).characters[0]; case Utf8Type::Utf82Byte: return static_cast(std::get(m_variant).characters[0]) << OneByteShift | static_cast(std::get(m_variant).characters[1]); case Utf8Type::Utf83Byte: return static_cast(std::get(m_variant).characters[0]) << TwoByteShift | static_cast(std::get(m_variant).characters[1]) << OneByteShift | static_cast(std::get(m_variant).characters[2]); case Utf8Type::Utf84Byte: return static_cast(std::get(m_variant).characters[0]) << ThreeByteShift | static_cast(std::get(m_variant).characters[1]) << TwoByteShift | static_cast(std::get(m_variant).characters[2]) << OneByteShift | static_cast(std::get(m_variant).characters[3]); default: // unlikely return -1; } } constexpr bool operator==(const FudUtf8& other) const noexcept = default; constexpr auto operator<=>(const FudUtf8& other) const noexcept { auto hasSameAlternative = [](const FudUtf8& lhs, const FudUtf8& rhs) noexcept { return std::holds_alternative(lhs.m_variant) && std::holds_alternative(rhs.m_variant); }; auto getSameAlternative = [](const FudUtf8& lhs, const FudUtf8& rhs) noexcept { return std::get(lhs.m_variant).operator<=>(std::get(rhs.m_variant)); }; if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (std::holds_alternative(m_variant)) { return std::strong_ordering::less; } if (std::holds_alternative(other.m_variant)) { return std::strong_ordering::greater; } if (std::holds_alternative(m_variant)) { return std::strong_ordering::less; } if (std::holds_alternative(other.m_variant)) { return std::strong_ordering::greater; } if (std::holds_alternative(m_variant)) { return std::strong_ordering::less; } return std::strong_ordering::greater; } std::optional getAscii() const { if (m_variant.index() == static_cast(Utf8Type::Ascii)) { return std::get(m_variant); } return std::nullopt; } }; /** \brief Checks if a character is ascii. */ bool char_is_ascii(char character); FudStatus utf8_is_ascii(FudUtf8& character, bool& isAscii); /** \brief Checks if a character is alphanumeric. */ bool char_is_alphanumeric(char character); /** \brief Checks if a character is alphanumeric. */ FudStatus utf8_is_alphanumeric(FudUtf8* character, bool* pred); /** \brief Checks if a character is alphabetic. */ bool char_is_alpha(char character); /** \brief Checks if a character is alphabetic. */ FudStatus utf8_is_alpha(FudUtf8* character, bool* pred); /** \brief Checks if a character is lowercase. */ bool char_is_lowercase(char character); /** \brief Checks if a character is lowercase. */ FudStatus utf8_is_lowercase(FudUtf8* character, bool* pred); /** \brief Checks if a character is an uppercase character. */ bool char_is_uppercase(char character); /** \brief Checks if a character is uppercase. */ FudStatus utf8_is_uppercase(FudUtf8* character, bool* pred); /** \brief Checks if a character is a digit. */ bool char_is_digit(char character); /** \brief Checks if a character is a digit. */ FudStatus utf8_is_digit(FudUtf8* character, bool* pred); /** \brief Checks if a character is a hexadecimal character. */ bool char_is_hex_digit(char character); /** \brief Checks if a character is a hexadecimal digit. */ FudStatus utf8_is_hex_digit(FudUtf8* character, bool* pred); /** \brief Checks if a character is a control character. */ bool char_is_control(char character); /** \brief Checks if a character is a control character. */ FudStatus utf8_is_control(FudUtf8* character, bool* pred); /** \brief Checks if a character is a graphical character. */ bool char_is_graphical(char character); /** \brief Checks if a character is a graphical character. */ FudStatus utf8_is_graphical(FudUtf8* character, bool* pred); /** \brief Checks if a character is a space character. */ bool char_is_space(char character); /** \brief Checks if a character is a space character. */ FudStatus utf8_is_space(FudUtf8* character, bool* pred); /** \brief Checks if a character is a blank character. */ bool char_is_blank(char character); /** \brief Checks if a character is a blank character. */ FudStatus utf8_is_blank(FudUtf8* character, bool* pred); /** \brief Checks if a character is a printable character. */ bool char_is_printable(char character); /** \brief Checks if a character is a printable character. */ FudStatus utf8_is_printable(FudUtf8* character, bool* pred); /** \brief Checks if a character is a punctuation character. */ bool char_is_punctuation(char character); /** \brief Checks if a character is a punctuation character. */ FudStatus utf8_is_punctuation(FudUtf8* character, bool* pred); uint8_t char_to_lower(uint8_t character); FudUtf8* utf8_to_lower(FudUtf8* character); uint8_t char_to_upper(uint8_t character); FudUtf8* utf8_to_upper(FudUtf8* character); } // namespace fud #endif