/* * libfud * Copyright 2024 Dominick Allen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef FUD_UTF8_HPP #define FUD_UTF8_HPP #include "fud_array.hpp" #include "fud_c_string.hpp" #include "fud_option.hpp" #include "fud_unique_array.hpp" #include #include #include namespace fud { using utf8 = char8_t; class String; struct StringView; constexpr utf8 ASCII_MASK = 0x7F; constexpr utf8 UTF8_MB_PATTERN_MASK = 0xC0; constexpr utf8 UTF8_MB_PATTERN = 0x80; constexpr utf8 UTF8_MB_MASK = static_cast(~UTF8_MB_PATTERN_MASK); constexpr utf8 UTF8_2B_PATTERN_MASK = 0xE0; constexpr utf8 UTF8_2B_PATTERN = 0xC0; constexpr utf8 UTF8_2B_MASK = static_cast(~UTF8_2B_PATTERN_MASK); constexpr utf8 UTF8_3B_PATTERN_MASK = 0xF0; constexpr utf8 UTF8_3B_PATTERN = 0xE0; constexpr utf8 UTF8_3B_MASK = static_cast(~UTF8_3B_PATTERN_MASK); constexpr utf8 UTF8_4B_PATTERN_MASK = 0xF8; constexpr utf8 UTF8_4B_PATTERN = 0xF0; constexpr utf8 UTF8_4B_MASK = static_cast(~UTF8_4B_PATTERN_MASK); namespace privateImpl { constexpr bool validUtf8MB(utf8 code) noexcept { return (code & UTF8_MB_PATTERN_MASK) == UTF8_MB_PATTERN; } } // namespace privateImpl struct Ascii { Array characters; constexpr Ascii() noexcept = default; // cppcheck-suppress uninitMemberVar constexpr explicit Ascii(utf8 chr) noexcept : characters{{chr}} { } [[nodiscard]] constexpr utf8 character() const noexcept { return characters[0]; } [[nodiscard]] constexpr char asChar() const noexcept { return static_cast(characters[0]); } static constexpr size_t size() noexcept { return 1; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(characters[0]); } static constexpr bool valid(utf8 character) noexcept { return static_cast(character & ~ASCII_MASK) == 0; } auto operator<=>(const Ascii& other) const noexcept = default; }; static_assert(std::is_trivial_v); static_assert(std::is_standard_layout_v); /* | B | E | Byte 1 | Byte 2 | Byte 3 | Byte 4 | U+0000 | U+007F | 0xxxxxxx | | | | U+0080 | U+07FF | 110xxxxx | 10xxxxxx | | | U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | | U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx */ struct Utf82Byte { constexpr Utf82Byte(utf8 first, utf8 second) noexcept : characters{{first, second}} { } __attribute__((nonnull)) constexpr Utf82Byte(const char* letterStr) noexcept : characters{} { auto length = cStringLength(letterStr, 2); if (length < 2) { return; } characters[0] = static_cast(letterStr[0]); characters[1] = static_cast(letterStr[1]); } Array characters; static constexpr size_t size() noexcept { return 2; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(first(), second()); } static constexpr bool valid(utf8 first, utf8 second) noexcept { using privateImpl::validUtf8MB; return ((first & UTF8_2B_PATTERN_MASK) == UTF8_2B_PATTERN) && validUtf8MB(second); } [[nodiscard]] constexpr utf8 first() const noexcept { return characters[0]; } [[nodiscard]] constexpr utf8 second() const noexcept { return characters[1]; } auto operator<=>(const Utf82Byte& other) const noexcept = default; }; struct Utf83Byte { constexpr Utf83Byte(utf8 first, utf8 second, utf8 third) noexcept : characters{{first, second, third}} { } __attribute__((nonnull)) constexpr Utf83Byte(const char* letterStr) noexcept : characters{} { auto length = cStringLength(letterStr, 3); if (length < 3) { return; } characters[0] = static_cast(letterStr[0]); characters[1] = static_cast(letterStr[1]); characters[2] = static_cast(letterStr[2]); } Array characters; static constexpr size_t size() noexcept { return 3; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(first(), second(), third()); } static constexpr bool valid(utf8 first, utf8 second, utf8 third) noexcept { using privateImpl::validUtf8MB; return ((first & UTF8_3B_PATTERN_MASK) == UTF8_3B_PATTERN) && validUtf8MB(second) && validUtf8MB(third); } [[nodiscard]] constexpr utf8 first() const noexcept { return characters[0]; } [[nodiscard]] constexpr utf8 second() const noexcept { return characters[1]; } [[nodiscard]] constexpr utf8 third() const noexcept { return characters[2]; } auto operator<=>(const Utf83Byte& other) const noexcept = default; }; struct Utf84Byte { constexpr Utf84Byte(utf8 first, utf8 second, utf8 third, utf8 fourth) noexcept : characters{{first, second, third, fourth}} { } __attribute__((nonnull)) constexpr Utf84Byte(const char* letterStr) noexcept : characters{} { auto length = cStringLength(letterStr, 4); if (length < 4) { return; } characters[0] = static_cast(letterStr[0]); characters[1] = static_cast(letterStr[1]); characters[2] = static_cast(letterStr[2]); characters[3] = static_cast(letterStr[3]); } Array characters; static constexpr size_t size() noexcept { return 4; } [[nodiscard]] constexpr bool valid() const noexcept { return valid(first(), second(), third(), fourth()); } static constexpr bool valid(utf8 first, utf8 second, utf8 third, utf8 fourth) noexcept { using privateImpl::validUtf8MB; if ((first & UTF8_4B_PATTERN_MASK) != UTF8_4B_PATTERN) { return false; } return validUtf8MB(second) && validUtf8MB(third) && validUtf8MB(fourth); } [[nodiscard]] constexpr utf8 first() const noexcept { return characters[0]; } [[nodiscard]] constexpr utf8 second() const noexcept { return characters[1]; } [[nodiscard]] constexpr utf8 third() const noexcept { return characters[2]; } [[nodiscard]] constexpr utf8 fourth() const noexcept { return characters[3]; } auto operator<=>(const Utf84Byte& other) const noexcept = default; }; using Utf8Variant = std::variant; constexpr auto Utf8TypeSet{UniqueArray{}}; enum class Utf8Type : uint8_t { Ascii, Utf82Byte, Utf83Byte, Utf84Byte, }; static_assert(Utf8TypeSet.m_values[0] == static_cast(Utf8Type::Ascii)); static_assert(Utf8TypeSet.m_values[1] == static_cast(Utf8Type::Utf82Byte)); static_assert(Utf8TypeSet.m_values[2] == static_cast(Utf8Type::Utf83Byte)); static_assert(Utf8TypeSet.m_values[3] == static_cast(Utf8Type::Utf84Byte)); struct Utf8 { Utf8Variant m_variant{Utf8Variant{Ascii{}}}; static constexpr Ascii invalidAsciiCode{Ascii{0xFF}}; static Utf8 from(const String& fudString, size_t index) noexcept; static Utf8 from(StringView view, size_t index) noexcept; static constexpr Utf8 make(const Array& data) { Utf8 unicode{}; if (Ascii::valid(data[0])) { unicode.m_variant = Ascii{data[0]}; } else if (Utf82Byte::valid(data[0], data[1])) { unicode.m_variant = Utf82Byte{data[0], data[1]}; } else if (Utf83Byte::valid(data[0], data[1], data[2])) { unicode.m_variant = Utf83Byte{data[0], data[1], data[2]}; } else if (Utf84Byte::valid(data[0], data[1], data[2], data[3])) { unicode.m_variant = Utf84Byte{data[0], data[1], data[2], data[3]}; } else { unicode.m_variant = invalidAsciiCode; } return unicode; } static constexpr Utf8 make(utf8 utf8Char) { return make(Ascii{utf8Char}); } static constexpr Utf8 make(Ascii utf8Char) { Utf8 unicode{{Utf8Variant{Ascii{}}}}; if (utf8Char.valid()) { unicode.m_variant = utf8Char; } else { unicode.m_variant = invalidAsciiCode; } return unicode; } static constexpr Utf8 make(Utf8Variant utf8Variant) { Utf8 unicode{}; unicode.m_variant = utf8Variant; if (!std::visit([](auto arg) { return arg.valid(); }, utf8Variant)) { unicode.m_variant = invalidAsciiCode; } return unicode; } static constexpr Utf8 invalidAscii() { Utf8 character{}; character.m_variant = Ascii{invalidAsciiCode}; return character; } [[nodiscard]] constexpr Utf8Type getType() const { return static_cast(m_variant.index()); } [[nodiscard]] constexpr bool isAscii() const { return getType() == Utf8Type::Ascii; } [[nodiscard]] constexpr bool valid() const noexcept { switch (m_variant.index()) { case static_cast(Utf8Type::Ascii): return std::get(m_variant).valid(); case static_cast(Utf8Type::Utf82Byte): return std::get(m_variant).valid(); case static_cast(Utf8Type::Utf83Byte): return std::get(m_variant).valid(); case static_cast(Utf8Type::Utf84Byte): return std::get(m_variant).valid(); default: // unlikely return false; } } [[nodiscard]] constexpr size_t size() const noexcept { if (!valid()) { return 0; } switch (m_variant.index()) { case static_cast(Utf8Type::Ascii): return Ascii::size(); case static_cast(Utf8Type::Utf82Byte): return Utf82Byte::size(); case static_cast(Utf8Type::Utf83Byte): return Utf83Byte::size(); case static_cast(Utf8Type::Utf84Byte): return Utf84Byte::size(); default: // unlikely return 0; } } [[nodiscard]] constexpr const utf8* data() const noexcept { if (!valid()) { return nullptr; } switch (m_variant.index()) { case static_cast(Utf8Type::Ascii): return std::get(m_variant).characters.data(); case static_cast(Utf8Type::Utf82Byte): return std::get(m_variant).characters.data(); case static_cast(Utf8Type::Utf83Byte): return std::get(m_variant).characters.data(); case static_cast(Utf8Type::Utf84Byte): return std::get(m_variant).characters.data(); default: // unlikely return nullptr; } } template [[nodiscard]] bool transformAscii(Func&& transform) { if (isAscii()) { std::forward(transform)(std::get(m_variant)); return true; } return false; } [[nodiscard]] constexpr int64_t hash() const noexcept { using fud::Utf82Byte; using fud::Utf83Byte; using fud::Utf84Byte; using fud::Utf8Type; if (!valid()) { return -1; } constexpr uint8_t OneByteShift = 8; constexpr uint8_t TwoByteShift = 2 * OneByteShift; constexpr uint8_t ThreeByteShift = 3 * OneByteShift; switch (static_cast(m_variant.index())) { case Utf8Type::Ascii: return std::get(m_variant).characters[0]; case Utf8Type::Utf82Byte: return static_cast(std::get(m_variant).characters[0]) << OneByteShift | static_cast(std::get(m_variant).characters[1]); case Utf8Type::Utf83Byte: return static_cast(std::get(m_variant).characters[0]) << TwoByteShift | static_cast(std::get(m_variant).characters[1]) << OneByteShift | static_cast(std::get(m_variant).characters[2]); case Utf8Type::Utf84Byte: return static_cast(std::get(m_variant).characters[0]) << ThreeByteShift | static_cast(std::get(m_variant).characters[1]) << TwoByteShift | static_cast(std::get(m_variant).characters[2]) << OneByteShift | static_cast(std::get(m_variant).characters[3]); default: // unlikely return -1; } } constexpr bool operator==(const Utf8& other) const noexcept = default; constexpr auto operator<=>(const Utf8& other) const noexcept { auto hasSameAlternative = [](const Utf8& lhs, const Utf8& rhs) noexcept { return std::holds_alternative(lhs.m_variant) && std::holds_alternative(rhs.m_variant); }; auto getSameAlternative = [](const Utf8& lhs, const Utf8& rhs) noexcept { return std::get(lhs.m_variant).operator<=>(std::get(rhs.m_variant)); }; if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (hasSameAlternative.template operator()(*this, other)) { return getSameAlternative.template operator()(*this, other); } if (std::holds_alternative(m_variant)) { return std::strong_ordering::less; } if (std::holds_alternative(other.m_variant)) { return std::strong_ordering::greater; } if (std::holds_alternative(m_variant)) { return std::strong_ordering::less; } if (std::holds_alternative(other.m_variant)) { return std::strong_ordering::greater; } if (std::holds_alternative(m_variant)) { return std::strong_ordering::less; } return std::strong_ordering::greater; } Option getAscii() const { if (m_variant.index() == static_cast(Utf8Type::Ascii)) { return std::get(m_variant); } return NullOpt; } }; namespace classify { using CharPredicate = bool (*)(char); using Utf8Predicate = bool (*)(utf8); using FudUtf8Predicate = bool (*)(Utf8); /** \brief Checks if a character is ascii. */ [[nodiscard]] bool isAscii(char character); [[nodiscard]] bool isAscii(utf8 character); [[nodiscard]] bool isAscii(Utf8 character); /** \brief Checks if a character is alphanumeric. */ [[nodiscard]] bool isAlphanumeric(char character); /** \brief Checks if a character is alphanumeric. */ [[nodiscard]] bool isAlphanumeric(utf8 character); /** \brief Checks if a character is alphanumeric. */ [[nodiscard]] bool isAlphanumeric(Utf8 character); /** \brief Checks if a character is alphabetic. */ [[nodiscard]] bool isAlpha(char character); /** \brief Checks if a character is alphabetic. */ [[nodiscard]] bool isAlpha(utf8 character); /** \brief Checks if a character is alphabetic. */ [[nodiscard]] bool isAlpha(Utf8 character); /** \brief Checks if a character is lowercase. */ [[nodiscard]] bool isLowercase(char character); /** \brief Checks if a character is lowercase. */ [[nodiscard]] bool isLowercase(utf8 character); /** \brief Checks if a character is lowercase. */ [[nodiscard]] bool isLowercase(Utf8 character); /** \brief Checks if a character is uppercase. */ [[nodiscard]] bool isUppercase(char character); /** \brief Checks if a character is uppercase. */ [[nodiscard]] bool isUppercase(utf8 character); /** \brief Checks if a character is uppercase. */ [[nodiscard]] bool isUppercase(Utf8 character); /** \brief Checks if a character is a digit. */ [[nodiscard]] bool isDigit(char character); /** \brief Checks if a character is a digit. */ [[nodiscard]] bool isDigit(utf8 character); /** \brief Checks if a character is a digit. */ [[nodiscard]] bool isDigit(Utf8 character); /** \brief Checks if a character is a hexadecimal character. */ [[nodiscard]] bool isHexDigit(char character); /** \brief Checks if a character is a hexadecimal character. */ [[nodiscard]] bool isHexDigit(utf8 character); /** \brief Checks if a character is a hexadecimal digit. */ [[nodiscard]] bool isHexDigit(Utf8 character); /** \brief Checks if a character is a control character. */ [[nodiscard]] bool isControl(char character); /** \brief Checks if a character is a control character. */ [[nodiscard]] bool isControl(utf8 character); /** \brief Checks if a character is a control character. */ [[nodiscard]] bool isControl(Utf8 character); /** \brief Checks if a character is a graphical character. */ [[nodiscard]] bool isGraphical(char character); /** \brief Checks if a character is a graphical character. */ [[nodiscard]] bool isGraphical(utf8 character); /** \brief Checks if a character is a graphical character. */ [[nodiscard]] bool isGraphical(Utf8 character); /** \brief Checks if a character is a space character. */ [[nodiscard]] bool isSpace(char character); /** \brief Checks if a character is a space character. */ [[nodiscard]] bool isSpace(utf8 character); /** \brief Checks if a character is a space character. */ [[nodiscard]] bool isSpace(Utf8 character); /** \brief Checks if a character is a blank character. */ [[nodiscard]] bool isBlank(char character); /** \brief Checks if a character is a blank character. */ [[nodiscard]] bool isBlank(utf8 character); /** \brief Checks if a character is a blank character. */ [[nodiscard]] bool isBlank(Utf8 character); /** \brief Checks if a character is a printable character. */ [[nodiscard]] bool isPrintable(char character); /** \brief Checks if a character is a printable character. */ [[nodiscard]] bool isPrintable(utf8 character); /** \brief Checks if a character is a printable character. */ [[nodiscard]] bool isPrintable(Utf8 character); /** \brief Checks if a character is a punctuation character. */ [[nodiscard]] bool isPunctuation(char character); /** \brief Checks if a character is a punctuation character. */ [[nodiscard]] bool isPunctuation(utf8 character); /** \brief Checks if a character is a punctuation character. */ [[nodiscard]] bool isPunctuation(Utf8 character); } // namespace classify /** \brief Converts character to lowercase if valid. */ uint8_t charToLower(uint8_t character); /** \brief Converts character to lowercase if valid. */ Utf8 utf8ToLower(Utf8 character); /** \brief Converts character to uppercase if valid. */ uint8_t charToUpper(uint8_t character); /** \brief Converts character to uppercase if valid. */ Utf8 utf8ToUpper(Utf8 character); } // namespace fud #endif