diff options
Diffstat (limited to 'include/fud_utf8.hpp')
-rw-r--r-- | include/fud_utf8.hpp | 552 |
1 files changed, 552 insertions, 0 deletions
diff --git a/include/fud_utf8.hpp b/include/fud_utf8.hpp new file mode 100644 index 0000000..da1a5fe --- /dev/null +++ b/include/fud_utf8.hpp @@ -0,0 +1,552 @@ +/* + * libfud + * Copyright 2024 Dominick Allen + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FUD_UTF8_HPP +#define FUD_UTF8_HPP + +#include "fud_array.hpp" +#include "fud_memory.hpp" +#include "fud_status.hpp" +#include "fud_unique_array.hpp" + +#include <cstdint> +#include <optional> +#include <type_traits> + +namespace fud { + +using utf8 = unsigned char; + +struct StringView; + +constexpr uint8_t ASCII_MASK = 0x7F; + +constexpr uint8_t UTF8_MB_PATTERN_MASK = 0xC0; +constexpr uint8_t UTF8_MB_PATTERN = 0x80; +constexpr uint8_t UTF8_MB_MASK = static_cast<uint8_t>(~UTF8_MB_PATTERN_MASK); + +constexpr uint8_t UTF8_2B_PATTERN_MASK = 0xE0; +constexpr uint8_t UTF8_2B_PATTERN = 0xC0; +constexpr uint8_t UTF8_2B_MASK = static_cast<uint8_t>(~UTF8_2B_PATTERN_MASK); + +constexpr uint8_t UTF8_3B_PATTERN_MASK = 0xF0; +constexpr uint8_t UTF8_3B_PATTERN = 0xE0; +constexpr uint8_t UTF8_3B_MASK = static_cast<uint8_t>(~UTF8_3B_PATTERN_MASK); + +constexpr uint8_t UTF8_4B_PATTERN_MASK = 0xF8; +constexpr uint8_t UTF8_4B_PATTERN = 0xF0; +constexpr uint8_t UTF8_4B_MASK = static_cast<uint8_t>(~UTF8_4B_PATTERN_MASK); + +namespace privateImpl { +constexpr bool validUtf8MB(uint8_t code) noexcept +{ + return (code & UTF8_MB_PATTERN_MASK) == UTF8_MB_PATTERN; +} +} // namespace privateImpl + +struct Ascii { + Array<uint8_t, 1> characters; + + constexpr Ascii() noexcept = default; + + constexpr explicit Ascii(uint8_t chr) noexcept : characters{{chr}} + { + } + + [[nodiscard]] constexpr uint8_t character() const noexcept + { + return characters[0]; + } + + [[nodiscard]] constexpr char asChar() const noexcept + { + return static_cast<char>(characters[0]); + } + + static constexpr size_t size() noexcept + { + return 1; + } + + [[nodiscard]] constexpr bool valid() const noexcept + { + return valid(characters[0]); + } + + static constexpr bool valid(uint8_t character) noexcept + { + return static_cast<uint8_t>(character & ~ASCII_MASK) == 0; + } + + auto operator<=>(const Ascii& other) const noexcept = default; +}; + +static_assert(std::is_trivial_v<Ascii>); +static_assert(std::is_standard_layout_v<Ascii>); + +/* +| B | E | Byte 1 | Byte 2 | Byte 3 | Byte 4 +| U+0000 | U+007F | 0xxxxxxx | | | +| U+0080 | U+07FF | 110xxxxx | 10xxxxxx | | +| U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | +| U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx +*/ + +struct Utf82Byte { + constexpr Utf82Byte(uint8_t first, uint8_t second) noexcept : characters{{first, second}} + { + } + Array<uint8_t, 2> characters; + static constexpr size_t size() noexcept + { + return 2; + } + + [[nodiscard]] constexpr bool valid() const noexcept + { + return valid(first(), second()); + } + + static constexpr bool valid(uint8_t first, uint8_t second) noexcept + { + using privateImpl::validUtf8MB; + return ((first & UTF8_2B_PATTERN_MASK) == UTF8_2B_PATTERN) && validUtf8MB(second); + } + + [[nodiscard]] constexpr uint8_t first() const noexcept + { + return characters[0]; + } + + [[nodiscard]] constexpr uint8_t second() const noexcept + { + return characters[1]; + } + + auto operator<=>(const Utf82Byte& other) const noexcept = default; +}; + +struct Utf83Byte { + constexpr Utf83Byte(uint8_t first, uint8_t second, uint8_t third) noexcept : characters{{first, second, third}} + { + } + + Array<uint8_t, 3> characters; + + static constexpr size_t size() noexcept + { + return 3; + } + + [[nodiscard]] constexpr bool valid() const noexcept + { + return valid(first(), second(), third()); + } + + static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third) noexcept + { + using privateImpl::validUtf8MB; + return ((first & UTF8_3B_PATTERN_MASK) == UTF8_3B_PATTERN) && validUtf8MB(second) && validUtf8MB(third); + } + + [[nodiscard]] constexpr uint8_t first() const noexcept + { + return characters[0]; + } + + [[nodiscard]] constexpr uint8_t second() const noexcept + { + return characters[1]; + } + + [[nodiscard]] constexpr uint8_t third() const noexcept + { + return characters[2]; + } + + auto operator<=>(const Utf83Byte& other) const noexcept = default; +}; + +struct Utf84Byte { + constexpr Utf84Byte(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept : + characters{{first, second, third, fourth}} + { + } + + Array<uint8_t, 4> characters; + + static constexpr size_t size() noexcept + { + return 4; + } + + [[nodiscard]] constexpr bool valid() const noexcept + { + return valid(first(), second(), third(), fourth()); + } + + static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept + { + using privateImpl::validUtf8MB; + if ((first & UTF8_4B_PATTERN_MASK) != UTF8_4B_PATTERN) { + return false; + } + return validUtf8MB(second) && validUtf8MB(third) && validUtf8MB(fourth); + } + + [[nodiscard]] constexpr uint8_t first() const noexcept + { + return characters[0]; + } + + [[nodiscard]] constexpr uint8_t second() const noexcept + { + return characters[1]; + } + + [[nodiscard]] constexpr uint8_t third() const noexcept + { + return characters[2]; + } + + [[nodiscard]] constexpr uint8_t fourth() const noexcept + { + return characters[3]; + } + + auto operator<=>(const Utf84Byte& other) const noexcept = default; +}; + +using Utf8Variant = std::variant<Ascii, Utf82Byte, Utf83Byte, Utf84Byte>; + +constexpr auto Utf8TypeSet{UniqueArray<size_t, 0, 1, 2, 3>{}}; +enum class Utf8Type : uint8_t +{ + Ascii, + Utf82Byte, + Utf83Byte, + Utf84Byte, +}; +static_assert(Utf8TypeSet.m_values[0] == static_cast<uint8_t>(Utf8Type::Ascii)); +static_assert(Utf8TypeSet.m_values[1] == static_cast<uint8_t>(Utf8Type::Utf82Byte)); +static_assert(Utf8TypeSet.m_values[2] == static_cast<uint8_t>(Utf8Type::Utf83Byte)); +static_assert(Utf8TypeSet.m_values[3] == static_cast<uint8_t>(Utf8Type::Utf84Byte)); + +class String; +class StringView; + +struct FudUtf8 { + Utf8Variant m_variant{Utf8Variant{Ascii{}}}; + + static constexpr Ascii invalidAsciiCode{Ascii{0xFF}}; + static FudUtf8 fromString(const String& fudString, size_t index) noexcept; + static FudUtf8 fromStringView(StringView&& fudView, size_t index) noexcept; + static FudUtf8 fromStringView(const StringView& fudView, size_t index) noexcept; + + static constexpr FudUtf8 makeUtf8(Array<utf8, 4>& data) + { + FudUtf8 unicode{}; + if (Ascii::valid(data[0])) { + unicode.m_variant = Ascii{data[0]}; + } else if (Utf82Byte::valid(data[0], data[1])) { + unicode.m_variant = Utf82Byte{data[0], data[1]}; + } else if (Utf83Byte::valid(data[0], data[1], data[2])) { + unicode.m_variant = Utf83Byte{data[0], data[1], data[2]}; + } else if (Utf84Byte::valid(data[0], data[1], data[2], data[3])) { + unicode.m_variant = Utf84Byte{data[0], data[1], data[2], data[3]}; + } else { + unicode.m_variant = invalidAsciiCode; + } + return unicode; + } + + static constexpr FudUtf8 makeUtf8(const Ascii& utf8Char) + { + FudUtf8 unicode{{Utf8Variant{Ascii{}}}}; + if (utf8Char.valid()) { + unicode.m_variant = utf8Char; + } else { + unicode.m_variant = invalidAsciiCode; + } + return unicode; + } + + static constexpr FudUtf8 invalidAscii() + { + FudUtf8 utf8{}; + utf8.m_variant = Ascii{invalidAsciiCode}; + return utf8; + } + + [[nodiscard]] constexpr Utf8Type getType() const + { + return static_cast<Utf8Type>(m_variant.index()); + } + + [[nodiscard]] constexpr bool isAscii() const + { + return getType() == Utf8Type::Ascii; + } + + [[nodiscard]] constexpr bool valid() const noexcept + { + switch (m_variant.index()) { + case static_cast<size_t>(Utf8Type::Ascii): + return std::get<Ascii>(m_variant).valid(); + case static_cast<size_t>(Utf8Type::Utf82Byte): + return std::get<Utf82Byte>(m_variant).valid(); + case static_cast<size_t>(Utf8Type::Utf83Byte): + return std::get<Utf83Byte>(m_variant).valid(); + case static_cast<size_t>(Utf8Type::Utf84Byte): + return std::get<Utf84Byte>(m_variant).valid(); + default: // unlikely + return false; + } + } + + [[nodiscard]] constexpr size_t size() const noexcept + { + if (!valid()) { + return 0; + } + switch (m_variant.index()) { + case static_cast<size_t>(Utf8Type::Ascii): + return Ascii::size(); + case static_cast<size_t>(Utf8Type::Utf82Byte): + return Utf82Byte::size(); + case static_cast<size_t>(Utf8Type::Utf83Byte): + return Utf83Byte::size(); + case static_cast<size_t>(Utf8Type::Utf84Byte): + return Utf84Byte::size(); + default: // unlikely + return 0; + } + } + + [[nodiscard]] constexpr const uint8_t* data() const noexcept + { + if (!valid()) { + return nullptr; + } + + switch (m_variant.index()) { + case static_cast<size_t>(Utf8Type::Ascii): + return std::get<Ascii>(m_variant).characters.data(); + case static_cast<size_t>(Utf8Type::Utf82Byte): + return std::get<Utf82Byte>(m_variant).characters.data(); + case static_cast<size_t>(Utf8Type::Utf83Byte): + return std::get<Utf83Byte>(m_variant).characters.data(); + case static_cast<size_t>(Utf8Type::Utf84Byte): + return std::get<Utf84Byte>(m_variant).characters.data(); + default: // unlikely + return nullptr; + } + } + + template <typename Func> + [[nodiscard]] bool transformAscii(Func&& transform) + { + if (isAscii()) { + std::forward<Func>(transform)(std::get<Ascii>(m_variant)); + return true; + } + return false; + } + + [[nodiscard]] constexpr int64_t hash() const noexcept + { + using fud::Utf8Type; + using fud::Utf82Byte; + using fud::Utf83Byte; + using fud::Utf84Byte; + + if (!valid()) { + return -1; + } + + constexpr uint8_t OneByteShift = 8; + constexpr uint8_t TwoByteShift = 2 * OneByteShift; + constexpr uint8_t ThreeByteShift = 3 * OneByteShift; + + switch (static_cast<Utf8Type>(m_variant.index())) { + case Utf8Type::Ascii: + return std::get<Ascii>(m_variant).characters[0]; + case Utf8Type::Utf82Byte: + return static_cast<int64_t>(std::get<Utf82Byte>(m_variant).characters[0]) << OneByteShift | + static_cast<int64_t>(std::get<Utf82Byte>(m_variant).characters[1]); + case Utf8Type::Utf83Byte: + return static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[0]) << TwoByteShift | + static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[1]) << OneByteShift | + static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[2]); + case Utf8Type::Utf84Byte: + return static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[0]) << ThreeByteShift | + static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[1]) << TwoByteShift | + static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[2]) << OneByteShift | + static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[3]); + default: // unlikely + return -1; + } + } + + constexpr bool operator==(const FudUtf8& other) const noexcept = default; + + constexpr auto operator<=>(const FudUtf8& other) const noexcept + { + auto hasSameAlternative = []<typename T>(const FudUtf8& lhs, const FudUtf8& rhs) noexcept { + return std::holds_alternative<T>(lhs.m_variant) && std::holds_alternative<T>(rhs.m_variant); + }; + + auto getSameAlternative = []<typename T>(const FudUtf8& lhs, const FudUtf8& rhs) noexcept { + return std::get<T>(lhs.m_variant).operator<=>(std::get<T>(rhs.m_variant)); + }; + + if (hasSameAlternative.template operator()<Ascii>(*this, other)) { + return getSameAlternative.template operator()<Ascii>(*this, other); + } + + if (hasSameAlternative.template operator()<Utf82Byte>(*this, other)) { + return getSameAlternative.template operator()<Utf82Byte>(*this, other); + } + + if (hasSameAlternative.template operator()<Utf83Byte>(*this, other)) { + return getSameAlternative.template operator()<Utf83Byte>(*this, other); + } + + if (hasSameAlternative.template operator()<Utf84Byte>(*this, other)) { + return getSameAlternative.template operator()<Utf84Byte>(*this, other); + } + + if (std::holds_alternative<Ascii>(m_variant)) { + return std::strong_ordering::less; + } + + if (std::holds_alternative<Ascii>(other.m_variant)) { + return std::strong_ordering::greater; + } + + if (std::holds_alternative<Utf82Byte>(m_variant)) { + return std::strong_ordering::less; + } + + if (std::holds_alternative<Utf82Byte>(other.m_variant)) { + return std::strong_ordering::greater; + } + + if (std::holds_alternative<Utf83Byte>(m_variant)) { + return std::strong_ordering::less; + } + + return std::strong_ordering::greater; + } + + std::optional<Ascii> getAscii() const + { + if (m_variant.index() == static_cast<size_t>(Utf8Type::Ascii)) { + return std::get<Ascii>(m_variant); + } + return std::nullopt; + } +}; + +/** \brief Checks if a character is ascii. */ +bool char_is_ascii(char character); + +FudStatus utf8_is_ascii(FudUtf8& character, bool& isAscii); + +/** \brief Checks if a character is alphanumeric. */ +bool char_is_alphanumeric(char character); + +/** \brief Checks if a character is alphanumeric. */ +FudStatus utf8_is_alphanumeric(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is alphabetic. */ +bool char_is_alpha(char character); + +/** \brief Checks if a character is alphabetic. */ +FudStatus utf8_is_alpha(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is lowercase. */ +bool char_is_lowercase(char character); + +/** \brief Checks if a character is lowercase. */ +FudStatus utf8_is_lowercase(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is an uppercase character. */ +bool char_is_uppercase(char character); + +/** \brief Checks if a character is uppercase. */ +FudStatus utf8_is_uppercase(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a digit. */ +bool char_is_digit(char character); + +/** \brief Checks if a character is a digit. */ +FudStatus utf8_is_digit(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a hexadecimal character. */ +bool char_is_hex_digit(char character); + +/** \brief Checks if a character is a hexadecimal digit. */ +FudStatus utf8_is_hex_digit(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a control character. */ +bool char_is_control(char character); + +/** \brief Checks if a character is a control character. */ +FudStatus utf8_is_control(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a graphical character. */ +bool char_is_graphical(char character); + +/** \brief Checks if a character is a graphical character. */ +FudStatus utf8_is_graphical(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a space character. */ +bool char_is_space(char character); + +/** \brief Checks if a character is a space character. */ +FudStatus utf8_is_space(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a blank character. */ +bool char_is_blank(char character); + +/** \brief Checks if a character is a blank character. */ +FudStatus utf8_is_blank(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a printable character. */ +bool char_is_printable(char character); + +/** \brief Checks if a character is a printable character. */ +FudStatus utf8_is_printable(FudUtf8* character, bool* pred); + +/** \brief Checks if a character is a punctuation character. */ +bool char_is_punctuation(char character); + +/** \brief Checks if a character is a punctuation character. */ +FudStatus utf8_is_punctuation(FudUtf8* character, bool* pred); + +uint8_t char_to_lower(uint8_t character); + +FudUtf8* utf8_to_lower(FudUtf8* character); + +uint8_t char_to_upper(uint8_t character); + +FudUtf8* utf8_to_upper(FudUtf8* character); + +} // namespace fud + +#endif |