summaryrefslogtreecommitdiff
path: root/include/utf8.hpp
diff options
context:
space:
mode:
authorDominick Allen <djallen@librehumanitas.org>2024-09-22 10:19:15 -0500
committerDominick Allen <djallen@librehumanitas.org>2024-09-22 10:19:15 -0500
commitbf81e34921e3e30b05313efbcf5c9fa839cb7c05 (patch)
treeb56a343e59164bc347232669e8bb808cf3c4f4ef /include/utf8.hpp
Initial commit.
Diffstat (limited to 'include/utf8.hpp')
-rw-r--r--include/utf8.hpp557
1 files changed, 557 insertions, 0 deletions
diff --git a/include/utf8.hpp b/include/utf8.hpp
new file mode 100644
index 0000000..c66d93c
--- /dev/null
+++ b/include/utf8.hpp
@@ -0,0 +1,557 @@
+/*
+ * libfud
+ * Copyright 2024 Dominick Allen
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FUD_UTF8_HPP
+#define FUD_UTF8_HPP
+
+#include "array.hpp"
+#include "memory.hpp"
+#include "status.hpp"
+#include "unique_array.hpp"
+
+/*
+#include "ext_hash.hpp"
+#include "ext_set.hpp"
+*/
+
+#include <compare>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <variant>
+
+namespace fud {
+
+/** \brief Code unit type used for raw UTF-8 data. */
+using utf8 = unsigned char;
+
+// NOTE(review): StringView is forward-declared with the `struct` class-key
+// here and with `class` further down in this header; both should agree with
+// the real definition.
+struct StringView;
+
+// The constants below are `inline` so that every translation unit including
+// this header refers to a single definition of each.
+
+/** \brief Bits that may be set in an ASCII code unit (0xxxxxxx). */
+inline constexpr uint8_t ASCII_MASK = 0x7F;
+
+/** \brief Continuation byte: the top two bits must read 10. */
+inline constexpr uint8_t UTF8_MB_PATTERN_MASK = 0xC0;
+inline constexpr uint8_t UTF8_MB_PATTERN = 0x80;
+/** \brief Payload bits of a continuation byte. */
+inline constexpr uint8_t UTF8_MB_MASK = static_cast<uint8_t>(~UTF8_MB_PATTERN_MASK);
+
+/** \brief Lead byte of a two-byte sequence: the top three bits must read 110. */
+inline constexpr uint8_t UTF8_2B_PATTERN_MASK = 0xE0;
+inline constexpr uint8_t UTF8_2B_PATTERN = 0xC0;
+/** \brief Payload bits of a two-byte lead byte. */
+inline constexpr uint8_t UTF8_2B_MASK = static_cast<uint8_t>(~UTF8_2B_PATTERN_MASK);
+
+/** \brief Lead byte of a three-byte sequence: the top four bits must read 1110. */
+inline constexpr uint8_t UTF8_3B_PATTERN_MASK = 0xF0;
+inline constexpr uint8_t UTF8_3B_PATTERN = 0xE0;
+/** \brief Payload bits of a three-byte lead byte. */
+inline constexpr uint8_t UTF8_3B_MASK = static_cast<uint8_t>(~UTF8_3B_PATTERN_MASK);
+
+/** \brief Lead byte of a four-byte sequence: the top five bits must read 11110. */
+inline constexpr uint8_t UTF8_4B_PATTERN_MASK = 0xF8;
+inline constexpr uint8_t UTF8_4B_PATTERN = 0xF0;
+/** \brief Payload bits of a four-byte lead byte. */
+inline constexpr uint8_t UTF8_4B_MASK = static_cast<uint8_t>(~UTF8_4B_PATTERN_MASK);
+
+namespace privateImpl {
+/** \brief Tells whether \p code has the 10xxxxxx form of a continuation byte. */
+constexpr bool validUtf8MB(uint8_t code) noexcept
+{
+    const auto markerBits = static_cast<uint8_t>(code & UTF8_MB_PATTERN_MASK);
+    return markerBits == UTF8_MB_PATTERN;
+}
+} // namespace privateImpl
+
+/** \brief A single code unit interpreted as a 7-bit ASCII character. */
+struct Ascii {
+    Array<uint8_t, 1> characters;
+
+    constexpr Ascii() noexcept = default;
+
+    constexpr explicit Ascii(uint8_t chr) noexcept : characters{{chr}}
+    {
+    }
+
+    /** \brief Number of code units in an ASCII character: always one. */
+    static constexpr size_t size() noexcept
+    {
+        return 1;
+    }
+
+    /** \brief True when \p character has no bit set outside ASCII_MASK. */
+    static constexpr bool valid(uint8_t character) noexcept
+    {
+        return character <= ASCII_MASK;
+    }
+
+    /** \brief True when the stored code unit is valid ASCII. */
+    [[nodiscard]] constexpr bool valid() const noexcept
+    {
+        return Ascii::valid(character());
+    }
+
+    /** \brief The stored code unit as an unsigned byte. */
+    [[nodiscard]] constexpr uint8_t character() const noexcept
+    {
+        return characters[0];
+    }
+
+    /** \brief The stored code unit converted to `char`. */
+    [[nodiscard]] constexpr char asChar() const noexcept
+    {
+        return static_cast<char>(character());
+    }
+
+    auto operator<=>(const Ascii& other) const noexcept = default;
+};
+
+// Ascii must remain a trivial, standard-layout type.
+static_assert(std::is_trivial_v<Ascii>);
+static_assert(std::is_standard_layout_v<Ascii>);
+
+/*
+UTF-8 byte layout by code point range:
+
+| First    | Last     | Byte 1   | Byte 2   | Byte 3   | Byte 4   |
+| U+0000   | U+007F   | 0xxxxxxx |          |          |          |
+| U+0080   | U+07FF   | 110xxxxx | 10xxxxxx |          |          |
+| U+0800   | U+FFFF   | 1110xxxx | 10xxxxxx | 10xxxxxx |          |
+| U+10000  | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |
+*/
+
+/** \brief A two-byte UTF-8 sequence, covering U+0080 through U+07FF. */
+struct Utf82Byte {
+    constexpr Utf82Byte(uint8_t first, uint8_t second) noexcept : characters{{first, second}}
+    {
+    }
+    Array<uint8_t, 2> characters;
+
+    /** \brief Number of code units in the sequence: always two. */
+    static constexpr size_t size() noexcept
+    {
+        return 2;
+    }
+
+    /** \brief True when the stored bytes form a well-formed two-byte sequence. */
+    [[nodiscard]] constexpr bool valid() const noexcept
+    {
+        return valid(first(), second());
+    }
+
+    /**
+     * \brief Checks that \p first and \p second form a well-formed two-byte sequence.
+     *
+     * The bytes must match 110xxxxx 10xxxxxx and decode to at least U+0080.
+     * Lead bytes 0xC0 and 0xC1 are rejected: they are overlong encodings of
+     * code points that belong in a single byte.
+     */
+    static constexpr bool valid(uint8_t first, uint8_t second) noexcept
+    {
+        using privateImpl::validUtf8MB;
+        const bool patternOk = ((first & UTF8_2B_PATTERN_MASK) == UTF8_2B_PATTERN) && validUtf8MB(second);
+        if (!patternOk) {
+            return false;
+        }
+
+        constexpr uint8_t PayloadBits = 6;
+        constexpr uint32_t SmallestCodePoint = 0x80;
+        const uint32_t codePoint = (static_cast<uint32_t>(first & UTF8_2B_MASK) << PayloadBits) |
+                                   static_cast<uint32_t>(second & UTF8_MB_MASK);
+        return codePoint >= SmallestCodePoint;
+    }
+
+    /** \brief The lead byte. */
+    [[nodiscard]] constexpr uint8_t first() const noexcept
+    {
+        return characters[0];
+    }
+
+    /** \brief The continuation byte. */
+    [[nodiscard]] constexpr uint8_t second() const noexcept
+    {
+        return characters[1];
+    }
+
+    auto operator<=>(const Utf82Byte& other) const noexcept = default;
+};
+
+/** \brief A three-byte UTF-8 sequence, covering U+0800 through U+FFFF. */
+struct Utf83Byte {
+    constexpr Utf83Byte(uint8_t first, uint8_t second, uint8_t third) noexcept : characters{{first, second, third}}
+    {
+    }
+
+    Array<uint8_t, 3> characters;
+
+    /** \brief Number of code units in the sequence: always three. */
+    static constexpr size_t size() noexcept
+    {
+        return 3;
+    }
+
+    /** \brief True when the stored bytes form a well-formed three-byte sequence. */
+    [[nodiscard]] constexpr bool valid() const noexcept
+    {
+        return valid(first(), second(), third());
+    }
+
+    /**
+     * \brief Checks that the three bytes form a well-formed three-byte sequence.
+     *
+     * The bytes must match 1110xxxx 10xxxxxx 10xxxxxx. The decoded value must
+     * be at least U+0800 (smaller values are overlong encodings) and must not
+     * be a UTF-16 surrogate (U+D800..U+DFFF), which UTF-8 does not allow.
+     */
+    static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third) noexcept
+    {
+        using privateImpl::validUtf8MB;
+        const bool patternOk =
+            ((first & UTF8_3B_PATTERN_MASK) == UTF8_3B_PATTERN) && validUtf8MB(second) && validUtf8MB(third);
+        if (!patternOk) {
+            return false;
+        }
+
+        constexpr uint8_t PayloadBits = 6;
+        constexpr uint32_t SmallestCodePoint = 0x0800;
+        constexpr uint32_t FirstSurrogate = 0xD800;
+        constexpr uint32_t LastSurrogate = 0xDFFF;
+
+        const uint32_t codePoint = (static_cast<uint32_t>(first & UTF8_3B_MASK) << (2 * PayloadBits)) |
+                                   (static_cast<uint32_t>(second & UTF8_MB_MASK) << PayloadBits) |
+                                   static_cast<uint32_t>(third & UTF8_MB_MASK);
+        if (codePoint < SmallestCodePoint) {
+            return false;
+        }
+        return codePoint < FirstSurrogate || codePoint > LastSurrogate;
+    }
+
+    /** \brief The lead byte. */
+    [[nodiscard]] constexpr uint8_t first() const noexcept
+    {
+        return characters[0];
+    }
+
+    /** \brief The first continuation byte. */
+    [[nodiscard]] constexpr uint8_t second() const noexcept
+    {
+        return characters[1];
+    }
+
+    /** \brief The second continuation byte. */
+    [[nodiscard]] constexpr uint8_t third() const noexcept
+    {
+        return characters[2];
+    }
+
+    auto operator<=>(const Utf83Byte& other) const noexcept = default;
+};
+
+/** \brief A four-byte UTF-8 sequence, covering U+10000 through U+10FFFF. */
+struct Utf84Byte {
+    constexpr Utf84Byte(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept :
+        characters{{first, second, third, fourth}}
+    {
+    }
+
+    Array<uint8_t, 4> characters;
+
+    /** \brief Number of code units in the sequence: always four. */
+    static constexpr size_t size() noexcept
+    {
+        return 4;
+    }
+
+    /** \brief True when the stored bytes form a well-formed four-byte sequence. */
+    [[nodiscard]] constexpr bool valid() const noexcept
+    {
+        return valid(first(), second(), third(), fourth());
+    }
+
+    /**
+     * \brief Checks that the four bytes form a well-formed four-byte sequence.
+     *
+     * The bytes must match 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. The decoded
+     * value must lie in U+10000..U+10FFFF: smaller values are overlong
+     * encodings and larger values are outside the Unicode code space.
+     */
+    static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept
+    {
+        using privateImpl::validUtf8MB;
+        if ((first & UTF8_4B_PATTERN_MASK) != UTF8_4B_PATTERN) {
+            return false;
+        }
+        if (!(validUtf8MB(second) && validUtf8MB(third) && validUtf8MB(fourth))) {
+            return false;
+        }
+
+        constexpr uint8_t PayloadBits = 6;
+        constexpr uint32_t SmallestCodePoint = 0x10000;
+        constexpr uint32_t LargestCodePoint = 0x10FFFF;
+
+        const uint32_t codePoint = (static_cast<uint32_t>(first & UTF8_4B_MASK) << (3 * PayloadBits)) |
+                                   (static_cast<uint32_t>(second & UTF8_MB_MASK) << (2 * PayloadBits)) |
+                                   (static_cast<uint32_t>(third & UTF8_MB_MASK) << PayloadBits) |
+                                   static_cast<uint32_t>(fourth & UTF8_MB_MASK);
+        return codePoint >= SmallestCodePoint && codePoint <= LargestCodePoint;
+    }
+
+    /** \brief The lead byte. */
+    [[nodiscard]] constexpr uint8_t first() const noexcept
+    {
+        return characters[0];
+    }
+
+    /** \brief The first continuation byte. */
+    [[nodiscard]] constexpr uint8_t second() const noexcept
+    {
+        return characters[1];
+    }
+
+    /** \brief The second continuation byte. */
+    [[nodiscard]] constexpr uint8_t third() const noexcept
+    {
+        return characters[2];
+    }
+
+    /** \brief The third continuation byte. */
+    [[nodiscard]] constexpr uint8_t fourth() const noexcept
+    {
+        return characters[3];
+    }
+
+    auto operator<=>(const Utf84Byte& other) const noexcept = default;
+};
+
+/** \brief Storage for one encoded character: exactly one of the four sequence types. */
+using Utf8Variant = std::variant<Ascii, Utf82Byte, Utf83Byte, Utf84Byte>;
+
+constexpr auto ExtUtf8TypeSet{UniqueArray<size_t, 0, 1, 2, 3>{}};
+
+/** \brief Names the alternative held by a Utf8Variant. */
+enum class ExtUtf8Type : uint8_t
+{
+    Ascii,
+    Utf82Byte,
+    Utf83Byte,
+    Utf84Byte,
+};
+static_assert(ExtUtf8TypeSet.m_values[0] == static_cast<uint8_t>(ExtUtf8Type::Ascii));
+static_assert(ExtUtf8TypeSet.m_values[1] == static_cast<uint8_t>(ExtUtf8Type::Utf82Byte));
+static_assert(ExtUtf8TypeSet.m_values[2] == static_cast<uint8_t>(ExtUtf8Type::Utf83Byte));
+static_assert(ExtUtf8TypeSet.m_values[3] == static_cast<uint8_t>(ExtUtf8Type::Utf84Byte));
+
+// ExtUtf8 converts Utf8Variant::index() straight into an ExtUtf8Type, so each
+// enumerator has to equal the position of its alternative in the variant.
+static_assert(std::variant_size_v<Utf8Variant> == 4);
+static_assert(
+    std::is_same_v<std::variant_alternative_t<static_cast<size_t>(ExtUtf8Type::Ascii), Utf8Variant>, Ascii>);
+static_assert(
+    std::is_same_v<std::variant_alternative_t<static_cast<size_t>(ExtUtf8Type::Utf82Byte), Utf8Variant>, Utf82Byte>);
+static_assert(
+    std::is_same_v<std::variant_alternative_t<static_cast<size_t>(ExtUtf8Type::Utf83Byte), Utf8Variant>, Utf83Byte>);
+static_assert(
+    std::is_same_v<std::variant_alternative_t<static_cast<size_t>(ExtUtf8Type::Utf84Byte), Utf8Variant>, Utf84Byte>);
+
+class String;
+class StringView;
+
+/**
+ * \brief One UTF-8 encoded character, stored as a variant over the four
+ * possible sequence lengths.
+ *
+ * Input that cannot be encoded is represented by an Ascii alternative holding
+ * 0xFF, which fails Ascii validation.
+ */
+struct ExtUtf8 {
+    Utf8Variant m_variant{Utf8Variant{Ascii{}}};
+
+    /** \brief Marker stored when no valid character could be built. */
+    static constexpr Ascii invalidAsciiCode{Ascii{0xFF}};
+
+    // Defined out of line.
+    // NOTE(review): presumably these decode the character found at `index` of
+    // the given string - confirm against the implementation.
+    static ExtUtf8 fromString(const String& fudString, size_t index) noexcept;
+    static ExtUtf8 fromStringView(StringView&& fudView, size_t index) noexcept;
+    static ExtUtf8 fromStringView(const StringView& fudView, size_t index) noexcept;
+
+    /**
+     * \brief Builds a character from the leading bytes of \p data.
+     *
+     * Tries the one-, two-, three- and four-byte forms in that order and keeps
+     * the first one that validates; otherwise the invalid marker is stored.
+     */
+    static constexpr ExtUtf8 makeUtf8(Array<utf8, 4>& data)
+    {
+        ExtUtf8 unicode{};
+        if (Ascii::valid(data[0])) {
+            unicode.m_variant = Ascii{data[0]};
+            return unicode;
+        }
+        if (Utf82Byte::valid(data[0], data[1])) {
+            unicode.m_variant = Utf82Byte{data[0], data[1]};
+            return unicode;
+        }
+        if (Utf83Byte::valid(data[0], data[1], data[2])) {
+            unicode.m_variant = Utf83Byte{data[0], data[1], data[2]};
+            return unicode;
+        }
+        if (Utf84Byte::valid(data[0], data[1], data[2], data[3])) {
+            unicode.m_variant = Utf84Byte{data[0], data[1], data[2], data[3]};
+            return unicode;
+        }
+        unicode.m_variant = invalidAsciiCode;
+        return unicode;
+    }
+
+    /** \brief Wraps \p utf8Char, or stores the invalid marker when it is not valid ASCII. */
+    static constexpr ExtUtf8 makeUtf8(const Ascii& utf8Char)
+    {
+        ExtUtf8 unicode{};
+        unicode.m_variant = utf8Char.valid() ? utf8Char : invalidAsciiCode;
+        return unicode;
+    }
+
+    /** \brief Returns a character holding the invalid marker. */
+    static constexpr ExtUtf8 invalidAscii()
+    {
+        ExtUtf8 marker{};
+        marker.m_variant = invalidAsciiCode;
+        return marker;
+    }
+
+    /** \brief The held alternative, obtained by converting the variant index to ExtUtf8Type. */
+    [[nodiscard]] constexpr ExtUtf8Type getType() const
+    {
+        const auto alternative = m_variant.index();
+        return static_cast<ExtUtf8Type>(alternative);
+    }
+
+    /** \brief True when the Ascii alternative is held (valid or not). */
+    [[nodiscard]] constexpr bool isAscii() const
+    {
+        return std::holds_alternative<Ascii>(m_variant);
+    }
+
+    /** \brief Runs the validity check of whichever alternative is held. */
+    [[nodiscard]] constexpr bool valid() const noexcept
+    {
+        if (const auto* ascii = std::get_if<Ascii>(&m_variant)) {
+            return ascii->valid();
+        }
+        if (const auto* twoByte = std::get_if<Utf82Byte>(&m_variant)) {
+            return twoByte->valid();
+        }
+        if (const auto* threeByte = std::get_if<Utf83Byte>(&m_variant)) {
+            return threeByte->valid();
+        }
+        if (const auto* fourByte = std::get_if<Utf84Byte>(&m_variant)) {
+            return fourByte->valid();
+        }
+        return false; // no alternative held: unlikely
+    }
+
+    /** \brief Number of code units in the held sequence, or 0 when it is not valid. */
+    [[nodiscard]] constexpr size_t size() const noexcept
+    {
+        if (!valid()) {
+            return 0;
+        }
+        if (std::holds_alternative<Ascii>(m_variant)) {
+            return Ascii::size();
+        }
+        if (std::holds_alternative<Utf82Byte>(m_variant)) {
+            return Utf82Byte::size();
+        }
+        if (std::holds_alternative<Utf83Byte>(m_variant)) {
+            return Utf83Byte::size();
+        }
+        if (std::holds_alternative<Utf84Byte>(m_variant)) {
+            return Utf84Byte::size();
+        }
+        return 0; // unlikely
+    }
+
+    /** \brief Pointer to the first code unit of the held sequence, or nullptr when it is not valid. */
+    [[nodiscard]] constexpr const uint8_t* data() const noexcept
+    {
+        if (!valid()) {
+            return nullptr;
+        }
+
+        if (const auto* ascii = std::get_if<Ascii>(&m_variant)) {
+            return ascii->characters.data();
+        }
+        if (const auto* twoByte = std::get_if<Utf82Byte>(&m_variant)) {
+            return twoByte->characters.data();
+        }
+        if (const auto* threeByte = std::get_if<Utf83Byte>(&m_variant)) {
+            return threeByte->characters.data();
+        }
+        if (const auto* fourByte = std::get_if<Utf84Byte>(&m_variant)) {
+            return fourByte->characters.data();
+        }
+        return nullptr; // unlikely
+    }
+
+    /**
+     * \brief Applies \p transform to the held Ascii value, if there is one.
+     * \return true when the Ascii alternative was held and \p transform was
+     * called, false otherwise.
+     */
+    template <typename Func>
+    [[nodiscard]] bool transformAscii(Func&& transform)
+    {
+        auto* ascii = std::get_if<Ascii>(&m_variant);
+        if (ascii == nullptr) {
+            return false;
+        }
+        std::forward<Func>(transform)(*ascii);
+        return true;
+    }
+
+    /**
+     * \brief Packs the code units of the held sequence into one integer, the
+     * first byte ending up in the most significant occupied position.
+     * \return the packed value, or -1 when the character is not valid.
+     */
+    [[nodiscard]] constexpr int64_t hash() const noexcept
+    {
+        if (!valid()) {
+            return -1;
+        }
+
+        constexpr uint8_t OneByteShift = 8;
+        const auto pack = [](const auto& sequence) constexpr noexcept -> int64_t {
+            int64_t packed = 0;
+            for (size_t idx = 0; idx < sequence.size(); ++idx) {
+                packed = (packed << OneByteShift) | static_cast<int64_t>(sequence.characters[idx]);
+            }
+            return packed;
+        };
+
+        if (const auto* ascii = std::get_if<Ascii>(&m_variant)) {
+            return pack(*ascii);
+        }
+        if (const auto* twoByte = std::get_if<Utf82Byte>(&m_variant)) {
+            return pack(*twoByte);
+        }
+        if (const auto* threeByte = std::get_if<Utf83Byte>(&m_variant)) {
+            return pack(*threeByte);
+        }
+        if (const auto* fourByte = std::get_if<Utf84Byte>(&m_variant)) {
+            return pack(*fourByte);
+        }
+        return -1; // unlikely
+    }
+
+    constexpr bool operator==(const ExtUtf8& other) const noexcept = default;
+
+    /**
+     * \brief Orders two characters.
+     *
+     * When both sides hold the same alternative, that alternative's own
+     * comparison decides. Otherwise the side holding the shorter sequence type
+     * orders first (Ascii, then two-, three- and four-byte sequences).
+     */
+    constexpr auto operator<=>(const ExtUtf8& other) const noexcept
+    {
+        const Utf8Variant& lhs = m_variant;
+        const Utf8Variant& rhs = other.m_variant;
+
+        if (lhs.index() == rhs.index()) {
+            if (const auto* ascii = std::get_if<Ascii>(&lhs)) {
+                return ascii->operator<=>(std::get<Ascii>(rhs));
+            }
+            if (const auto* twoByte = std::get_if<Utf82Byte>(&lhs)) {
+                return twoByte->operator<=>(std::get<Utf82Byte>(rhs));
+            }
+            if (const auto* threeByte = std::get_if<Utf83Byte>(&lhs)) {
+                return threeByte->operator<=>(std::get<Utf83Byte>(rhs));
+            }
+            if (const auto* fourByte = std::get_if<Utf84Byte>(&lhs)) {
+                return fourByte->operator<=>(std::get<Utf84Byte>(rhs));
+            }
+        }
+
+        // Different alternatives: walk the sequence types from shortest up;
+        // whichever side holds the current one first decides the result.
+        constexpr auto firstRank = static_cast<size_t>(ExtUtf8Type::Ascii);
+        constexpr auto lastRank = static_cast<size_t>(ExtUtf8Type::Utf83Byte);
+        for (size_t rank = firstRank; rank <= lastRank; ++rank) {
+            if (lhs.index() == rank) {
+                return std::strong_ordering::less;
+            }
+            if (rhs.index() == rank) {
+                return std::strong_ordering::greater;
+            }
+        }
+
+        return std::strong_ordering::greater;
+    }
+
+    /** \brief The held Ascii value, or an empty optional when another alternative is held. */
+    std::optional<Ascii> getAscii() const
+    {
+        const auto* ascii = std::get_if<Ascii>(&m_variant);
+        if (ascii == nullptr) {
+            return std::nullopt;
+        }
+        return *ascii;
+    }
+};
+
+// Character classification and case conversion helpers.
+//
+// Most predicates are declared twice: once taking a plain `char` and returning
+// the answer directly, and once taking an ExtUtf8 and returning a FudStatus
+// with a separate bool parameter.
+// NOTE(review): the bool parameter presumably receives the predicate result and
+// the FudStatus presumably reports argument errors - confirm against the
+// implementation, which is not part of this header.
+// NOTE(review): ext_lib_utf8_is_ascii takes references while every other
+// ExtUtf8 predicate takes pointers.
+
+/** \brief Checks if a character is ascii. */
+bool ext_lib_char_is_ascii(char character);
+
+/** \brief Checks if a UTF-8 character is ascii. */
+FudStatus ext_lib_utf8_is_ascii(ExtUtf8& character, bool& isAscii);
+
+/** \brief Checks if a character is alphanumeric. */
+bool ext_lib_char_is_alphanumeric(char character);
+
+/** \brief Checks if a UTF-8 character is alphanumeric. */
+FudStatus ext_lib_utf8_is_alphanumeric(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is alphabetic. */
+bool ext_lib_char_is_alpha(char character);
+
+/** \brief Checks if a UTF-8 character is alphabetic. */
+FudStatus ext_lib_utf8_is_alpha(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is lowercase. */
+bool ext_lib_char_is_lowercase(char character);
+
+/** \brief Checks if a UTF-8 character is lowercase. */
+FudStatus ext_lib_utf8_is_lowercase(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is an uppercase character. */
+bool ext_lib_char_is_uppercase(char character);
+
+/** \brief Checks if a UTF-8 character is uppercase. */
+FudStatus ext_lib_utf8_is_uppercase(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a digit. */
+bool ext_lib_char_is_digit(char character);
+
+/** \brief Checks if a UTF-8 character is a digit. */
+FudStatus ext_lib_utf8_is_digit(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a hexadecimal character. */
+bool ext_lib_char_is_hex_digit(char character);
+
+/** \brief Checks if a UTF-8 character is a hexadecimal digit. */
+FudStatus ext_lib_utf8_is_hex_digit(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a control character. */
+bool ext_lib_char_is_control(char character);
+
+/** \brief Checks if a UTF-8 character is a control character. */
+FudStatus ext_lib_utf8_is_control(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a graphical character. */
+bool ext_lib_char_is_graphical(char character);
+
+/** \brief Checks if a UTF-8 character is a graphical character. */
+FudStatus ext_lib_utf8_is_graphical(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a space character. */
+bool ext_lib_char_is_space(char character);
+
+/** \brief Checks if a UTF-8 character is a space character. */
+FudStatus ext_lib_utf8_is_space(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a blank character. */
+bool ext_lib_char_is_blank(char character);
+
+/** \brief Checks if a UTF-8 character is a blank character. */
+FudStatus ext_lib_utf8_is_blank(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a printable character. */
+bool ext_lib_char_is_printable(char character);
+
+/** \brief Checks if a UTF-8 character is a printable character. */
+FudStatus ext_lib_utf8_is_printable(ExtUtf8* character, bool* pred);
+
+/** \brief Checks if a character is a punctuation character. */
+bool ext_lib_char_is_punctuation(char character);
+
+/** \brief Checks if a UTF-8 character is a punctuation character. */
+FudStatus ext_lib_utf8_is_punctuation(ExtUtf8* character, bool* pred);
+
+/** \brief Lowercase conversion of a single byte (presumably returns the converted byte - confirm). */
+uint8_t ext_lib_char_to_lower(uint8_t character);
+
+/**
+ * \brief Lowercase conversion of a UTF-8 character.
+ *
+ * NOTE(review): takes and returns a pointer; presumably converts in place and
+ * returns its argument - confirm against the implementation.
+ */
+ExtUtf8* ext_lib_utf8_to_lower(ExtUtf8* character);
+
+/** \brief Uppercase conversion of a single byte (presumably returns the converted byte - confirm). */
+uint8_t ext_lib_char_to_upper(uint8_t character);
+
+/**
+ * \brief Uppercase conversion of a UTF-8 character.
+ *
+ * NOTE(review): takes and returns a pointer; presumably converts in place and
+ * returns its argument - confirm against the implementation.
+ */
+ExtUtf8* ext_lib_utf8_to_upper(ExtUtf8* character);
+
+} // namespace fud
+
+#endif