summaryrefslogtreecommitdiff
path: root/include/utf8.hpp
diff options
context:
space:
mode:
authorDominick Allen <djallen@librehumanitas.org>2024-09-22 12:41:28 -0500
committerDominick Allen <djallen@librehumanitas.org>2024-09-22 12:41:28 -0500
commit7da829d48f9059c83ab9cada2c850621e8bbd3f3 (patch)
tree314e7a5b645e910d4997e3bee980bd2024f3087d /include/utf8.hpp
parentbf81e34921e3e30b05313efbcf5c9fa839cb7c05 (diff)
Basics of library.
Diffstat (limited to 'include/utf8.hpp')
-rw-r--r--include/utf8.hpp557
1 files changed, 0 insertions, 557 deletions
diff --git a/include/utf8.hpp b/include/utf8.hpp
deleted file mode 100644
index c66d93c..0000000
--- a/include/utf8.hpp
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * libfud
- * Copyright 2024 Dominick Allen
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef FUD_UTF8_HPP
-#define FUD_UTF8_HPP
-
-#include "array.hpp"
-#include "memory.hpp"
-#include "status.hpp"
-#include "unique_array.hpp"
-
-/*
-#include "ext_hash.hpp"
-#include "ext_set.hpp"
-*/
-
-#include <cstdint>
-#include <optional>
-#include <type_traits>
-
-namespace fud {
-
-using utf8 = unsigned char;
-
-struct StringView;
-
-constexpr uint8_t ASCII_MASK = 0x7F;
-
-constexpr uint8_t UTF8_MB_PATTERN_MASK = 0xC0;
-constexpr uint8_t UTF8_MB_PATTERN = 0x80;
-constexpr uint8_t UTF8_MB_MASK = static_cast<uint8_t>(~UTF8_MB_PATTERN_MASK);
-
-constexpr uint8_t UTF8_2B_PATTERN_MASK = 0xE0;
-constexpr uint8_t UTF8_2B_PATTERN = 0xC0;
-constexpr uint8_t UTF8_2B_MASK = static_cast<uint8_t>(~UTF8_2B_PATTERN_MASK);
-
-constexpr uint8_t UTF8_3B_PATTERN_MASK = 0xF0;
-constexpr uint8_t UTF8_3B_PATTERN = 0xE0;
-constexpr uint8_t UTF8_3B_MASK = static_cast<uint8_t>(~UTF8_3B_PATTERN_MASK);
-
-constexpr uint8_t UTF8_4B_PATTERN_MASK = 0xF8;
-constexpr uint8_t UTF8_4B_PATTERN = 0xF0;
-constexpr uint8_t UTF8_4B_MASK = static_cast<uint8_t>(~UTF8_4B_PATTERN_MASK);
-
-namespace privateImpl {
-constexpr bool validUtf8MB(uint8_t code) noexcept
-{
- return (code & UTF8_MB_PATTERN_MASK) == UTF8_MB_PATTERN;
-}
-} // namespace privateImpl
-
-struct Ascii {
- Array<uint8_t, 1> characters;
-
- constexpr Ascii() noexcept = default;
-
- constexpr explicit Ascii(uint8_t chr) noexcept : characters{{chr}}
- {
- }
-
- [[nodiscard]] constexpr uint8_t character() const noexcept
- {
- return characters[0];
- }
-
- [[nodiscard]] constexpr char asChar() const noexcept
- {
- return static_cast<char>(characters[0]);
- }
-
- static constexpr size_t size() noexcept
- {
- return 1;
- }
-
- [[nodiscard]] constexpr bool valid() const noexcept
- {
- return valid(characters[0]);
- }
-
- static constexpr bool valid(uint8_t character) noexcept
- {
- return static_cast<uint8_t>(character & ~ASCII_MASK) == 0;
- }
-
- auto operator<=>(const Ascii& other) const noexcept = default;
-};
-
-static_assert(std::is_trivial_v<Ascii>);
-static_assert(std::is_standard_layout_v<Ascii>);
-
-/*
-| B | E | Byte 1 | Byte 2 | Byte 3 | Byte 4
-| U+0000 | U+007F | 0xxxxxxx | | |
-| U+0080 | U+07FF | 110xxxxx | 10xxxxxx | |
-| U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx |
-| U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx
-*/
-
-struct Utf82Byte {
- constexpr Utf82Byte(uint8_t first, uint8_t second) noexcept : characters{{first, second}}
- {
- }
- Array<uint8_t, 2> characters;
- static constexpr size_t size() noexcept
- {
- return 2;
- }
-
- [[nodiscard]] constexpr bool valid() const noexcept
- {
- return valid(first(), second());
- }
-
- static constexpr bool valid(uint8_t first, uint8_t second) noexcept
- {
- using privateImpl::validUtf8MB;
- return ((first & UTF8_2B_PATTERN_MASK) == UTF8_2B_PATTERN) && validUtf8MB(second);
- }
-
- [[nodiscard]] constexpr uint8_t first() const noexcept
- {
- return characters[0];
- }
-
- [[nodiscard]] constexpr uint8_t second() const noexcept
- {
- return characters[1];
- }
-
- auto operator<=>(const Utf82Byte& other) const noexcept = default;
-};
-
-struct Utf83Byte {
- constexpr Utf83Byte(uint8_t first, uint8_t second, uint8_t third) noexcept : characters{{first, second, third}}
- {
- }
-
- Array<uint8_t, 3> characters;
-
- static constexpr size_t size() noexcept
- {
- return 3;
- }
-
- [[nodiscard]] constexpr bool valid() const noexcept
- {
- return valid(first(), second(), third());
- }
-
- static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third) noexcept
- {
- using privateImpl::validUtf8MB;
- return ((first & UTF8_3B_PATTERN_MASK) == UTF8_3B_PATTERN) && validUtf8MB(second) && validUtf8MB(third);
- }
-
- [[nodiscard]] constexpr uint8_t first() const noexcept
- {
- return characters[0];
- }
-
- [[nodiscard]] constexpr uint8_t second() const noexcept
- {
- return characters[1];
- }
-
- [[nodiscard]] constexpr uint8_t third() const noexcept
- {
- return characters[2];
- }
-
- auto operator<=>(const Utf83Byte& other) const noexcept = default;
-};
-
-struct Utf84Byte {
- constexpr Utf84Byte(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept :
- characters{{first, second, third, fourth}}
- {
- }
-
- Array<uint8_t, 4> characters;
-
- static constexpr size_t size() noexcept
- {
- return 4;
- }
-
- [[nodiscard]] constexpr bool valid() const noexcept
- {
- return valid(first(), second(), third(), fourth());
- }
-
- static constexpr bool valid(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth) noexcept
- {
- using privateImpl::validUtf8MB;
- if ((first & UTF8_4B_PATTERN_MASK) != UTF8_4B_PATTERN) {
- return false;
- }
- return validUtf8MB(second) && validUtf8MB(third) && validUtf8MB(fourth);
- }
-
- [[nodiscard]] constexpr uint8_t first() const noexcept
- {
- return characters[0];
- }
-
- [[nodiscard]] constexpr uint8_t second() const noexcept
- {
- return characters[1];
- }
-
- [[nodiscard]] constexpr uint8_t third() const noexcept
- {
- return characters[2];
- }
-
- [[nodiscard]] constexpr uint8_t fourth() const noexcept
- {
- return characters[3];
- }
-
- auto operator<=>(const Utf84Byte& other) const noexcept = default;
-};
-
-using Utf8Variant = std::variant<Ascii, Utf82Byte, Utf83Byte, Utf84Byte>;
-
-constexpr auto ExtUtf8TypeSet{UniqueArray<size_t, 0, 1, 2, 3>{}};
-enum class ExtUtf8Type : uint8_t
-{
- Ascii,
- Utf82Byte,
- Utf83Byte,
- Utf84Byte,
-};
-static_assert(ExtUtf8TypeSet.m_values[0] == static_cast<uint8_t>(ExtUtf8Type::Ascii));
-static_assert(ExtUtf8TypeSet.m_values[1] == static_cast<uint8_t>(ExtUtf8Type::Utf82Byte));
-static_assert(ExtUtf8TypeSet.m_values[2] == static_cast<uint8_t>(ExtUtf8Type::Utf83Byte));
-static_assert(ExtUtf8TypeSet.m_values[3] == static_cast<uint8_t>(ExtUtf8Type::Utf84Byte));
-
-class String;
-class StringView;
-
-struct ExtUtf8 {
- Utf8Variant m_variant{Utf8Variant{Ascii{}}};
-
- static constexpr Ascii invalidAsciiCode{Ascii{0xFF}};
- static ExtUtf8 fromString(const String& fudString, size_t index) noexcept;
- static ExtUtf8 fromStringView(StringView&& fudView, size_t index) noexcept;
- static ExtUtf8 fromStringView(const StringView& fudView, size_t index) noexcept;
-
- static constexpr ExtUtf8 makeUtf8(Array<utf8, 4>& data)
- {
- ExtUtf8 unicode{};
- if (Ascii::valid(data[0])) {
- unicode.m_variant = Ascii{data[0]};
- } else if (Utf82Byte::valid(data[0], data[1])) {
- unicode.m_variant = Utf82Byte{data[0], data[1]};
- } else if (Utf83Byte::valid(data[0], data[1], data[2])) {
- unicode.m_variant = Utf83Byte{data[0], data[1], data[2]};
- } else if (Utf84Byte::valid(data[0], data[1], data[2], data[3])) {
- unicode.m_variant = Utf84Byte{data[0], data[1], data[2], data[3]};
- } else {
- unicode.m_variant = invalidAsciiCode;
- }
- return unicode;
- }
-
- static constexpr ExtUtf8 makeUtf8(const Ascii& utf8Char)
- {
- ExtUtf8 unicode{{Utf8Variant{Ascii{}}}};
- if (utf8Char.valid()) {
- unicode.m_variant = utf8Char;
- } else {
- unicode.m_variant = invalidAsciiCode;
- }
- return unicode;
- }
-
- static constexpr ExtUtf8 invalidAscii()
- {
- ExtUtf8 utf8{};
- utf8.m_variant = Ascii{invalidAsciiCode};
- return utf8;
- }
-
- [[nodiscard]] constexpr ExtUtf8Type getType() const
- {
- return static_cast<ExtUtf8Type>(m_variant.index());
- }
-
- [[nodiscard]] constexpr bool isAscii() const
- {
- return getType() == ExtUtf8Type::Ascii;
- }
-
- [[nodiscard]] constexpr bool valid() const noexcept
- {
- switch (m_variant.index()) {
- case static_cast<size_t>(ExtUtf8Type::Ascii):
- return std::get<Ascii>(m_variant).valid();
- case static_cast<size_t>(ExtUtf8Type::Utf82Byte):
- return std::get<Utf82Byte>(m_variant).valid();
- case static_cast<size_t>(ExtUtf8Type::Utf83Byte):
- return std::get<Utf83Byte>(m_variant).valid();
- case static_cast<size_t>(ExtUtf8Type::Utf84Byte):
- return std::get<Utf84Byte>(m_variant).valid();
- default: // unlikely
- return false;
- }
- }
-
- [[nodiscard]] constexpr size_t size() const noexcept
- {
- if (!valid()) {
- return 0;
- }
- switch (m_variant.index()) {
- case static_cast<size_t>(ExtUtf8Type::Ascii):
- return Ascii::size();
- case static_cast<size_t>(ExtUtf8Type::Utf82Byte):
- return Utf82Byte::size();
- case static_cast<size_t>(ExtUtf8Type::Utf83Byte):
- return Utf83Byte::size();
- case static_cast<size_t>(ExtUtf8Type::Utf84Byte):
- return Utf84Byte::size();
- default: // unlikely
- return 0;
- }
- }
-
- [[nodiscard]] constexpr const uint8_t* data() const noexcept
- {
- if (!valid()) {
- return nullptr;
- }
-
- switch (m_variant.index()) {
- case static_cast<size_t>(ExtUtf8Type::Ascii):
- return std::get<Ascii>(m_variant).characters.data();
- case static_cast<size_t>(ExtUtf8Type::Utf82Byte):
- return std::get<Utf82Byte>(m_variant).characters.data();
- case static_cast<size_t>(ExtUtf8Type::Utf83Byte):
- return std::get<Utf83Byte>(m_variant).characters.data();
- case static_cast<size_t>(ExtUtf8Type::Utf84Byte):
- return std::get<Utf84Byte>(m_variant).characters.data();
- default: // unlikely
- return nullptr;
- }
- }
-
- template <typename Func>
- [[nodiscard]] bool transformAscii(Func&& transform)
- {
- if (isAscii()) {
- std::forward<Func>(transform)(std::get<Ascii>(m_variant));
- return true;
- }
- return false;
- }
-
- [[nodiscard]] constexpr int64_t hash() const noexcept
- {
- using fud::ExtUtf8Type;
- using fud::Utf82Byte;
- using fud::Utf83Byte;
- using fud::Utf84Byte;
-
- if (!valid()) {
- return -1;
- }
-
- constexpr uint8_t OneByteShift = 8;
- constexpr uint8_t TwoByteShift = 2 * OneByteShift;
- constexpr uint8_t ThreeByteShift = 3 * OneByteShift;
-
- switch (static_cast<ExtUtf8Type>(m_variant.index())) {
- case ExtUtf8Type::Ascii:
- return std::get<Ascii>(m_variant).characters[0];
- case ExtUtf8Type::Utf82Byte:
- return static_cast<int64_t>(std::get<Utf82Byte>(m_variant).characters[0]) << OneByteShift |
- static_cast<int64_t>(std::get<Utf82Byte>(m_variant).characters[1]);
- case ExtUtf8Type::Utf83Byte:
- return static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[0]) << TwoByteShift |
- static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[1]) << OneByteShift |
- static_cast<int64_t>(std::get<Utf83Byte>(m_variant).characters[2]);
- case ExtUtf8Type::Utf84Byte:
- return static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[0]) << ThreeByteShift |
- static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[1]) << TwoByteShift |
- static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[2]) << OneByteShift |
- static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[3]);
- default: // unlikely
- return -1;
- }
- }
-
- constexpr bool operator==(const ExtUtf8& other) const noexcept = default;
-
- constexpr auto operator<=>(const ExtUtf8& other) const noexcept
- {
- auto hasSameAlternative = []<typename T>(const ExtUtf8& lhs, const ExtUtf8& rhs) noexcept {
- return std::holds_alternative<T>(lhs.m_variant) && std::holds_alternative<T>(rhs.m_variant);
- };
-
- auto getSameAlternative = []<typename T>(const ExtUtf8& lhs, const ExtUtf8& rhs) noexcept {
- return std::get<T>(lhs.m_variant).operator<=>(std::get<T>(rhs.m_variant));
- };
-
- if (hasSameAlternative.template operator()<Ascii>(*this, other)) {
- return getSameAlternative.template operator()<Ascii>(*this, other);
- }
-
- if (hasSameAlternative.template operator()<Utf82Byte>(*this, other)) {
- return getSameAlternative.template operator()<Utf82Byte>(*this, other);
- }
-
- if (hasSameAlternative.template operator()<Utf83Byte>(*this, other)) {
- return getSameAlternative.template operator()<Utf83Byte>(*this, other);
- }
-
- if (hasSameAlternative.template operator()<Utf84Byte>(*this, other)) {
- return getSameAlternative.template operator()<Utf84Byte>(*this, other);
- }
-
- if (std::holds_alternative<Ascii>(m_variant)) {
- return std::strong_ordering::less;
- }
-
- if (std::holds_alternative<Ascii>(other.m_variant)) {
- return std::strong_ordering::greater;
- }
-
- if (std::holds_alternative<Utf82Byte>(m_variant)) {
- return std::strong_ordering::less;
- }
-
- if (std::holds_alternative<Utf82Byte>(other.m_variant)) {
- return std::strong_ordering::greater;
- }
-
- if (std::holds_alternative<Utf83Byte>(m_variant)) {
- return std::strong_ordering::less;
- }
-
- return std::strong_ordering::greater;
- }
-
- std::optional<Ascii> getAscii() const
- {
- if (m_variant.index() == static_cast<size_t>(ExtUtf8Type::Ascii)) {
- return std::get<Ascii>(m_variant);
- }
- return std::nullopt;
- }
-};
-
-/** \brief Checks if a character is ascii. */
-bool ext_lib_char_is_ascii(char character);
-
-FudStatus ext_lib_utf8_is_ascii(ExtUtf8& character, bool& isAscii);
-
-/** \brief Checks if a character is alphanumeric. */
-bool ext_lib_char_is_alphanumeric(char character);
-
-/** \brief Checks if a character is alphanumeric. */
-FudStatus ext_lib_utf8_is_alphanumeric(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is alphabetic. */
-bool ext_lib_char_is_alpha(char character);
-
-/** \brief Checks if a character is alphabetic. */
-FudStatus ext_lib_utf8_is_alpha(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is lowercase. */
-bool ext_lib_char_is_lowercase(char character);
-
-/** \brief Checks if a character is lowercase. */
-FudStatus ext_lib_utf8_is_lowercase(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is an uppercase character. */
-bool ext_lib_char_is_uppercase(char character);
-
-/** \brief Checks if a character is uppercase. */
-FudStatus ext_lib_utf8_is_uppercase(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a digit. */
-bool ext_lib_char_is_digit(char character);
-
-/** \brief Checks if a character is a digit. */
-FudStatus ext_lib_utf8_is_digit(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a hexadecimal character. */
-bool ext_lib_char_is_hex_digit(char character);
-
-/** \brief Checks if a character is a hexadecimal digit. */
-FudStatus ext_lib_utf8_is_hex_digit(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a control character. */
-bool ext_lib_char_is_control(char character);
-
-/** \brief Checks if a character is a control character. */
-FudStatus ext_lib_utf8_is_control(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a graphical character. */
-bool ext_lib_char_is_graphical(char character);
-
-/** \brief Checks if a character is a graphical character. */
-FudStatus ext_lib_utf8_is_graphical(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a space character. */
-bool ext_lib_char_is_space(char character);
-
-/** \brief Checks if a character is a space character. */
-FudStatus ext_lib_utf8_is_space(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a blank character. */
-bool ext_lib_char_is_blank(char character);
-
-/** \brief Checks if a character is a blank character. */
-FudStatus ext_lib_utf8_is_blank(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a printable character. */
-bool ext_lib_char_is_printable(char character);
-
-/** \brief Checks if a character is a printable character. */
-FudStatus ext_lib_utf8_is_printable(ExtUtf8* character, bool* pred);
-
-/** \brief Checks if a character is a punctuation character. */
-bool ext_lib_char_is_punctuation(char character);
-
-/** \brief Checks if a character is a punctuation character. */
-FudStatus ext_lib_utf8_is_punctuation(ExtUtf8* character, bool* pred);
-
-uint8_t ext_lib_char_to_lower(uint8_t character);
-
-ExtUtf8* ext_lib_utf8_to_lower(ExtUtf8* character);
-
-uint8_t ext_lib_char_to_upper(uint8_t character);
-
-ExtUtf8* ext_lib_utf8_to_upper(ExtUtf8* character);
-
-} // namespace fud
-
-#endif