From bf81e34921e3e30b05313efbcf5c9fa839cb7c05 Mon Sep 17 00:00:00 2001 From: Dominick Allen Date: Sun, 22 Sep 2024 10:19:15 -0500 Subject: Initial commit. --- source/utf8.cpp | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 source/utf8.cpp (limited to 'source/utf8.cpp') diff --git a/source/utf8.cpp b/source/utf8.cpp new file mode 100644 index 0000000..c94ac1f --- /dev/null +++ b/source/utf8.cpp @@ -0,0 +1,343 @@ +/* + * libfud + * Copyright 2024 Dominick Allen + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utf8.hpp" + +#include "string.hpp" + +#include // IWYU pragma: keep - this is for placement new overloads. + +namespace fud { + +ExtUtf8 ExtUtf8::fromString(const String& fudString, size_t index) noexcept +{ + if (!fudString.valid()) { + return invalidAscii(); + } + + + return fromStringView(StringView{fudString}, index); +} + +ExtUtf8 ExtUtf8::fromStringView(const StringView& view, size_t index) noexcept +{ + return fromStringView(StringView{view}, index); +} + +ExtUtf8 ExtUtf8::fromStringView(StringView&& view, size_t index) noexcept +{ + auto len = view.length(); + const auto* data = view.data(); + if (data == nullptr) { + return invalidAscii(); + } + + ExtUtf8 localChar{Ascii{data[index]}}; + if (localChar.valid()) { + return localChar; + } + + if (index + 1 < len) { + localChar.m_variant = Utf82Byte{data[index], data[index + 1]}; + } + if (localChar.valid()) { + return localChar; + } + + if (index + 2 < len) { + localChar.m_variant = Utf83Byte{data[index], data[index + 1], data[index + 2]}; + } + if (localChar.valid()) { + return localChar; + } + + if (index + 3 < len) { + localChar.m_variant = Utf84Byte{data[index], data[index + 1], data[index + 2], data[index + 3]}; + } + if (localChar.valid()) { + return localChar; + } + + return invalidAscii(); +} + +bool ext_lib_char_is_ascii(char character) +{ + return static_cast(character & ~ASCII_MASK) == 0; +} + +FudStatus ext_lib_utf8_is_ascii(ExtUtf8* character, bool* isAscii) +{ + if (anyAreNull(character, isAscii)) { + return FudStatus::NullPointer; + } + + *isAscii = character->getType() == ExtUtf8Type::Ascii && character->valid(); + + return FudStatus::Success; +} + +namespace impl { + +/* Assumes that predicate is not a null pointer! */ +template +inline FudStatus isAsciiPredicate(ExtUtf8* character, bool* pred, Predicate&& predicate) +{ + if (anyAreNull(character, pred)) { + return FudStatus::NullPointer; + } + + auto maybeAscii = character->getAscii(); + if (!maybeAscii.has_value()) { + return FudStatus::InvalidInput; + } + + auto asciiChar = *maybeAscii; + *pred = std::forward(predicate)(asciiChar.asChar()); + + return FudStatus::Success; +} + +} // namespace impl + +bool ext_lib_char_is_alphanumeric(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + if (ext_lib_char_is_alpha(character)) { + return true; + } + + return ext_lib_char_is_digit(character); +} + +FudStatus ext_lib_utf8_is_alphanumeric(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_alphanumeric); +} + +bool ext_lib_char_is_alpha(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + if (ext_lib_char_is_uppercase(character)) { + return true; + } + + return ext_lib_char_is_lowercase(character); +} + +FudStatus ext_lib_utf8_is_alpha(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_alpha); +} + +bool ext_lib_char_is_lowercase(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return 'a' <= character && character <= 'z'; +} + +FudStatus ext_lib_utf8_is_lowercase(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_lowercase); +} + +bool ext_lib_char_is_uppercase(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return 'A' <= character && character <= 'Z'; +} + +FudStatus ext_lib_utf8_is_uppercase(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_uppercase); +} + +bool ext_lib_char_is_digit(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return '0' <= character && character <= '9'; +} + +FudStatus ext_lib_utf8_is_digit(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_digit); +} + +bool ext_lib_char_is_hex_digit(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return ('0' <= character && character <= '9') || ('a' <= character && character <= 'f') || + ('A' <= character && character <= 'F'); +} + +FudStatus ext_lib_utf8_is_hex_digit(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_hex_digit); +} + +bool ext_lib_char_is_control(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + constexpr char maxControlChar = 0x1F; + constexpr const char deleteChar = 0x7F; + return ((static_cast(character) <= maxControlChar)) || character == deleteChar; +} + +FudStatus ext_lib_utf8_is_control(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_control); +} + +bool ext_lib_char_is_graphical(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return ext_lib_char_is_alphanumeric(character) || ext_lib_char_is_punctuation(character); +} + +FudStatus ext_lib_utf8_is_graphical(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_graphical); +} + +bool ext_lib_char_is_space(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return character == ' ' || character == '\t' || character == '\n' || character == '\r' || character == '\v'; +} + +FudStatus ext_lib_utf8_is_space(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_space); +} + +bool ext_lib_char_is_blank(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return character == ' ' || character == '\t'; +} + +FudStatus ext_lib_utf8_is_blank(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_blank); +} + +bool ext_lib_char_is_printable(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return (character >= ' ' && character <= '~'); +} + +FudStatus ext_lib_utf8_is_printable(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_printable); +} + +bool ext_lib_char_is_punctuation(char character) +{ + if (!ext_lib_char_is_ascii(character)) { + return false; + } + + return (character >= '!' && character <= '/') || (character >= ':' && character <= '@') || + (character >= '[' && character <= '`') || (character >= '{' && character <= '~'); +} + +FudStatus ext_lib_utf8_is_punctuation(ExtUtf8* character, bool* pred) +{ + return impl::isAsciiPredicate(character, pred, ext_lib_char_is_punctuation); +} + +uint8_t ext_lib_char_to_lower(uint8_t character) +{ + if (ext_lib_char_is_uppercase(static_cast(character))) { + constexpr uint8_t lowerA = 'a'; + constexpr uint8_t upperA = 'A'; + return static_cast(character - upperA) + lowerA; + } + return character; +} + +ExtUtf8* ext_lib_utf8_to_lower(ExtUtf8* character) +{ + if (character == nullptr) { + return character; + } + + static_cast(character->transformAscii([](Ascii& ascii) { + ascii = Ascii{ext_lib_char_to_lower(static_cast(ascii.asChar()))}; + })); + + return character; +} + +uint8_t ext_lib_char_to_upper(uint8_t character) +{ + if (ext_lib_char_is_lowercase(static_cast(character))) { + constexpr uint8_t lowerA = 'a'; + constexpr uint8_t upperA = 'A'; + return static_cast(character - lowerA) + upperA; + } + return character; +} + +ExtUtf8* ext_lib_utf8_to_upper(ExtUtf8* character) +{ + if (character == nullptr) { + return character; + } + + static_cast(character->transformAscii([](Ascii& ascii) { + ascii = Ascii{ext_lib_char_to_upper(static_cast(ascii.asChar()))}; + })); + + return character; +} + +} // namespace ext_lib -- cgit v1.2.3