summaryrefslogtreecommitdiff
path: root/source/utf8.cpp
diff options
context:
space:
mode:
author Dominick Allen <djallen@librehumanitas.org> 2024-09-22 10:19:15 -0500
committer Dominick Allen <djallen@librehumanitas.org> 2024-09-22 10:19:15 -0500
commit bf81e34921e3e30b05313efbcf5c9fa839cb7c05 (patch)
tree b56a343e59164bc347232669e8bb808cf3c4f4ef /source/utf8.cpp
Initial commit.
Diffstat (limited to 'source/utf8.cpp')
-rw-r--r-- source/utf8.cpp 343
1 files changed, 343 insertions, 0 deletions
diff --git a/source/utf8.cpp b/source/utf8.cpp
new file mode 100644
index 0000000..c94ac1f
--- /dev/null
+++ b/source/utf8.cpp
@@ -0,0 +1,343 @@
+/*
+ * libfud
+ * Copyright 2024 Dominick Allen
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utf8.hpp"
+
+#include "string.hpp"
+
+#include <new> // IWYU pragma: keep - this is for placement new overloads.
+
+namespace fud {
+
+/*
+ * Decode the character that starts at byte offset `index` of `fudString`.
+ *
+ * Returns invalidAscii() when fudString.valid() is false; otherwise the
+ * string is wrapped in a StringView and decoding is handed to
+ * fromStringView.
+ */
+ExtUtf8 ExtUtf8::fromString(const String& fudString, size_t index) noexcept
+{
+    if (fudString.valid()) {
+        return fromStringView(StringView{fudString}, index);
+    }
+
+    return invalidAscii();
+}
+
+/*
+ * Lvalue overload: decode the character at byte offset `index` of `view`.
+ *
+ * StringView{view} creates a temporary copy of the view; because that
+ * temporary is an rvalue it selects the StringView&& overload, so this call
+ * does not recurse into itself.
+ */
+ExtUtf8 ExtUtf8::fromStringView(const StringView& view, size_t index) noexcept
+{
+ return fromStringView(StringView{view}, index);
+}
+
+/*
+ * Decode the UTF-8 character that starts at byte offset `index` of `view`.
+ *
+ * Candidates are tried in order of increasing length: a single ASCII byte,
+ * then a 2-, 3- and 4-byte sequence. The first candidate whose valid()
+ * returns true is returned. A multi-byte candidate is only built when
+ * enough bytes remain in the view after `index`.
+ *
+ * Returns invalidAscii() when the view has no data, when `index` is not
+ * inside the view (this includes every index into an empty view), or when
+ * no candidate is valid.
+ */
+ExtUtf8 ExtUtf8::fromStringView(StringView&& view, size_t index) noexcept
+{
+    const auto len = view.length();
+    const auto* data = view.data();
+    // Reject a null buffer and any out-of-range index before data[index] is
+    // read for the first time.
+    if (data == nullptr || index >= len) {
+        return invalidAscii();
+    }
+
+    // index < len holds here, so the subtraction cannot wrap around; using a
+    // remaining-byte count also avoids overflow in `index + k`.
+    const size_t remaining = len - index;
+
+    ExtUtf8 localChar{Ascii{data[index]}};
+    if (localChar.valid()) {
+        return localChar;
+    }
+
+    if (remaining > 1) {
+        localChar.m_variant = Utf82Byte{data[index], data[index + 1]};
+        if (localChar.valid()) {
+            return localChar;
+        }
+    }
+
+    if (remaining > 2) {
+        localChar.m_variant = Utf83Byte{data[index], data[index + 1], data[index + 2]};
+        if (localChar.valid()) {
+            return localChar;
+        }
+    }
+
+    if (remaining > 3) {
+        localChar.m_variant = Utf84Byte{data[index], data[index + 1], data[index + 2], data[index + 3]};
+        if (localChar.valid()) {
+            return localChar;
+        }
+    }
+
+    return invalidAscii();
+}
+
+/*
+ * Return true when the low byte of `character` has no bit set outside
+ * ASCII_MASK.
+ *
+ * NOTE(review): ASCII_MASK is defined outside this file; presumably 0x7F,
+ * which would make this a test of the high bit - confirm.
+ */
+bool ext_lib_char_is_ascii(char character)
+{
+    const auto nonAsciiBits = static_cast<uint8_t>(character & ~ASCII_MASK);
+    return nonAsciiBits == 0;
+}
+
+/*
+ * Report through *isAscii whether `character` has type ExtUtf8Type::Ascii
+ * and is valid.
+ *
+ * Returns FudStatus::NullPointer when anyAreNull(character, isAscii) is
+ * true (nothing is written in that case), FudStatus::Success otherwise.
+ */
+FudStatus ext_lib_utf8_is_ascii(ExtUtf8* character, bool* isAscii)
+{
+    if (anyAreNull(character, isAscii)) {
+        return FudStatus::NullPointer;
+    }
+
+    const bool typeIsAscii = character->getType() == ExtUtf8Type::Ascii;
+    // valid() is only consulted when the type check passed.
+    *isAscii = typeIsAscii && character->valid();
+    return FudStatus::Success;
+}
+
+namespace impl {
+
+/*
+ * Apply `predicate` to the ASCII character held by `character` and store
+ * the result in *pred.
+ *
+ * Returns FudStatus::NullPointer when anyAreNull(character, pred) is true,
+ * FudStatus::InvalidInput when character->getAscii() has no value (in both
+ * cases *pred is left untouched), and FudStatus::Success otherwise.
+ *
+ * The predicate itself is not checked: it is assumed to be callable, i.e.
+ * not a null function pointer.
+ */
+template <typename Predicate>
+inline FudStatus isAsciiPredicate(ExtUtf8* character, bool* pred, Predicate&& predicate)
+{
+    if (anyAreNull(character, pred)) {
+        return FudStatus::NullPointer;
+    }
+
+    auto asciiOption = character->getAscii();
+    if (asciiOption.has_value()) {
+        auto ascii = *asciiOption;
+        *pred = std::forward<Predicate>(predicate)(ascii.asChar());
+        return FudStatus::Success;
+    }
+
+    return FudStatus::InvalidInput;
+}
+
+} // namespace impl
+
+/*
+ * Return true when `character` is ASCII and is either alphabetic
+ * (ext_lib_char_is_alpha) or a decimal digit (ext_lib_char_is_digit).
+ */
+bool ext_lib_char_is_alphanumeric(char character)
+{
+    return ext_lib_char_is_ascii(character) &&
+           (ext_lib_char_is_alpha(character) || ext_lib_char_is_digit(character));
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_alphanumeric.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_alphanumeric(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_alphanumeric);
+}
+
+/*
+ * Return true when `character` is ASCII and is an uppercase or lowercase
+ * letter, as decided by ext_lib_char_is_uppercase and
+ * ext_lib_char_is_lowercase.
+ */
+bool ext_lib_char_is_alpha(char character)
+{
+    return ext_lib_char_is_ascii(character) &&
+           (ext_lib_char_is_uppercase(character) || ext_lib_char_is_lowercase(character));
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_alpha.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_alpha(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_alpha);
+}
+
+/*
+ * Return true when `character` is ASCII and lies in the range 'a'..'z'.
+ */
+bool ext_lib_char_is_lowercase(char character)
+{
+    return ext_lib_char_is_ascii(character) && character >= 'a' && character <= 'z';
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_lowercase.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_lowercase(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_lowercase);
+}
+
+/*
+ * Return true when `character` is ASCII and lies in the range 'A'..'Z'.
+ */
+bool ext_lib_char_is_uppercase(char character)
+{
+    return ext_lib_char_is_ascii(character) && character >= 'A' && character <= 'Z';
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_uppercase.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_uppercase(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_uppercase);
+}
+
+/*
+ * Return true when `character` is ASCII and lies in the range '0'..'9'.
+ */
+bool ext_lib_char_is_digit(char character)
+{
+    return ext_lib_char_is_ascii(character) && character >= '0' && character <= '9';
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_digit.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_digit(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_digit);
+}
+
+/*
+ * Return true when `character` is ASCII and is a hexadecimal digit:
+ * '0'..'9', 'a'..'f' or 'A'..'F'.
+ */
+bool ext_lib_char_is_hex_digit(char character)
+{
+    if (!ext_lib_char_is_ascii(character)) {
+        return false;
+    }
+
+    const bool isDecimal = character >= '0' && character <= '9';
+    const bool isLowerHex = character >= 'a' && character <= 'f';
+    const bool isUpperHex = character >= 'A' && character <= 'F';
+    return isDecimal || isLowerHex || isUpperHex;
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_hex_digit.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_hex_digit(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_hex_digit);
+}
+
+/*
+ * Return true when `character` is ASCII and is a control code: a value in
+ * 0x00..0x1F, or 0x7F (delete).
+ */
+bool ext_lib_char_is_control(char character)
+{
+    constexpr char highestLowControl = 0x1F;
+    constexpr char deleteChar = 0x7F;
+
+    if (!ext_lib_char_is_ascii(character)) {
+        return false;
+    }
+
+    if (character == deleteChar) {
+        return true;
+    }
+
+    // Compare as an unsigned byte so the range test is independent of
+    // whether plain char is signed.
+    return static_cast<uint8_t>(character) <= highestLowControl;
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_control.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_control(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_control);
+}
+
+/*
+ * Return true when `character` is ASCII and is either alphanumeric
+ * (ext_lib_char_is_alphanumeric) or punctuation
+ * (ext_lib_char_is_punctuation).
+ */
+bool ext_lib_char_is_graphical(char character)
+{
+    if (ext_lib_char_is_ascii(character)) {
+        return ext_lib_char_is_alphanumeric(character) || ext_lib_char_is_punctuation(character);
+    }
+
+    return false;
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_graphical.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_graphical(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_graphical);
+}
+
+/*
+ * Return true when `character` is an ASCII whitespace character.
+ *
+ * The accepted set is the one std::isspace uses in the "C" locale: space,
+ * horizontal tab, line feed, vertical tab, form feed and carriage return.
+ * Form feed ('\f') was previously missing from the set.
+ */
+bool ext_lib_char_is_space(char character)
+{
+    if (!ext_lib_char_is_ascii(character)) {
+        return false;
+    }
+
+    switch (character) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\v':
+    case '\f':
+    case '\r':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_space.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_space(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_space);
+}
+
+/*
+ * Return true when `character` is ASCII and is a space or a horizontal
+ * tab.
+ */
+bool ext_lib_char_is_blank(char character)
+{
+    if (!ext_lib_char_is_ascii(character)) {
+        return false;
+    }
+
+    const bool isSpaceChar = character == ' ';
+    const bool isTab = character == '\t';
+    return isSpaceChar || isTab;
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_blank.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_blank(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_blank);
+}
+
+/*
+ * Return true when `character` is ASCII and lies in the range ' '..'~'
+ * (space through tilde, inclusive).
+ */
+bool ext_lib_char_is_printable(char character)
+{
+    constexpr char firstPrintable = ' ';
+    constexpr char lastPrintable = '~';
+
+    return ext_lib_char_is_ascii(character) && firstPrintable <= character && character <= lastPrintable;
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_printable.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_printable(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_printable);
+}
+
+/*
+ * Return true when `character` is ASCII and falls in one of the four
+ * punctuation ranges: '!'..'/', ':'..'@', '['..'`' or '{'..'~'.
+ */
+bool ext_lib_char_is_punctuation(char character)
+{
+    if (!ext_lib_char_is_ascii(character)) {
+        return false;
+    }
+
+    const bool beforeDigits = character >= '!' && character <= '/';
+    const bool beforeUppercase = character >= ':' && character <= '@';
+    const bool beforeLowercase = character >= '[' && character <= '`';
+    const bool afterLowercase = character >= '{' && character <= '~';
+    return beforeDigits || beforeUppercase || beforeLowercase || afterLowercase;
+}
+
+/*
+ * Report through *pred whether the ASCII character held by `character`
+ * satisfies ext_lib_char_is_punctuation.
+ *
+ * Delegates to impl::isAsciiPredicate, which returns FudStatus::NullPointer
+ * when anyAreNull(character, pred) is true, FudStatus::InvalidInput when
+ * character->getAscii() has no value, and FudStatus::Success after writing
+ * *pred.
+ */
+FudStatus ext_lib_utf8_is_punctuation(ExtUtf8* character, bool* pred)
+{
+ return impl::isAsciiPredicate(character, pred, ext_lib_char_is_punctuation);
+}
+
+/*
+ * Map an ASCII uppercase letter to its lowercase counterpart.
+ *
+ * Any byte for which ext_lib_char_is_uppercase is false is returned
+ * unchanged.
+ */
+uint8_t ext_lib_char_to_lower(uint8_t character)
+{
+    constexpr uint8_t lowerA = 'a';
+    constexpr uint8_t upperA = 'A';
+
+    if (!ext_lib_char_is_uppercase(static_cast<char>(character))) {
+        return character;
+    }
+
+    // Offset within the alphabet, re-based onto 'a'.
+    return static_cast<uint8_t>(character - upperA) + lowerA;
+}
+
+/*
+ * Lowercase `character` in place and return the same pointer.
+ *
+ * A null pointer is returned as-is. Otherwise a callback that replaces the
+ * Ascii value with its ext_lib_char_to_lower result is handed to
+ * transformAscii, whose return value is deliberately discarded.
+ * NOTE(review): presumably transformAscii only invokes the callback when
+ * the character holds an ASCII value - confirm against its definition.
+ */
+ExtUtf8* ext_lib_utf8_to_lower(ExtUtf8* character)
+{
+    if (character != nullptr) {
+        static_cast<void>(character->transformAscii([](Ascii& ascii) {
+            const auto asByte = static_cast<uint8_t>(ascii.asChar());
+            ascii = Ascii{ext_lib_char_to_lower(asByte)};
+        }));
+    }
+
+    return character;
+}
+
+/*
+ * Map an ASCII lowercase letter to its uppercase counterpart.
+ *
+ * Any byte for which ext_lib_char_is_lowercase is false is returned
+ * unchanged.
+ */
+uint8_t ext_lib_char_to_upper(uint8_t character)
+{
+    constexpr uint8_t lowerA = 'a';
+    constexpr uint8_t upperA = 'A';
+
+    if (!ext_lib_char_is_lowercase(static_cast<char>(character))) {
+        return character;
+    }
+
+    // Offset within the alphabet, re-based onto 'A'.
+    return static_cast<uint8_t>(character - lowerA) + upperA;
+}
+
+/*
+ * Uppercase `character` in place and return the same pointer.
+ *
+ * A null pointer is returned as-is. Otherwise a callback that replaces the
+ * Ascii value with its ext_lib_char_to_upper result is handed to
+ * transformAscii, whose return value is deliberately discarded.
+ * NOTE(review): presumably transformAscii only invokes the callback when
+ * the character holds an ASCII value - confirm against its definition.
+ */
+ExtUtf8* ext_lib_utf8_to_upper(ExtUtf8* character)
+{
+    if (character != nullptr) {
+        static_cast<void>(character->transformAscii([](Ascii& ascii) {
+            const auto asByte = static_cast<uint8_t>(ascii.asChar());
+            ascii = Ascii{ext_lib_char_to_upper(asByte)};
+        }));
+    }
+
+    return character;
+}
+
+} // namespace fud