summaryrefslogtreecommitdiff
path: root/source/fud_utf8.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/fud_utf8.cpp')
-rw-r--r--source/fud_utf8.cpp343
1 files changed, 343 insertions, 0 deletions
diff --git a/source/fud_utf8.cpp b/source/fud_utf8.cpp
new file mode 100644
index 0000000..5dd5099
--- /dev/null
+++ b/source/fud_utf8.cpp
@@ -0,0 +1,343 @@
+/*
+ * libfud
+ * Copyright 2024 Dominick Allen
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fud_utf8.hpp"
+
+#include "fud_string.hpp"
+
+#include <new> // IWYU pragma: keep - this is for placement new overloads.
+
+namespace fud {
+
+/**
+ * Decode the UTF-8 character that starts at byte `index` of `fudString`.
+ *
+ * A string that reports !valid() yields invalidAscii(); otherwise decoding is
+ * delegated to fromStringView() on a view constructed from the string.
+ */
+FudUtf8 FudUtf8::fromString(const String& fudString, size_t index) noexcept
+{
+    if (fudString.valid()) {
+        return fromStringView(StringView{fudString}, index);
+    }
+
+    return invalidAscii();
+}
+
+/**
+ * Lvalue overload: decode the UTF-8 character starting at byte `index` of `view`
+ * by forwarding a temporary copy of the view to the StringView&& overload.
+ */
+FudUtf8 FudUtf8::fromStringView(const StringView& view, size_t index) noexcept
+{
+ // StringView{view} is a prvalue, so overload resolution picks the StringView&&
+ // overload instead of recursing into this function.
+ return fromStringView(StringView{view}, index);
+}
+
+/**
+ * Decode one UTF-8 character starting at byte `index` of `view`.
+ *
+ * Candidate encodings are tried from shortest to longest (1, 2, 3, then 4 bytes);
+ * the first candidate that reports valid() is returned. A wider candidate is only
+ * built when the view holds enough bytes after `index` to supply it.
+ *
+ * @param view  View over the bytes to decode.
+ * @param index Offset of the first byte of the character within the view.
+ * @return The decoded character, or invalidAscii() when the view has no data,
+ *         when `index` is at or past the end of the view, or when no candidate
+ *         validates.
+ */
+FudUtf8 FudUtf8::fromStringView(StringView&& view, size_t index) noexcept
+{
+    const auto len = view.length();
+    const auto* data = view.data();
+    // Reject out-of-range indices before touching data[index]; this also covers
+    // the empty view.
+    if (data == nullptr || index >= len) {
+        return invalidAscii();
+    }
+
+    // Number of bytes available from `index` to the end of the view (>= 1 here).
+    // Comparing against this avoids the wrap-around that `index + k` could hit.
+    const auto remaining = len - index;
+
+    FudUtf8 localChar{Ascii{data[index]}};
+    if (localChar.valid()) {
+        return localChar;
+    }
+
+    if (remaining > 1) {
+        localChar.m_variant = Utf82Byte{data[index], data[index + 1]};
+        if (localChar.valid()) {
+            return localChar;
+        }
+    }
+
+    if (remaining > 2) {
+        localChar.m_variant = Utf83Byte{data[index], data[index + 1], data[index + 2]};
+        if (localChar.valid()) {
+            return localChar;
+        }
+    }
+
+    if (remaining > 3) {
+        localChar.m_variant = Utf84Byte{data[index], data[index + 1], data[index + 2], data[index + 3]};
+        if (localChar.valid()) {
+            return localChar;
+        }
+    }
+
+    return invalidAscii();
+}
+
+/**
+ * Report whether `character` has no bits set outside ASCII_MASK once the masked
+ * value is truncated to 8 bits.
+ */
+bool char_is_ascii(char character)
+{
+    const auto nonAsciiBits = static_cast<uint8_t>(character & ~ASCII_MASK);
+    return nonAsciiBits == 0;
+}
+
+/**
+ * Store in `*isAscii` whether `character` is of type Utf8Type::Ascii and valid.
+ *
+ * @return FudStatus::NullPointer if either pointer is null (nothing is written),
+ *         FudStatus::Success otherwise.
+ */
+FudStatus utf8_is_ascii(FudUtf8* character, bool* isAscii)
+{
+    if (anyAreNull(character, isAscii)) {
+        return FudStatus::NullPointer;
+    }
+
+    // valid() is only consulted when the type check already passed.
+    const bool typeIsAscii = character->getType() == Utf8Type::Ascii;
+    *isAscii = typeIsAscii && character->valid();
+
+    return FudStatus::Success;
+}
+
+namespace impl {
+
+/**
+ * Shared driver for the utf8_is_* classification functions.
+ *
+ * Applies `predicate` to the char value of `character`'s ASCII payload and stores
+ * the result in `*pred`. The predicate itself is assumed to be callable (in
+ * particular, not a null function pointer); that is not checked here.
+ *
+ * @return FudStatus::NullPointer if `character` or `pred` is null,
+ *         FudStatus::InvalidInput if getAscii() yields no value,
+ *         FudStatus::Success once `*pred` has been written.
+ */
+template <typename Predicate>
+inline FudStatus isAsciiPredicate(FudUtf8* character, bool* pred, Predicate&& predicate)
+{
+    if (anyAreNull(character, pred)) {
+        return FudStatus::NullPointer;
+    }
+
+    auto asciiOpt = character->getAscii();
+    if (asciiOpt.has_value()) {
+        auto ascii = *asciiOpt;
+        *pred = std::forward<Predicate>(predicate)(ascii.asChar());
+        return FudStatus::Success;
+    }
+
+    return FudStatus::InvalidInput;
+}
+
+} // namespace impl
+
+/** True when `character` is ASCII and is either alphabetic or a decimal digit. */
+bool char_is_alphanumeric(char character)
+{
+    return char_is_ascii(character) && (char_is_alpha(character) || char_is_digit(character));
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_alphanumeric. Null checking and the non-ASCII error path are handled
+ * by impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_alphanumeric(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_alphanumeric);
+    return status;
+}
+
+/** True when `character` is ASCII and is an uppercase or lowercase letter. */
+bool char_is_alpha(char character)
+{
+    return char_is_ascii(character) && (char_is_uppercase(character) || char_is_lowercase(character));
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_alpha. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_alpha(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_alpha);
+    return status;
+}
+
+/** True when `character` is ASCII and lies in the range 'a'..'z'. */
+bool char_is_lowercase(char character)
+{
+    if (char_is_ascii(character)) {
+        return character >= 'a' && character <= 'z';
+    }
+
+    return false;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_lowercase. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_lowercase(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_lowercase);
+    return status;
+}
+
+/** True when `character` is ASCII and lies in the range 'A'..'Z'. */
+bool char_is_uppercase(char character)
+{
+    if (char_is_ascii(character)) {
+        return character >= 'A' && character <= 'Z';
+    }
+
+    return false;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_uppercase. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_uppercase(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_uppercase);
+    return status;
+}
+
+/** True when `character` is ASCII and lies in the range '0'..'9'. */
+bool char_is_digit(char character)
+{
+    if (char_is_ascii(character)) {
+        return character >= '0' && character <= '9';
+    }
+
+    return false;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_digit. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_digit(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_digit);
+    return status;
+}
+
+/**
+ * True when `character` is ASCII and is a hexadecimal digit:
+ * '0'..'9', 'a'..'f', or 'A'..'F'.
+ */
+bool char_is_hex_digit(char character)
+{
+    if (!char_is_ascii(character)) {
+        return false;
+    }
+
+    const bool isDecimal = character >= '0' && character <= '9';
+    const bool isLowerHex = character >= 'a' && character <= 'f';
+    const bool isUpperHex = character >= 'A' && character <= 'F';
+    return isDecimal || isLowerHex || isUpperHex;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_hex_digit. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_hex_digit(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_hex_digit);
+    return status;
+}
+
+/**
+ * True when `character` is ASCII and is a control character: any value up to
+ * and including 0x1F, or 0x7F (delete).
+ */
+bool char_is_control(char character)
+{
+    if (!char_is_ascii(character)) {
+        return false;
+    }
+
+    constexpr char lastLowControl = 0x1F;
+    constexpr char del = 0x7F;
+
+    const auto byteValue = static_cast<uint8_t>(character);
+    if (byteValue <= lastLowControl) {
+        return true;
+    }
+
+    return character == del;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_control. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_control(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_control);
+    return status;
+}
+
+/** True when `character` is ASCII and is either alphanumeric or punctuation. */
+bool char_is_graphical(char character)
+{
+    if (char_is_ascii(character)) {
+        return char_is_alphanumeric(character) || char_is_punctuation(character);
+    }
+
+    return false;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_graphical. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_graphical(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_graphical);
+    return status;
+}
+
+/**
+ * True when `character` is ASCII whitespace: space, horizontal tab, line feed,
+ * vertical tab, form feed, or carriage return.
+ *
+ * Form feed ('\f') is included so the set matches the six whitespace characters
+ * recognised by C's isspace in the "C" locale.
+ */
+bool char_is_space(char character)
+{
+    if (!char_is_ascii(character)) {
+        return false;
+    }
+
+    switch (character) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\v':
+    case '\f':
+    case '\r':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_space. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_space(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_space);
+    return status;
+}
+
+/** True when `character` is ASCII and is a space or a horizontal tab. */
+bool char_is_blank(char character)
+{
+    if (!char_is_ascii(character)) {
+        return false;
+    }
+
+    switch (character) {
+    case ' ':
+    case '\t':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_blank. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_blank(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_blank);
+    return status;
+}
+
+/** True when `character` is ASCII and lies in the range ' '..'~' inclusive. */
+bool char_is_printable(char character)
+{
+    if (char_is_ascii(character)) {
+        return ' ' <= character && character <= '~';
+    }
+
+    return false;
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_printable. Null checking and the non-ASCII error path are handled by
+ * impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_printable(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_printable);
+    return status;
+}
+
+/**
+ * True when `character` is ASCII and falls in one of the four punctuation
+ * ranges: '!'..'/', ':'..'@', '['..'`', or '{'..'~'.
+ */
+bool char_is_punctuation(char character)
+{
+    if (!char_is_ascii(character)) {
+        return false;
+    }
+
+    // Inclusive range test against the character being classified.
+    const auto within = [character](char low, char high) { return low <= character && character <= high; };
+
+    return within('!', '/') || within(':', '@') || within('[', '`') || within('{', '~');
+}
+
+/**
+ * Store in `*pred` whether the ASCII value held by `character` satisfies
+ * char_is_punctuation. Null checking and the non-ASCII error path are handled
+ * by impl::isAsciiPredicate, whose status is returned unchanged.
+ */
+FudStatus utf8_is_punctuation(FudUtf8* character, bool* pred)
+{
+    const FudStatus status = impl::isAsciiPredicate(character, pred, char_is_punctuation);
+    return status;
+}
+
+/**
+ * Map an ASCII uppercase letter to its lowercase counterpart; every other
+ * value is returned unchanged.
+ */
+uint8_t char_to_lower(uint8_t character)
+{
+    if (!char_is_uppercase(static_cast<char>(character))) {
+        return character;
+    }
+
+    // Distance between the uppercase and lowercase letter blocks.
+    constexpr uint8_t caseOffset = 'a' - 'A';
+    return static_cast<uint8_t>(character + caseOffset);
+}
+
+/**
+ * Lowercase `character` in place via transformAscii and char_to_lower.
+ *
+ * The result of transformAscii is deliberately discarded. A null pointer is
+ * returned as-is.
+ *
+ * @return The same pointer that was passed in.
+ */
+FudUtf8* utf8_to_lower(FudUtf8* character)
+{
+    if (character != nullptr) {
+        auto lowerInPlace = [](Ascii& ascii) {
+            const auto raw = static_cast<uint8_t>(ascii.asChar());
+            ascii = Ascii{char_to_lower(raw)};
+        };
+        static_cast<void>(character->transformAscii(lowerInPlace));
+    }
+
+    return character;
+}
+
+/**
+ * Map an ASCII lowercase letter to its uppercase counterpart; every other
+ * value is returned unchanged.
+ */
+uint8_t char_to_upper(uint8_t character)
+{
+    if (!char_is_lowercase(static_cast<char>(character))) {
+        return character;
+    }
+
+    // Distance between the uppercase and lowercase letter blocks.
+    constexpr uint8_t caseOffset = 'a' - 'A';
+    return static_cast<uint8_t>(character - caseOffset);
+}
+
+/**
+ * Uppercase `character` in place via transformAscii and char_to_upper.
+ *
+ * The result of transformAscii is deliberately discarded. A null pointer is
+ * returned as-is.
+ *
+ * @return The same pointer that was passed in.
+ */
+FudUtf8* utf8_to_upper(FudUtf8* character)
+{
+    if (character != nullptr) {
+        auto upperInPlace = [](Ascii& ascii) {
+            const auto raw = static_cast<uint8_t>(ascii.asChar());
+            ascii = Ascii{char_to_upper(raw)};
+        };
+        static_cast<void>(character->transformAscii(upperInPlace));
+    }
+
+    return character;
+}
+
+} // namespace fud