summary | refs | log | tree | commit | diff
path: root/include/fud_utf8.hpp
diff options
context:
space:
mode:
author: Dominick Allen <djallen@librehumanitas.org> 2025-01-02 15:11:51 -0600
committer: Dominick Allen <djallen@librehumanitas.org> 2025-01-02 15:11:51 -0600
commit: 87071200872c2450c947047350132aee493033c1 (patch)
tree: 49109532d9bbd148b4e59043120037684093be33 /include/fud_utf8.hpp
parent: 16379362c02a2472f00fac49cad62788547c9519 (diff)
Get basic CSV parser operating.
Diffstat (limited to 'include/fud_utf8.hpp')
-rw-r--r-- include/fud_utf8.hpp 26
1 files changed, 26 insertions, 0 deletions
diff --git a/include/fud_utf8.hpp b/include/fud_utf8.hpp
index 119640c..030164d 100644
--- a/include/fud_utf8.hpp
+++ b/include/fud_utf8.hpp
@@ -275,12 +275,37 @@ enum class Utf8Type : uint8_t
Utf82Byte,
Utf83Byte,
Utf84Byte,
+ Invalid
};
// Compile-time checks that Utf8TypeSet.m_values[0..3] hold, in order, the
// uint8_t values of Utf8Type::Ascii, Utf82Byte, Utf83Byte and Utf84Byte.
// No entry of Utf8TypeSet is checked against Utf8Type::Invalid here.
static_assert(Utf8TypeSet.m_values[0] == static_cast<uint8_t>(Utf8Type::Ascii));
static_assert(Utf8TypeSet.m_values[1] == static_cast<uint8_t>(Utf8Type::Utf82Byte));
static_assert(Utf8TypeSet.m_values[2] == static_cast<uint8_t>(Utf8Type::Utf83Byte));
static_assert(Utf8TypeSet.m_values[3] == static_cast<uint8_t>(Utf8Type::Utf84Byte));
+/*
+UTF-8 byte layout by code point range (First/Last = first and last code point of the range):
+| First   | Last     | Byte 1   | Byte 2   | Byte 3   | Byte 4   |
+| U+0000  | U+007F   | 0xxxxxxx |          |          |          |
+| U+0080  | U+07FF   | 110xxxxx | 10xxxxxx |          |          |
+| U+0800  | U+FFFF   | 1110xxxx | 10xxxxxx | 10xxxxxx |          |
+| U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |
+*/
+/**
+ * Classify a single byte by the UTF-8 lead-byte bit pattern it matches.
+ *
+ * Returns Utf8Type::Ascii for 0xxxxxxx, Utf8Type::Utf82Byte for 110xxxxx,
+ * Utf8Type::Utf83Byte for 1110xxxx and Utf8Type::Utf84Byte for 11110xxx.
+ * Every other byte (10xxxxxx and 11111xxx) yields Utf8Type::Invalid.
+ *
+ * Only the prefix bits are examined: bytes such as 0xC0, 0xC1 and 0xF5-0xF7,
+ * which cannot start a well-formed UTF-8 sequence, are still reported
+ * according to their prefix.
+ */
+constexpr Utf8Type utf8TypeFromByte(utf8 input) {
+    // Work on an unsigned 8-bit value and compare masked prefix bits rather
+    // than shifting `input` directly: a signed character type would promote
+    // bytes >= 0x80 to a negative int, and the shifted result would then never
+    // equal any of the expected prefixes.
+    const unsigned byte = static_cast<unsigned>(input) & 0xFFU;
+
+    if ((byte & 0b1000'0000U) == 0b0000'0000U) {
+        return Utf8Type::Ascii;
+    }
+    if ((byte & 0b1110'0000U) == 0b1100'0000U) {
+        return Utf8Type::Utf82Byte;
+    }
+    if ((byte & 0b1111'0000U) == 0b1110'0000U) {
+        return Utf8Type::Utf83Byte;
+    }
+    if ((byte & 0b1111'1000U) == 0b1111'0000U) {
+        return Utf8Type::Utf84Byte;
+    }
+
+    return Utf8Type::Invalid;
+}
+
struct Utf8 {
Utf8Variant m_variant{Utf8Variant{Ascii{}}};
@@ -445,6 +470,7 @@ struct Utf8 {
static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[1]) << TwoByteShift |
static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[2]) << OneByteShift |
static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[3]);
+ case Utf8Type::Invalid:
default: // unlikely
return -1;
}