diff options
author | Dominick Allen <djallen@librehumanitas.org> | 2025-01-02 15:11:51 -0600 |
---|---|---|
committer | Dominick Allen <djallen@librehumanitas.org> | 2025-01-02 15:11:51 -0600 |
commit | 87071200872c2450c947047350132aee493033c1 (patch) | |
tree | 49109532d9bbd148b4e59043120037684093be33 /include/fud_utf8.hpp | |
parent | 16379362c02a2472f00fac49cad62788547c9519 (diff) |
Get basic CSV parser operating.
Diffstat (limited to 'include/fud_utf8.hpp')
-rw-r--r-- | include/fud_utf8.hpp | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/include/fud_utf8.hpp b/include/fud_utf8.hpp index 119640c..030164d 100644 --- a/include/fud_utf8.hpp +++ b/include/fud_utf8.hpp @@ -275,12 +275,37 @@ enum class Utf8Type : uint8_t Utf82Byte, Utf83Byte, Utf84Byte, + Invalid }; static_assert(Utf8TypeSet.m_values[0] == static_cast<uint8_t>(Utf8Type::Ascii)); static_assert(Utf8TypeSet.m_values[1] == static_cast<uint8_t>(Utf8Type::Utf82Byte)); static_assert(Utf8TypeSet.m_values[2] == static_cast<uint8_t>(Utf8Type::Utf83Byte)); static_assert(Utf8TypeSet.m_values[3] == static_cast<uint8_t>(Utf8Type::Utf84Byte)); +/* utf8TypeFromByte classifies `input` as the first byte of a UTF-8 sequence by comparing its high-order bits against the "Byte 1" column of the table below (B and E appear to be the first and last code point of each range). A byte matching none of the four patterns, including a continuation byte of the form 10xxxxxx, yields Utf8Type::Invalid. Only this single byte is inspected; no following bytes are checked here. NOTE(review): assumes `utf8` is an unsigned 8-bit type so the right shifts see no sign extension - confirm against its declaration. +| B | E | Byte 1 | Byte 2 | Byte 3 | Byte 4 +| U+0000 | U+007F | 0xxxxxxx | | | +| U+0080 | U+07FF | 110xxxxx | 10xxxxxx | | +| U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | +| U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx +*/ +constexpr Utf8Type utf8TypeFromByte(utf8 input) { + if ((input >> 7) == 0) { + return Utf8Type::Ascii; + } + if ((input >> 5) == 0b110) { + return Utf8Type::Utf82Byte; + } + if ((input >> 4) == 0b1110) { + return Utf8Type::Utf83Byte; + } + if ((input >> 3) == 0b11110) { + return Utf8Type::Utf84Byte; + } + + return Utf8Type::Invalid; +} + struct Utf8 { Utf8Variant m_variant{Utf8Variant{Ascii{}}}; @@ -445,6 +470,7 @@ struct Utf8 { static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[1]) << TwoByteShift | static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[2]) << OneByteShift | static_cast<int64_t>(std::get<Utf84Byte>(m_variant).characters[3]); + case Utf8Type::Invalid: default: // unlikely return -1; } |