diff options
author | Dominick Allen <djallen@librehumanitas.org> | 2025-01-02 15:11:51 -0600 |
---|---|---|
committer | Dominick Allen <djallen@librehumanitas.org> | 2025-01-02 15:11:51 -0600 |
commit | 87071200872c2450c947047350132aee493033c1 (patch) | |
tree | 49109532d9bbd148b4e59043120037684093be33 /source/fud_csv.cpp | |
parent | 16379362c02a2472f00fac49cad62788547c9519 (diff) |
Get basic CSV parser operating.
Diffstat (limited to 'source/fud_csv.cpp')
-rw-r--r-- | source/fud_csv.cpp | 416 |
1 files changed, 384 insertions, 32 deletions
diff --git a/source/fud_csv.cpp b/source/fud_csv.cpp index 031fcbc..198aeca 100644 --- a/source/fud_csv.cpp +++ b/source/fud_csv.cpp @@ -17,76 +17,428 @@ #include "fud_csv.hpp" +#include "fud_print.hpp" + namespace fud { -FudStatus Csv::parseCsvFromFilename( +} // namespace fud + +namespace fud { + +Csv Csv::makeDefault() +{ + return Csv::makeWithSingleAllocator(globalFudAllocator); +} + +Csv Csv::makeWithSingleAllocator(Allocator& allocator) +{ + return Csv::make(allocator, allocator); +} + +Csv Csv::make(Allocator& entryAllocator, Allocator& bufferAllocator) +{ + Csv csv{}; + csv.entries = Vector<StringView>(entryAllocator); + csv.buffer = Vector<utf8>(bufferAllocator); + return csv; +} + +FudStatus Csv::parseFromFilename( Csv& csv, Option<TextBuffer&&> bufferOption, StringView filename, OpenFlags flags, - Option<int> dirFdOption) + Option<int> dirFdOption, + Option<size_t> maxExtraAttempts) { + debugPrint(u8"Enter parse from filename\n"); auto fileResult{RegularFile::open(filename, FileAccessMode::Read, flags, dirFdOption)}; if (fileResult.isError()) { + debugPrint(u8"Error opening file: {}\n", FudStatusToString(fileResult.getError())); return fileResult.takeError(); } if (bufferOption.hasValue()) { auto bufferedFile{BufferedRegularFile::make(fileResult.takeOkay(), std::move(bufferOption.value()))}; - return parseCsvFromBufferedFile(csv, bufferedFile); + return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts); } auto unbufferedFile{fileResult.takeOkay()}; - return parseCsvFromUnbufferedFile(csv, std::move(unbufferedFile)); + return parseFromUnbufferedFile(csv, std::move(unbufferedFile), maxExtraAttempts); } -enum class CsvTextState : uint8_t -{ - UnquotedField, - QuotedField, - Separator, - Newline, -}; +template <typename File> +DrainResult readHeader(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize); + +template <typename File> +DrainResult scanLine(const Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize); -FudStatus Csv::parseCsvFromBufferedFile(Csv& csv, BufferedRegularFile& file) +template <typename File> +FudStatus fillBuffer(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t rawSize); + +FudStatus Csv::parseFromBufferedFile(Csv& csv, BufferedRegularFile& file, Option<size_t> maxExtraAttempts) { - auto lineEnding{newlineText(csv.newlineDelimiter)}; - static_cast<void>(lineEnding); - DrainResult readResult{}; - while (true) { - utf8 letter{}; - auto drainResult = file.read(reinterpret_cast<std::byte*>(&letter), sizeof(letter), NullOpt); - readResult.status = drainResult.status; - readResult.bytesDrained += drainResult.bytesDrained; - // if (status - // REMOVE - break; + size_t rawSize = 0; + DrainResult readResult{readHeader(csv, file, maxExtraAttempts, rawSize)}; + if (readResult.status == FudStatus::Partial) { + // fix this up with filling out the first row + return readResult.status; } - size_t rawSize = 0; + if (readResult.status != FudStatus::Success && readResult.status != FudStatus::Partial) { + return readResult.status; + } + + debugPrint(u8"Working with {} columns\n", csv.numColumns); - while (true) { - rawSize++; - // REMOVE - break; + csv.numLines = 1; + while (readResult.status == FudStatus::Success) { + auto lineResult = scanLine(csv, file, maxExtraAttempts, rawSize); + readResult.status = lineResult.status; + readResult.bytesDrained += lineResult.bytesDrained; + if (readResult.status == FudStatus::Success || readResult.status == FudStatus::Partial) { + csv.numLines++; + debugPrint(u8"Read additional line - total of {}\n", csv.numLines); + } } - auto reserveStatus = csv.buffer.reserve(rawSize); + if (readResult.status == FudStatus::Empty || readResult.status == FudStatus::Partial) { + readResult.status = FudStatus::Success; + } + + if (readResult.status != FudStatus::Success) { + return readResult.status; + } + + if (std::numeric_limits<size_t>::max() / csv.numLines < csv.numColumns) { + debugPrint(u8"Fail: csv.numlines = {}, csv.numColumns = {}\n", csv.numLines, csv.numColumns); + return FudStatus::Failure; + } + + debugPrint(u8"Working with {} lines\n", csv.numLines); + + auto reserveStatus = csv.entries.reserve(csv.numLines * csv.numColumns); + if (reserveStatus != FudStatus::Success) { + return reserveStatus; + } + + reserveStatus = csv.buffer.reserve(rawSize); if (reserveStatus != FudStatus::Success) { return reserveStatus; } - return FudStatus::NotImplemented; + debugPrint(u8"Reserved space - {}\n", rawSize + 1); + + auto fillStatus = fillBuffer(csv, file, maxExtraAttempts, rawSize); + + if (fillStatus != FudStatus::Success) { + return fillStatus; + } + + return FudStatus::Success; } -FudStatus Csv::parseCsvFromUnbufferedFile(Csv& csv, RegularFile&& file) +FudStatus Csv::parseFromUnbufferedFile(Csv& csv, RegularFile&& file, Option<size_t> maxExtraAttempts) { static_cast<void>(csv); constexpr size_t BufferSize = 256; SimpleStackAllocator<BufferSize> stackAllocator{}; auto bufferedFile{BufferedRegularFile::make(std::move(file), TextBuffer{stackAllocator})}; - return parseCsvFromBufferedFile(csv, bufferedFile); + return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts); +} + +// NOLINTBEGIN(readability-function-cognitive-complexity) +template <typename File> +DrainResult readHeader(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize) +{ + debugPrint(u8"Entered read header\n"); + + DrainResult readResult{}; + + auto lineEnding{newlineText(csv.newlineDelimiter)}; + bool endOfLine{false}; + bool maybeNewline{false}; + bool inQuote{false}; + bool sawQuote{false}; + + while (not endOfLine) { + Utf8 utf8Char{}; + + auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)}; + readResult.status = drainResult.status; + readResult.bytesDrained += drainResult.bytesDrained; + if (readResult.status != FudStatus::Success) { + debugPrint(u8"Failed to read: {}", FudStatusToString(readResult.status)); + return readResult; + } + + if (csv.strictUtf8 && not utf8Char.valid()) { + debugPrint(u8"UTF8 invalid\n"); + readResult.status = FudStatus::Utf8Invalid; + return readResult; + } + + if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) { + rawSize += 2; + maybeNewline = false; + } else if (inQuote and utf8Char == csv.quoteCharacter) { + inQuote = false; + sawQuote = true; + } else if (inQuote) { + sawQuote = false; + rawSize += utf8Char.size(); + } else if (utf8Char == csv.quoteCharacter) { + inQuote = true; + if (sawQuote) { + rawSize += utf8Char.size(); + sawQuote = false; + } + } else if (utf8Char == csv.columnDelimiter) { + csv.numColumns++; + } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) { + if (lineEnding.length() == 1) { + csv.numColumns++; + endOfLine = true; + } else { + maybeNewline = true; + } + } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) { + maybeNewline = false; + endOfLine = true; + } else { + rawSize += utf8Char.size(); + } + } + + return readResult; +} + +template <typename File> +DrainResult scanLine(const Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize) +{ + debugPrint(u8"Entered scanLine\n"); + DrainResult readResult{}; + + auto lineEnding{newlineText(csv.newlineDelimiter)}; + bool endOfLine{false}; + bool maybeNewline{false}; + bool inQuote{false}; + bool sawQuote{false}; + bool addToSize{}; + size_t numColumns{0}; + + while (not endOfLine) { + addToSize = false; + Utf8 utf8Char{}; + auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)}; + readResult.status = drainResult.status; + readResult.bytesDrained += drainResult.bytesDrained; + if (readResult.status != FudStatus::Success) { + break; + } + + if (csv.strictUtf8 && not utf8Char.valid()) { + readResult.status = FudStatus::Utf8Invalid; + break; + } + + debugPrint(u8"{}", utf8Char.data()); + + if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) { + rawSize += 2; + maybeNewline = false; + } else if (inQuote and utf8Char == csv.quoteCharacter) { + inQuote = false; + sawQuote = true; + } else if (inQuote) { + addToSize = true; + sawQuote = false; + } else if (utf8Char == csv.quoteCharacter) { + inQuote = true; + if (sawQuote) { + addToSize = true; + sawQuote = false; + } + } else if (utf8Char == csv.columnDelimiter) { + numColumns++; + } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) { + if (lineEnding.length() == 1) { + numColumns++; + endOfLine = true; + } else { + maybeNewline = true; + } + } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) { + maybeNewline = false; + endOfLine = true; + } else { + addToSize = true; + } + + if (addToSize and numColumns < csv.numColumns) { + rawSize += utf8Char.size(); + } + + if (numColumns > csv.numColumns and csv.strictColumns) { + readResult.status = FudStatus::FormatInvalid; + break; + } + } + + if (numColumns > 0) { + debugPrint(u8"\n - Read line above with {} columns \n", numColumns); + } + + if (numColumns == 0) { + debugPrint(u8"Read no additional columns\n"); + readResult.status = FudStatus::Empty; + return readResult; + } + + if (numColumns != csv.numColumns and csv.strictColumns) { + readResult.status = FudStatus::FormatInvalid; + return readResult; + } + + return readResult; +} + +template <typename File> +FudStatus fillBuffer(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t rawSize) +{ + static_cast<void>(rawSize); + + auto seekStatus = file.seekStart(); + if (seekStatus != FudStatus::Success) { + return seekStatus; + } + + auto lineEnding{newlineText(csv.newlineDelimiter)}; + + size_t sizeCounter{0}; + size_t numColumns{0}; + bool maybeNewline{false}; + bool inQuote{false}; + bool sawQuote{false}; + bool addToSize{}; + bool terminateEntry{false}; + bool endLine{false}; + + StringView currentEntry{}; + currentEntry.m_data = csv.buffer.data() + sizeCounter; + + size_t numEntries = csv.numLines * csv.numColumns; + debugPrint(u8"Working over {} entries\n", numEntries); + for (size_t entryIndex = 0; entryIndex < numEntries;) { + addToSize = false; + Utf8 utf8Char{}; + auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)}; + if (drainResult.status != FudStatus::Success) { + return drainResult.status; + } + + if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) { + addToSize = true; + // TODO: this actually needs proper handling still + maybeNewline = false; + } else if (inQuote and utf8Char == csv.quoteCharacter) { + inQuote = false; + sawQuote = true; + } else if (inQuote) { + sawQuote = false; + addToSize = true; + } else if (utf8Char == csv.quoteCharacter) { + inQuote = true; + if (sawQuote) { + addToSize = true; + sawQuote = false; + } + } else if (utf8Char == csv.columnDelimiter) { + terminateEntry = true; + } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) { + if (lineEnding.length() == 1) { + terminateEntry = true; + endLine = true; + } else { + maybeNewline = true; + } + } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) { + maybeNewline = false; + terminateEntry = true; + endLine = true; + } else { + addToSize = true; + } + + if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) { + sawQuote = false; + } + + if (terminateEntry) { + auto pushStatus = csv.entries.pushBack(currentEntry); + if (pushStatus != FudStatus::Success) { + return pushStatus; + } + currentEntry.m_length = 0; + terminateEntry = false; + entryIndex++; + if (endLine) { + numColumns = 0; + endLine = false; + } + } + + if (addToSize and numColumns < csv.numColumns) { + FudStatus extendStatus{FudStatus::Success}; + auto charSize = utf8Char.size(); + switch (charSize) { + case 1: + extendStatus = csv.buffer.extend(Span<const utf8, 1>{utf8Char.data(), 1}); + break; + case 2: + extendStatus = csv.buffer.extend(Span<const utf8, 2>{utf8Char.data(), 2}); + break; + case 3: + extendStatus = csv.buffer.extend(Span<const utf8, 3>{utf8Char.data(), 3}); + break; + case 4: + extendStatus = csv.buffer.extend(Span<const utf8, 4>{utf8Char.data(), 4}); + break; + case 0: + default: + debugPrint(u8"Char size is ??? {}\n", charSize); + charSize = 0; + break; + } + if (extendStatus != FudStatus::Success) { + return extendStatus; + } + sizeCounter += charSize; + currentEntry.m_length += charSize; + } + } + + debugPrint( + u8"Buffer, with current size = {} and sizeCounter, = {}:\n-----\n{}\n-----\n", + csv.buffer.size(), + sizeCounter, + StringView{csv.buffer.size(), csv.buffer.data()}); + + const auto* data = csv.buffer.data(); + for (auto& entry : csv.entries) { + entry.m_data = data; + data += entry.m_length; + debugPrint(u8"Entry = {}\n", entry); + } + + if (rawSize != sizeCounter) { + debugPrint(u8"Invalid: rawSize == {}, sizeCounter == {}"); + return FudStatus::Failure; + } + + return FudStatus::Success; } +// NOLINTEND(readability-function-cognitive-complexity) } // namespace fud |