From 87071200872c2450c947047350132aee493033c1 Mon Sep 17 00:00:00 2001 From: Dominick Allen Date: Thu, 2 Jan 2025 15:11:51 -0600 Subject: Get basic CSV parser operating. --- source/fud_csv.cpp | 416 +++++++++++++++++++++++++++++++++++++++++---- source/fud_file.cpp | 118 +++++++++---- source/fud_string.cpp | 4 - source/fud_string_view.cpp | 2 +- 4 files changed, 467 insertions(+), 73 deletions(-) (limited to 'source') diff --git a/source/fud_csv.cpp b/source/fud_csv.cpp index 031fcbc..198aeca 100644 --- a/source/fud_csv.cpp +++ b/source/fud_csv.cpp @@ -17,76 +17,428 @@ #include "fud_csv.hpp" +#include "fud_print.hpp" + namespace fud { -FudStatus Csv::parseCsvFromFilename( +} // namespace fud + +namespace fud { + +Csv Csv::makeDefault() +{ + return Csv::makeWithSingleAllocator(globalFudAllocator); +} + +Csv Csv::makeWithSingleAllocator(Allocator& allocator) +{ + return Csv::make(allocator, allocator); +} + +Csv Csv::make(Allocator& entryAllocator, Allocator& bufferAllocator) +{ + Csv csv{}; + csv.entries = Vector(entryAllocator); + csv.buffer = Vector(bufferAllocator); + return csv; +} + +FudStatus Csv::parseFromFilename( Csv& csv, Option bufferOption, StringView filename, OpenFlags flags, - Option dirFdOption) + Option dirFdOption, + Option maxExtraAttempts) { + debugPrint(u8"Enter parse from filename\n"); auto fileResult{RegularFile::open(filename, FileAccessMode::Read, flags, dirFdOption)}; if (fileResult.isError()) { + debugPrint(u8"Error opening file: {}\n", FudStatusToString(fileResult.getError())); return fileResult.takeError(); } if (bufferOption.hasValue()) { auto bufferedFile{BufferedRegularFile::make(fileResult.takeOkay(), std::move(bufferOption.value()))}; - return parseCsvFromBufferedFile(csv, bufferedFile); + return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts); } auto unbufferedFile{fileResult.takeOkay()}; - return parseCsvFromUnbufferedFile(csv, std::move(unbufferedFile)); + return parseFromUnbufferedFile(csv, std::move(unbufferedFile), maxExtraAttempts); } -enum class CsvTextState : uint8_t -{ - UnquotedField, - QuotedField, - Separator, - Newline, -}; +template +DrainResult readHeader(Csv& csv, File& file, Option maxExtraAttempts, size_t& rawSize); + +template +DrainResult scanLine(const Csv& csv, File& file, Option maxExtraAttempts, size_t& rawSize); -FudStatus Csv::parseCsvFromBufferedFile(Csv& csv, BufferedRegularFile& file) +template +FudStatus fillBuffer(Csv& csv, File& file, Option maxExtraAttempts, size_t rawSize); + +FudStatus Csv::parseFromBufferedFile(Csv& csv, BufferedRegularFile& file, Option maxExtraAttempts) { - auto lineEnding{newlineText(csv.newlineDelimiter)}; - static_cast(lineEnding); - DrainResult readResult{}; - while (true) { - utf8 letter{}; - auto drainResult = file.read(reinterpret_cast(&letter), sizeof(letter), NullOpt); - readResult.status = drainResult.status; - readResult.bytesDrained += drainResult.bytesDrained; - // if (status - // REMOVE - break; + size_t rawSize = 0; + DrainResult readResult{readHeader(csv, file, maxExtraAttempts, rawSize)}; + if (readResult.status == FudStatus::Partial) { + // fix this up with filling out the first row + return readResult.status; } - size_t rawSize = 0; + if (readResult.status != FudStatus::Success && readResult.status != FudStatus::Partial) { + return readResult.status; + } + + debugPrint(u8"Working with {} columns\n", csv.numColumns); - while (true) { - rawSize++; - // REMOVE - break; + csv.numLines = 1; + while (readResult.status == FudStatus::Success) { + auto lineResult = scanLine(csv, file, maxExtraAttempts, rawSize); + readResult.status = lineResult.status; + readResult.bytesDrained += lineResult.bytesDrained; + if (readResult.status == FudStatus::Success || readResult.status == FudStatus::Partial) { + csv.numLines++; + debugPrint(u8"Read additional line - total of {}\n", csv.numLines); + } } - auto reserveStatus = csv.buffer.reserve(rawSize); + if (readResult.status == FudStatus::Empty || readResult.status == FudStatus::Partial) { + readResult.status = FudStatus::Success; + } + + if (readResult.status != FudStatus::Success) { + return readResult.status; + } + + if (std::numeric_limits::max() / csv.numLines < csv.numColumns) { + debugPrint(u8"Fail: csv.numlines = {}, csv.numColumns = {}\n", csv.numLines, csv.numColumns); + return FudStatus::Failure; + } + + debugPrint(u8"Working with {} lines\n", csv.numLines); + + auto reserveStatus = csv.entries.reserve(csv.numLines * csv.numColumns); + if (reserveStatus != FudStatus::Success) { + return reserveStatus; + } + + reserveStatus = csv.buffer.reserve(rawSize); if (reserveStatus != FudStatus::Success) { return reserveStatus; } - return FudStatus::NotImplemented; + debugPrint(u8"Reserved space - {}\n", rawSize + 1); + + auto fillStatus = fillBuffer(csv, file, maxExtraAttempts, rawSize); + + if (fillStatus != FudStatus::Success) { + return fillStatus; + } + + return FudStatus::Success; } -FudStatus Csv::parseCsvFromUnbufferedFile(Csv& csv, RegularFile&& file) +FudStatus Csv::parseFromUnbufferedFile(Csv& csv, RegularFile&& file, Option maxExtraAttempts) { static_cast(csv); constexpr size_t BufferSize = 256; SimpleStackAllocator stackAllocator{}; auto bufferedFile{BufferedRegularFile::make(std::move(file), TextBuffer{stackAllocator})}; - return parseCsvFromBufferedFile(csv, bufferedFile); + return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts); +} + +// NOLINTBEGIN(readability-function-cognitive-complexity) +template +DrainResult readHeader(Csv& csv, File& file, Option maxExtraAttempts, size_t& rawSize) +{ + debugPrint(u8"Entered read header\n"); + + DrainResult readResult{}; + + auto lineEnding{newlineText(csv.newlineDelimiter)}; + bool endOfLine{false}; + bool maybeNewline{false}; + bool inQuote{false}; + bool sawQuote{false}; + + while (not endOfLine) { + Utf8 utf8Char{}; + + auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)}; + readResult.status = drainResult.status; + readResult.bytesDrained += drainResult.bytesDrained; + if (readResult.status != FudStatus::Success) { + debugPrint(u8"Failed to read: {}", FudStatusToString(readResult.status)); + return readResult; + } + + if (csv.strictUtf8 && not utf8Char.valid()) { + debugPrint(u8"UTF8 invalid\n"); + readResult.status = FudStatus::Utf8Invalid; + return readResult; + } + + if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) { + rawSize += 2; + maybeNewline = false; + } else if (inQuote and utf8Char == csv.quoteCharacter) { + inQuote = false; + sawQuote = true; + } else if (inQuote) { + sawQuote = false; + rawSize += utf8Char.size(); + } else if (utf8Char == csv.quoteCharacter) { + inQuote = true; + if (sawQuote) { + rawSize += utf8Char.size(); + sawQuote = false; + } + } else if (utf8Char == csv.columnDelimiter) { + csv.numColumns++; + } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) { + if (lineEnding.length() == 1) { + csv.numColumns++; + endOfLine = true; + } else { + maybeNewline = true; + } + } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) { + maybeNewline = false; + endOfLine = true; + } else { + rawSize += utf8Char.size(); + } + } + + return readResult; +} + +template +DrainResult scanLine(const Csv& csv, File& file, Option maxExtraAttempts, size_t& rawSize) +{ + debugPrint(u8"Entered scanLine\n"); + DrainResult readResult{}; + + auto lineEnding{newlineText(csv.newlineDelimiter)}; + bool endOfLine{false}; + bool maybeNewline{false}; + bool inQuote{false}; + bool sawQuote{false}; + bool addToSize{}; + size_t numColumns{0}; + + while (not endOfLine) { + addToSize = false; + Utf8 utf8Char{}; + auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)}; + readResult.status = drainResult.status; + readResult.bytesDrained += drainResult.bytesDrained; + if (readResult.status != FudStatus::Success) { + break; + } + + if (csv.strictUtf8 && not utf8Char.valid()) { + readResult.status = FudStatus::Utf8Invalid; + break; + } + + debugPrint(u8"{}", utf8Char.data()); + + if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) { + rawSize += 2; + maybeNewline = false; + } else if (inQuote and utf8Char == csv.quoteCharacter) { + inQuote = false; + sawQuote = true; + } else if (inQuote) { + addToSize = true; + sawQuote = false; + } else if (utf8Char == csv.quoteCharacter) { + inQuote = true; + if (sawQuote) { + addToSize = true; + sawQuote = false; + } + } else if (utf8Char == csv.columnDelimiter) { + numColumns++; + } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) { + if (lineEnding.length() == 1) { + numColumns++; + endOfLine = true; + } else { + maybeNewline = true; + } + } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) { + maybeNewline = false; + endOfLine = true; + } else { + addToSize = true; + } + + if (addToSize and numColumns < csv.numColumns) { + rawSize += utf8Char.size(); + } + + if (numColumns > csv.numColumns and csv.strictColumns) { + readResult.status = FudStatus::FormatInvalid; + break; + } + } + + if (numColumns > 0) { + debugPrint(u8"\n - Read line above with {} columns \n", numColumns); + } + + if (numColumns == 0) { + debugPrint(u8"Read no additional columns\n"); + readResult.status = FudStatus::Empty; + return readResult; + } + + if (numColumns != csv.numColumns and csv.strictColumns) { + readResult.status = FudStatus::FormatInvalid; + return readResult; + } + + return readResult; +} + +template +FudStatus fillBuffer(Csv& csv, File& file, Option maxExtraAttempts, size_t rawSize) +{ + static_cast(rawSize); + + auto seekStatus = file.seekStart(); + if (seekStatus != FudStatus::Success) { + return seekStatus; + } + + auto lineEnding{newlineText(csv.newlineDelimiter)}; + + size_t sizeCounter{0}; + size_t numColumns{0}; + bool maybeNewline{false}; + bool inQuote{false}; + bool sawQuote{false}; + bool addToSize{}; + bool terminateEntry{false}; + bool endLine{false}; + + StringView currentEntry{}; + currentEntry.m_data = csv.buffer.data() + sizeCounter; + + size_t numEntries = csv.numLines * csv.numColumns; + debugPrint(u8"Working over {} entries\n", numEntries); + for (size_t entryIndex = 0; entryIndex < numEntries;) { + addToSize = false; + Utf8 utf8Char{}; + auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)}; + if (drainResult.status != FudStatus::Success) { + return drainResult.status; + } + + if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) { + addToSize = true; + // TODO: this actually needs proper handling still + maybeNewline = false; + } else if (inQuote and utf8Char == csv.quoteCharacter) { + inQuote = false; + sawQuote = true; + } else if (inQuote) { + sawQuote = false; + addToSize = true; + } else if (utf8Char == csv.quoteCharacter) { + inQuote = true; + if (sawQuote) { + addToSize = true; + sawQuote = false; + } + } else if (utf8Char == csv.columnDelimiter) { + terminateEntry = true; + } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) { + if (lineEnding.length() == 1) { + terminateEntry = true; + endLine = true; + } else { + maybeNewline = true; + } + } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) { + maybeNewline = false; + terminateEntry = true; + endLine = true; + } else { + addToSize = true; + } + + if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) { + sawQuote = false; + } + + if (terminateEntry) { + auto pushStatus = csv.entries.pushBack(currentEntry); + if (pushStatus != FudStatus::Success) { + return pushStatus; + } + currentEntry.m_length = 0; + terminateEntry = false; + entryIndex++; + if (endLine) { + numColumns = 0; + endLine = false; + } + } + + if (addToSize and numColumns < csv.numColumns) { + FudStatus extendStatus{FudStatus::Success}; + auto charSize = utf8Char.size(); + switch (charSize) { + case 1: + extendStatus = csv.buffer.extend(Span{utf8Char.data(), 1}); + break; + case 2: + extendStatus = csv.buffer.extend(Span{utf8Char.data(), 2}); + break; + case 3: + extendStatus = csv.buffer.extend(Span{utf8Char.data(), 3}); + break; + case 4: + extendStatus = csv.buffer.extend(Span{utf8Char.data(), 4}); + break; + case 0: + default: + debugPrint(u8"Char size is ??? {}\n", charSize); + charSize = 0; + break; + } + if (extendStatus != FudStatus::Success) { + return extendStatus; + } + sizeCounter += charSize; + currentEntry.m_length += charSize; + } + } + + debugPrint( + u8"Buffer, with current size = {} and sizeCounter, = {}:\n-----\n{}\n-----\n", + csv.buffer.size(), + sizeCounter, + StringView{csv.buffer.size(), csv.buffer.data()}); + + const auto* data = csv.buffer.data(); + for (auto& entry : csv.entries) { + entry.m_data = data; + data += entry.m_length; + debugPrint(u8"Entry = {}\n", entry); + } + + if (rawSize != sizeCounter) { + debugPrint(u8"Invalid: rawSize == {}, sizeCounter == {}"); + return FudStatus::Failure; + } + + return FudStatus::Success; } +// NOLINTEND(readability-function-cognitive-complexity) } // namespace fud diff --git a/source/fud_file.cpp b/source/fud_file.cpp index caf0f5a..ca6404d 100644 --- a/source/fud_file.cpp +++ b/source/fud_file.cpp @@ -552,11 +552,11 @@ FudStatus BufferedRegularFile::resizeBuffer(size_t size) return m_buffer.resize(size); } -DrainResult BufferedRegularFile::write(const std::byte* source, size_t length, Option maxExtraAttempts) +DrainResult BufferedRegularFile::validateBufferedIO(const std::byte* pointer, Operation requestedOperation) { DrainResult result{0, FudStatus::Success}; - if (source == nullptr) { + if (pointer == nullptr) { result.status = FudStatus::NullPointer; return result; } @@ -566,9 +566,25 @@ DrainResult BufferedRegularFile::write(const std::byte* source, size_t length, O return result; } - if (m_lastOperation != Operation::Write) { + if (requestedOperation == Operation::Read && m_lastOperation == Operation::Write && m_bufferLength > 0) { + result.status = FudStatus::OperationInvalid; + return result; + } + + if (m_lastOperation != requestedOperation) { m_bufferLength = 0; - m_lastOperation = Operation::Write; + m_bufferPosition = 0; + m_lastOperation = requestedOperation; + } + + return result; +} + +DrainResult BufferedRegularFile::write(const std::byte* source, size_t length, Option maxExtraAttempts) +{ + DrainResult result{validateBufferedIO(source, Operation::Write)}; + if (result.status != FudStatus::Success) { + return result; } if (length == 0) { @@ -628,46 +644,17 @@ DrainResult BufferedRegularFile::write(const std::byte* source, size_t length, O DrainResult BufferedRegularFile::read(std::byte* sink, size_t length, Option maxExtraAttempts) { auto extraAttempts = maxExtraAttempts.valueOr(0); - DrainResult result{0, FudStatus::Success}; + DrainResult result{validateBufferedIO(sink, Operation::Read)}; - if (sink == nullptr) { - result.status = FudStatus::NullPointer; - return result; - } - - if (not m_file.isOpen()) { - result.status = FudStatus::HandleInvalid; - return result; - } - - if (m_lastOperation == Operation::Write && m_bufferLength > 0) { - result.status = FudStatus::OperationInvalid; + if (result.status != FudStatus::Success) { return result; } - if (m_lastOperation != Operation::Read) { - m_lastOperation = Operation::Read; - m_bufferPosition = 0; - m_bufferLength = 0; - } - if (length == 0) { return result; } - if (m_bufferLength > 0 && m_bufferPosition < m_bufferLength) { - auto remainingLength = m_bufferLength - m_bufferPosition; - auto count = min(length, remainingLength); - - auto copyStatus = copyMem(sink, length, m_buffer.data() + m_bufferPosition, count); - fudAssert(copyStatus == FudStatus::Success); - - sink += count; - length -= count; - - m_bufferPosition += count; - result.bytesDrained += count; - } + drainReadBuffer(sink, length, result); fudAssert(length == 0 || m_bufferPosition == m_bufferLength); @@ -727,6 +714,65 @@ DrainResult BufferedRegularFile::read(std::byte* sink, size_t length, Option maxExtraAttempts) +{ + size_t extraAttempts{maxExtraAttempts.valueOr(0)}; + Array utf8Data{}; + auto drainResult = read(reinterpret_cast(utf8Data.data()), 1, maxExtraAttempts); + if (drainResult.status != FudStatus::Success) { + return drainResult; + } + + auto utf8Type = utf8TypeFromByte(utf8Data[0]); + uint8_t bytesToRead{0}; + switch (utf8Type) { + case Utf8Type::Ascii: + break; + case Utf8Type::Utf82Byte: + bytesToRead = 1; + break; + case Utf8Type::Utf83Byte: + bytesToRead = 2; + break; + case Utf8Type::Utf84Byte: + bytesToRead = 3; + break; + case Utf8Type::Invalid: + default: + sink = Utf8{Ascii{utf8Data[0]}}; + drainResult.status = FudStatus::Utf8Invalid; + return drainResult; + } + + if (bytesToRead > 0) { + auto utf8ReadResult = read(reinterpret_cast(utf8Data.data() + 1), bytesToRead, extraAttempts); + drainResult.status = utf8ReadResult.status; + drainResult.bytesDrained += utf8ReadResult.bytesDrained; + } + + sink = Utf8::make(utf8Data); + + return drainResult; +} + +void BufferedRegularFile::drainReadBuffer(std::byte*& sink, size_t& length, DrainResult& result) +{ + if (m_bufferLength > 0 && m_bufferPosition < m_bufferLength) { + auto remainingLength = m_bufferLength - m_bufferPosition; + auto count = min(length, remainingLength); + + auto copyStatus = copyMem(sink, length, m_buffer.data() + m_bufferPosition, count); + fudAssert(copyStatus == FudStatus::Success); + + sink += count; + length -= count; + + m_bufferPosition += count; + result.bytesDrained += count; + } +} + FudStatus BufferedRegularFile::setBuffer(Vector&& buffer, bool discardOldBuffer) { static_cast(buffer); diff --git a/source/fud_string.cpp b/source/fud_string.cpp index cd3e918..2a17201 100644 --- a/source/fud_string.cpp +++ b/source/fud_string.cpp @@ -131,10 +131,6 @@ StringResult String::from(StringView view, Allocator* allocator) return StringResult::error(FudStatus::ArgumentInvalid); } - if (view.nullTerminated()) { - return StringResult::error(FudStatus::ArgumentInvalid); - } - String output{}; output.m_allocator = reinterpret_cast(allocator); size_t outputCapacity = view.length() + 1U; diff --git a/source/fud_string_view.cpp b/source/fud_string_view.cpp index 090dd6d..ba88ad4 100644 --- a/source/fud_string_view.cpp +++ b/source/fud_string_view.cpp @@ -21,7 +21,7 @@ namespace fud { -StringView::StringView(const String& fudString) noexcept : StringView(fudString.length(), fudString.data()) +StringView::StringView(const String& fudString) noexcept : StringView{fudString.length(), fudString.data()} { } -- cgit v1.2.3