summary | refs | log | tree | commit | diff
path: root/source/fud_csv.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/fud_csv.cpp')
-rw-r--r-- source/fud_csv.cpp 416
1 files changed, 384 insertions, 32 deletions
diff --git a/source/fud_csv.cpp b/source/fud_csv.cpp
index 031fcbc..198aeca 100644
--- a/source/fud_csv.cpp
+++ b/source/fud_csv.cpp
@@ -17,76 +17,428 @@
#include "fud_csv.hpp"
+#include "fud_print.hpp"
+
namespace fud {
-FudStatus Csv::parseCsvFromFilename(
+} // namespace fud
+
+namespace fud {
+
+/** Build a Csv whose storage is served by globalFudAllocator. */
+Csv Csv::makeDefault()
+{
+    Allocator& sharedAllocator = globalFudAllocator;
+    return makeWithSingleAllocator(sharedAllocator);
+}
+
+/** Build a Csv that uses the same allocator for its entries and its buffer. */
+Csv Csv::makeWithSingleAllocator(Allocator& allocator)
+{
+    Allocator& entryAllocator = allocator;
+    Allocator& bufferAllocator = allocator;
+    return make(entryAllocator, bufferAllocator);
+}
+
+/**
+ * Build a Csv with separate allocators.
+ *
+ * @param entryAllocator  allocator handed to the `entries` vector.
+ * @param bufferAllocator allocator handed to the `buffer` vector.
+ * @return a value-initialized Csv whose two vectors were replaced by empty
+ *         vectors constructed from the given allocators.
+ */
+Csv Csv::make(Allocator& entryAllocator, Allocator& bufferAllocator)
+{
+    Csv result{};
+
+    auto entryStorage = Vector<StringView>(entryAllocator);
+    result.entries = std::move(entryStorage);
+
+    auto bufferStorage = Vector<utf8>(bufferAllocator);
+    result.buffer = std::move(bufferStorage);
+
+    return result;
+}
+
+/**
+ * Open `filename` for reading and parse it into `csv`.
+ *
+ * The file is opened with RegularFile::open(filename, FileAccessMode::Read,
+ * flags, dirFdOption). If that fails, the open error is returned unchanged.
+ * When `bufferOption` holds a value, that buffer is moved into a
+ * BufferedRegularFile and parsing is delegated to parseFromBufferedFile;
+ * otherwise the raw file is handed to parseFromUnbufferedFile.
+ *
+ * @param csv              destination object, forwarded to the delegate.
+ * @param bufferOption     optional caller-supplied text buffer for buffered reads.
+ * @param filename         path passed to RegularFile::open.
+ * @param flags            open flags passed to RegularFile::open.
+ * @param dirFdOption      optional directory descriptor passed to RegularFile::open.
+ * @param maxExtraAttempts forwarded untouched to the delegate parser.
+ * @return the open error, or whatever the delegate parser returns.
+ */
+FudStatus Csv::parseFromFilename(
Csv& csv,
Option<TextBuffer&&> bufferOption,
StringView filename,
OpenFlags flags,
- Option<int> dirFdOption)
+ Option<int> dirFdOption,
+ Option<size_t> maxExtraAttempts)
{
+ debugPrint(u8"Enter parse from filename\n");
auto fileResult{RegularFile::open(filename, FileAccessMode::Read, flags, dirFdOption)};
if (fileResult.isError()) {
+ debugPrint(u8"Error opening file: {}\n", FudStatusToString(fileResult.getError()));
return fileResult.takeError();
}
if (bufferOption.hasValue()) {
auto bufferedFile{BufferedRegularFile::make(fileResult.takeOkay(), std::move(bufferOption.value()))};
- return parseCsvFromBufferedFile(csv, bufferedFile);
+ return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts);
}
auto unbufferedFile{fileResult.takeOkay()};
- return parseCsvFromUnbufferedFile(csv, std::move(unbufferedFile));
+ return parseFromUnbufferedFile(csv, std::move(unbufferedFile), maxExtraAttempts);
}
-enum class CsvTextState : uint8_t
-{
- UnquotedField,
- QuotedField,
- Separator,
- Newline,
-};
+// Forward declarations of the file-local helpers used by
+// Csv::parseFromBufferedFile; their definitions appear later in this file.
+
+// Scans the first line: updates csv.numColumns and accumulates into rawSize.
+template <typename File>
+DrainResult readHeader(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize);
+
+// Scans one further line without modifying csv; accumulates into rawSize.
+template <typename File>
+DrainResult scanLine(const Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize);
-FudStatus Csv::parseCsvFromBufferedFile(Csv& csv, BufferedRegularFile& file)
+// Seeks back to the start of the file and fills csv.buffer / csv.entries.
+template <typename File>
+FudStatus fillBuffer(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t rawSize);
+
+/**
+ * Parse a CSV stream from an already-opened buffered file into `csv`.
+ *
+ * The work is done in two passes over the file:
+ *  1. readHeader() and repeated scanLine() calls count columns and lines and
+ *     accumulate `rawSize`, the number of bytes to reserve in csv.buffer.
+ *  2. After reserving csv.entries and csv.buffer, fillBuffer() re-reads the
+ *     file and populates both.
+ *
+ * @param csv              destination; numColumns, numLines, entries and buffer are written.
+ * @param file             buffered file to read from.
+ * @param maxExtraAttempts forwarded to every helper call.
+ * @return FudStatus::Success on completion; otherwise the first non-success
+ *         status produced by a helper or a reserve() call, or
+ *         FudStatus::Failure when numLines * numColumns would overflow size_t.
+ */
+FudStatus Csv::parseFromBufferedFile(Csv& csv, BufferedRegularFile& file, Option<size_t> maxExtraAttempts)
{
- auto lineEnding{newlineText(csv.newlineDelimiter)};
- static_cast<void>(lineEnding);
- DrainResult readResult{};
- while (true) {
- utf8 letter{};
- auto drainResult = file.read(reinterpret_cast<std::byte*>(&letter), sizeof(letter), NullOpt);
- readResult.status = drainResult.status;
- readResult.bytesDrained += drainResult.bytesDrained;
- // if (status
- // REMOVE
- break;
+ size_t rawSize = 0;
+ DrainResult readResult{readHeader(csv, file, maxExtraAttempts, rawSize)};
+ // A Partial header read is currently surfaced to the caller as-is.
+ if (readResult.status == FudStatus::Partial) {
+ // fix this up with filling out the first row
+ return readResult.status;
}
- size_t rawSize = 0;
+ // NOTE(review): Partial has already returned above, so the second half of
+ // this condition is always true here.
+ if (readResult.status != FudStatus::Success && readResult.status != FudStatus::Partial) {
+ return readResult.status;
+ }
+
+ debugPrint(u8"Working with {} columns\n", csv.numColumns);
- while (true) {
- rawSize++;
- // REMOVE
- break;
+ // The header counts as the first line; keep scanning until scanLine
+ // reports anything other than Success.
+ csv.numLines = 1;
+ while (readResult.status == FudStatus::Success) {
+ auto lineResult = scanLine(csv, file, maxExtraAttempts, rawSize);
+ readResult.status = lineResult.status;
+ readResult.bytesDrained += lineResult.bytesDrained;
+ if (readResult.status == FudStatus::Success || readResult.status == FudStatus::Partial) {
+ csv.numLines++;
+ debugPrint(u8"Read additional line - total of {}\n", csv.numLines);
+ }
}
- auto reserveStatus = csv.buffer.reserve(rawSize);
+ // Empty and Partial end the scan loop and are treated as normal completion.
+ if (readResult.status == FudStatus::Empty || readResult.status == FudStatus::Partial) {
+ readResult.status = FudStatus::Success;
+ }
+
+ if (readResult.status != FudStatus::Success) {
+ return readResult.status;
+ }
+
+ // Reject sizes for which numLines * numColumns would overflow size_t.
+ if (std::numeric_limits<size_t>::max() / csv.numLines < csv.numColumns) {
+ debugPrint(u8"Fail: csv.numlines = {}, csv.numColumns = {}\n", csv.numLines, csv.numColumns);
+ return FudStatus::Failure;
+ }
+
+ debugPrint(u8"Working with {} lines\n", csv.numLines);
+
+ auto reserveStatus = csv.entries.reserve(csv.numLines * csv.numColumns);
+ if (reserveStatus != FudStatus::Success) {
+ return reserveStatus;
+ }
+
+ reserveStatus = csv.buffer.reserve(rawSize);
if (reserveStatus != FudStatus::Success) {
return reserveStatus;
}
- return FudStatus::NotImplemented;
+ // NOTE(review): the message prints rawSize + 1, but reserve() above was
+ // asked for rawSize.
+ debugPrint(u8"Reserved space - {}\n", rawSize + 1);
+
+ // Second pass: re-read the file and populate the buffer and entries.
+ auto fillStatus = fillBuffer(csv, file, maxExtraAttempts, rawSize);
+
+ if (fillStatus != FudStatus::Success) {
+ return fillStatus;
+ }
+
+ return FudStatus::Success;
}
-FudStatus Csv::parseCsvFromUnbufferedFile(Csv& csv, RegularFile&& file)
+/**
+ * Parse a CSV stream from an unbuffered file.
+ *
+ * Wraps `file` in a BufferedRegularFile whose TextBuffer is built from a local
+ * SimpleStackAllocator<256>, then delegates to parseFromBufferedFile.
+ *
+ * @param csv              destination, forwarded to parseFromBufferedFile.
+ * @param file             file to read; moved into the buffered wrapper.
+ * @param maxExtraAttempts forwarded to parseFromBufferedFile.
+ * @return the status returned by parseFromBufferedFile.
+ */
+FudStatus Csv::parseFromUnbufferedFile(Csv& csv, RegularFile&& file, Option<size_t> maxExtraAttempts)
{
+ // NOTE(review): csv is used below, so this cast has no effect.
static_cast<void>(csv);
constexpr size_t BufferSize = 256;
SimpleStackAllocator<BufferSize> stackAllocator{};
auto bufferedFile{BufferedRegularFile::make(std::move(file), TextBuffer{stackAllocator})};
- return parseCsvFromBufferedFile(csv, bufferedFile);
+ return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts);
+}
+
+// NOLINTBEGIN(readability-function-cognitive-complexity)
+/**
+ * Scan the first line of `file`, counting its columns into csv.numColumns and
+ * adding the number of payload bytes it contains to `rawSize`.
+ *
+ * Characters are read one at a time with file.readUtf8(). Quote characters,
+ * column delimiters and line-ending characters are not counted as payload;
+ * a doubled quote inside a quoted field counts as one quote character.
+ *
+ * @param csv              supplies newlineDelimiter, quoteCharacter,
+ *                         columnDelimiter and strictUtf8; numColumns is
+ *                         incremented once per field found.
+ * @param file             source providing readUtf8().
+ * @param maxExtraAttempts forwarded to file.readUtf8().
+ * @param rawSize          running payload byte count, incremented in place.
+ * @return the accumulated bytesDrained together with FudStatus::Success when
+ *         a full line ending was seen, FudStatus::Utf8Invalid when strictUtf8
+ *         is set and an invalid character was read, or the first non-success
+ *         status reported by readUtf8().
+ */
+template <typename File>
+DrainResult readHeader(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize)
+{
+    debugPrint(u8"Entered read header\n");
+
+    DrainResult readResult{};
+
+    auto lineEnding{newlineText(csv.newlineDelimiter)};
+    bool endOfLine{false};
+    bool maybeNewline{false};
+    bool inQuote{false};
+    bool sawQuote{false};
+
+    while (not endOfLine) {
+        Utf8 utf8Char{};
+
+        auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)};
+        readResult.status = drainResult.status;
+        readResult.bytesDrained += drainResult.bytesDrained;
+        if (readResult.status != FudStatus::Success) {
+            debugPrint(u8"Failed to read: {}", FudStatusToString(readResult.status));
+            return readResult;
+        }
+
+        if (csv.strictUtf8 && not utf8Char.valid()) {
+            debugPrint(u8"UTF8 invalid\n");
+            readResult.status = FudStatus::Utf8Invalid;
+            return readResult;
+        }
+
+        if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) {
+            // The first line-ending character was not followed by the second:
+            // count it and the current character as payload.
+            // NOTE(review): this assumes the current character is one byte wide
+            // and is plain data - confirm against fillBuffer.
+            rawSize += 2;
+            maybeNewline = false;
+        } else if (inQuote and utf8Char == csv.quoteCharacter) {
+            // Either the closing quote or the first half of an escaped quote.
+            inQuote = false;
+            sawQuote = true;
+        } else if (inQuote) {
+            sawQuote = false;
+            rawSize += utf8Char.size();
+        } else if (utf8Char == csv.quoteCharacter) {
+            inQuote = true;
+            if (sawQuote) {
+                // Second half of a doubled quote: one literal quote of payload.
+                rawSize += utf8Char.size();
+                sawQuote = false;
+            }
+        } else if (utf8Char == csv.columnDelimiter) {
+            csv.numColumns++;
+        } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) {
+            if (lineEnding.length() == 1) {
+                csv.numColumns++;
+                endOfLine = true;
+            } else {
+                maybeNewline = true;
+            }
+        } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) {
+            // A two-character line ending terminates the last field exactly as
+            // a one-character ending does, so it must be counted as well.
+            csv.numColumns++;
+            maybeNewline = false;
+            endOfLine = true;
+        } else {
+            rawSize += utf8Char.size();
+        }
+
+        // A closing quote followed by anything other than another quote was a
+        // real closing quote. Forget it, so that the opening quote of a later
+        // field is not mistaken for an escaped quote. fillBuffer applies the
+        // same rule, which keeps rawSize in step with the bytes it writes.
+        if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) {
+            sawQuote = false;
+        }
+    }
+
+    return readResult;
+}
+
+/**
+ * Scan one line after the header, counting its columns and adding its payload
+ * bytes to `rawSize`. `csv` is not modified.
+ *
+ * Payload bytes are only counted while the current column index is below
+ * csv.numColumns, so surplus columns do not contribute to `rawSize`.
+ *
+ * @param csv              supplies newlineDelimiter, quoteCharacter,
+ *                         columnDelimiter, strictUtf8, strictColumns and the
+ *                         header's numColumns.
+ * @param file             source providing readUtf8().
+ * @param maxExtraAttempts forwarded to file.readUtf8().
+ * @param rawSize          running payload byte count, incremented in place.
+ * @return the accumulated bytesDrained together with:
+ *         - FudStatus::Utf8Invalid when strictUtf8 is set and an invalid
+ *           character was read;
+ *         - FudStatus::Empty when no field terminator was seen at all;
+ *         - FudStatus::FormatInvalid when strictColumns is set and the column
+ *           count differs from csv.numColumns;
+ *         - otherwise the status of the last readUtf8() call.
+ */
+template <typename File>
+DrainResult scanLine(const Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t& rawSize)
+{
+    debugPrint(u8"Entered scanLine\n");
+    DrainResult readResult{};
+
+    auto lineEnding{newlineText(csv.newlineDelimiter)};
+    bool endOfLine{false};
+    bool maybeNewline{false};
+    bool inQuote{false};
+    bool sawQuote{false};
+    bool addToSize{};
+    size_t numColumns{0};
+
+    while (not endOfLine) {
+        addToSize = false;
+        Utf8 utf8Char{};
+        auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)};
+        readResult.status = drainResult.status;
+        readResult.bytesDrained += drainResult.bytesDrained;
+        if (readResult.status != FudStatus::Success) {
+            // NOTE(review): when no column has been seen yet, the code below
+            // relabels this status as Empty - confirm which statuses readUtf8
+            // reports at end of file.
+            break;
+        }
+
+        if (csv.strictUtf8 && not utf8Char.valid()) {
+            // Return immediately: the post-loop checks would otherwise rewrite
+            // this error to Empty (or FormatInvalid), and the caller treats
+            // Empty as a successful end of input.
+            readResult.status = FudStatus::Utf8Invalid;
+            return readResult;
+        }
+
+        debugPrint(u8"{}", utf8Char.data());
+
+        if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) {
+            // The first line-ending character was not followed by the second:
+            // count it and the current character as payload.
+            // NOTE(review): this assumes the current character is one byte wide
+            // and is plain data - confirm against fillBuffer.
+            rawSize += 2;
+            maybeNewline = false;
+        } else if (inQuote and utf8Char == csv.quoteCharacter) {
+            // Either the closing quote or the first half of an escaped quote.
+            inQuote = false;
+            sawQuote = true;
+        } else if (inQuote) {
+            addToSize = true;
+            sawQuote = false;
+        } else if (utf8Char == csv.quoteCharacter) {
+            inQuote = true;
+            if (sawQuote) {
+                // Second half of a doubled quote: one literal quote of payload.
+                addToSize = true;
+                sawQuote = false;
+            }
+        } else if (utf8Char == csv.columnDelimiter) {
+            numColumns++;
+        } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) {
+            if (lineEnding.length() == 1) {
+                numColumns++;
+                endOfLine = true;
+            } else {
+                maybeNewline = true;
+            }
+        } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) {
+            // A two-character line ending terminates the last field exactly as
+            // a one-character ending does, so it must be counted as well.
+            numColumns++;
+            maybeNewline = false;
+            endOfLine = true;
+        } else {
+            addToSize = true;
+        }
+
+        // A closing quote followed by anything other than another quote was a
+        // real closing quote. Forget it, so that the opening quote of a later
+        // field is not mistaken for an escaped quote. fillBuffer applies the
+        // same rule, which keeps rawSize in step with the bytes it writes.
+        if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) {
+            sawQuote = false;
+        }
+
+        if (addToSize and numColumns < csv.numColumns) {
+            rawSize += utf8Char.size();
+        }
+
+        if (numColumns > csv.numColumns and csv.strictColumns) {
+            readResult.status = FudStatus::FormatInvalid;
+            break;
+        }
+    }
+
+    if (numColumns > 0) {
+        debugPrint(u8"\n - Read line above with {} columns \n", numColumns);
+    }
+
+    if (numColumns == 0) {
+        debugPrint(u8"Read no additional columns\n");
+        readResult.status = FudStatus::Empty;
+        return readResult;
+    }
+
+    if (numColumns != csv.numColumns and csv.strictColumns) {
+        readResult.status = FudStatus::FormatInvalid;
+        return readResult;
+    }
+
+    return readResult;
+}
+
+/**
+ * Second pass: rewind `file`, copy every payload byte into csv.buffer and
+ * record one StringView per field in csv.entries.
+ *
+ * Exactly csv.numLines * csv.numColumns entries are produced. Within a row,
+ * fields beyond csv.numColumns are dropped (their bytes are not copied, which
+ * matches how scanLine sized the buffer) and rows with fewer fields are
+ * padded with zero-length entries. While reading, entries only carry a
+ * length; their data pointers are assigned in a final walk over csv.buffer.
+ *
+ * @param csv              supplies the delimiters, numLines and numColumns;
+ *                         buffer and entries are appended to.
+ * @param file             source providing seekStart() and readUtf8().
+ * @param maxExtraAttempts forwarded to file.readUtf8().
+ * @param rawSize          payload byte count computed by the first pass.
+ * @return FudStatus::Success when the number of bytes written equals
+ *         `rawSize`; FudStatus::Failure when it does not; otherwise the first
+ *         non-success status from seekStart(), readUtf8(), pushBack() or
+ *         extend().
+ */
+template <typename File>
+FudStatus fillBuffer(Csv& csv, File& file, Option<size_t> maxExtraAttempts, size_t rawSize)
+{
+    auto seekStatus = file.seekStart();
+    if (seekStatus != FudStatus::Success) {
+        return seekStatus;
+    }
+
+    auto lineEnding{newlineText(csv.newlineDelimiter)};
+
+    size_t sizeCounter{0};
+    size_t numColumns{0};
+    bool maybeNewline{false};
+    bool inQuote{false};
+    bool sawQuote{false};
+    bool addToSize{};
+    bool terminateEntry{false};
+    bool endLine{false};
+
+    StringView currentEntry{};
+    currentEntry.m_data = csv.buffer.data() + sizeCounter;
+
+    size_t numEntries = csv.numLines * csv.numColumns;
+    debugPrint(u8"Working over {} entries\n", numEntries);
+    for (size_t entryIndex = 0; entryIndex < numEntries;) {
+        addToSize = false;
+        Utf8 utf8Char{};
+        auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)};
+        if (drainResult.status != FudStatus::Success) {
+            return drainResult.status;
+        }
+
+        if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) {
+            addToSize = true;
+            // TODO: this actually needs proper handling still
+            // NOTE(review): the held-back first line-ending character is not
+            // written here, while the sizing pass counted two bytes for this
+            // case.
+            maybeNewline = false;
+        } else if (inQuote and utf8Char == csv.quoteCharacter) {
+            // Either the closing quote or the first half of an escaped quote.
+            inQuote = false;
+            sawQuote = true;
+        } else if (inQuote) {
+            sawQuote = false;
+            addToSize = true;
+        } else if (utf8Char == csv.quoteCharacter) {
+            inQuote = true;
+            if (sawQuote) {
+                // Second half of a doubled quote: emit one literal quote.
+                addToSize = true;
+                sawQuote = false;
+            }
+        } else if (utf8Char == csv.columnDelimiter) {
+            terminateEntry = true;
+        } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) {
+            if (lineEnding.length() == 1) {
+                terminateEntry = true;
+                endLine = true;
+            } else {
+                maybeNewline = true;
+            }
+        } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) {
+            maybeNewline = false;
+            terminateEntry = true;
+            endLine = true;
+        } else {
+            addToSize = true;
+        }
+
+        // A closing quote followed by a non-quote was a real closing quote.
+        if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) {
+            sawQuote = false;
+        }
+
+        if (terminateEntry) {
+            // Only the first csv.numColumns fields of a row are recorded;
+            // surplus fields were never sized by the scanning pass.
+            if (numColumns < csv.numColumns) {
+                auto pushStatus = csv.entries.pushBack(currentEntry);
+                if (pushStatus != FudStatus::Success) {
+                    return pushStatus;
+                }
+                entryIndex++;
+            }
+            numColumns++;
+            currentEntry.m_length = 0;
+            terminateEntry = false;
+            if (endLine) {
+                // Pad a short row with empty entries so every row occupies
+                // exactly csv.numColumns slots in csv.entries.
+                while (numColumns < csv.numColumns and entryIndex < numEntries) {
+                    auto padStatus = csv.entries.pushBack(currentEntry);
+                    if (padStatus != FudStatus::Success) {
+                        return padStatus;
+                    }
+                    entryIndex++;
+                    numColumns++;
+                }
+                numColumns = 0;
+                endLine = false;
+            }
+        }
+
+        if (addToSize and numColumns < csv.numColumns) {
+            FudStatus extendStatus{FudStatus::Success};
+            auto charSize = utf8Char.size();
+            // Span extents are compile-time values, hence one case per width.
+            switch (charSize) {
+            case 1:
+                extendStatus = csv.buffer.extend(Span<const utf8, 1>{utf8Char.data(), 1});
+                break;
+            case 2:
+                extendStatus = csv.buffer.extend(Span<const utf8, 2>{utf8Char.data(), 2});
+                break;
+            case 3:
+                extendStatus = csv.buffer.extend(Span<const utf8, 3>{utf8Char.data(), 3});
+                break;
+            case 4:
+                extendStatus = csv.buffer.extend(Span<const utf8, 4>{utf8Char.data(), 4});
+                break;
+            case 0:
+            default:
+                debugPrint(u8"Char size is ??? {}\n", charSize);
+                charSize = 0;
+                break;
+            }
+            if (extendStatus != FudStatus::Success) {
+                return extendStatus;
+            }
+            sizeCounter += charSize;
+            currentEntry.m_length += charSize;
+        }
+    }
+
+    debugPrint(
+        u8"Buffer, with current size = {} and sizeCounter, = {}:\n-----\n{}\n-----\n",
+        csv.buffer.size(),
+        sizeCounter,
+        StringView{csv.buffer.size(), csv.buffer.data()});
+
+    // Entries were recorded with lengths only; lay them out back to back over
+    // the final buffer contents.
+    const auto* data = csv.buffer.data();
+    for (auto& entry : csv.entries) {
+        entry.m_data = data;
+        data += entry.m_length;
+        debugPrint(u8"Entry = {}\n", entry);
+    }
+
+    if (rawSize != sizeCounter) {
+        // Both passes must agree on the payload size.
+        debugPrint(u8"Invalid: rawSize == {}, sizeCounter == {}", rawSize, sizeCounter);
+        return FudStatus::Failure;
+    }
+
+    return FudStatus::Success;
+}
+// NOLINTEND(readability-function-cognitive-complexity)
} // namespace fud