/*
 * libfud
 * Copyright 2025 Dominick Allen
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fud_csv.hpp"

#include <limits>
#include <utility>

// NOTE(review): the text this file was reviewed from had lost every template
// argument list. They are restored below wherever the surrounding code pins
// them down; the places that required an assumption are marked individually.

namespace fud {

/** \brief Create an empty Csv whose containers both use globalFudAllocator. */
Csv Csv::makeDefault()
{
    return Csv::makeWithSingleAllocator(globalFudAllocator);
}

/** \brief Create an empty Csv using @p allocator for both the entry table and the text buffer. */
Csv Csv::makeWithSingleAllocator(Allocator& allocator)
{
    return Csv::make(allocator, allocator);
}

/**
 * \brief Create an empty Csv with separate allocators.
 *
 * \param entryAllocator Allocator handed to the `entries` vector.
 * \param bufferAllocator Allocator handed to the `buffer` vector.
 */
Csv Csv::make(Allocator& entryAllocator, Allocator& bufferAllocator)
{
    Csv csv{};
    // decltype names each member's own vector type, so the element types do
    // not have to be repeated (or guessed) here.
    csv.entries = decltype(csv.entries)(entryAllocator);
    csv.buffer = decltype(csv.buffer)(bufferAllocator);
    return csv;
}

/**
 * \brief Open @p filename for reading and parse it through a small stack-backed buffer.
 *
 * \param csv Parser configuration on input; receives the parsed table.
 * \param filename Path handed to RegularFile::open.
 * \param flags Open flags handed to RegularFile::open.
 * \param dirFdOption Forwarded to RegularFile::open.
 * \param maxExtraAttempts Forwarded to every read performed while parsing.
 * \return The open error if the file cannot be opened, otherwise the result
 *         of parseFromUnbufferedFile.
 */
FudStatus Csv::parseFromFilenameUnbuffered(
    Csv& csv,
    StringView filename,
    OpenFlags flags,
    // NOTE(review): Option's template argument was not visible; <int> is
    // assumed from the parameter name -- confirm against fud_csv.hpp.
    Option<int> dirFdOption,
    size_t maxExtraAttempts)
{
    auto fileResult{RegularFile::open(filename, FileAccessMode::Read, flags, std::move(dirFdOption))};
    if (fileResult.isError()) {
        return fileResult.takeError();
    }
    auto unbufferedFile{fileResult.takeOkay()};
    return parseFromUnbufferedFile(csv, std::move(unbufferedFile), maxExtraAttempts);
}

/**
 * \brief Open @p filename for reading and parse it using the caller-supplied text buffer.
 *
 * \param csv Parser configuration on input; receives the parsed table.
 * \param bufferOption Buffer moved into the BufferedRegularFile wrapping the opened file.
 * \param filename Path handed to RegularFile::open.
 * \param flags Open flags handed to RegularFile::open.
 * \param dirFdOption Forwarded to RegularFile::open.
 * \param maxExtraAttempts Forwarded to every read performed while parsing.
 * \return The open error if the file cannot be opened, otherwise the result
 *         of parseFromBufferedFile.
 */
FudStatus Csv::parseFromFilenameBuffered(
    Csv& csv,
    TextBuffer&& bufferOption,
    StringView filename,
    OpenFlags flags,
    // NOTE(review): Option's template argument was not visible; <int> is
    // assumed from the parameter name -- confirm against fud_csv.hpp.
    Option<int> dirFdOption,
    size_t maxExtraAttempts)
{
    auto fileResult{RegularFile::open(filename, FileAccessMode::Read, flags, std::move(dirFdOption))};
    if (fileResult.isError()) {
        return fileResult.takeError();
    }
    auto bufferedFile{BufferedRegularFile::make(fileResult.takeOkay(), std::move(bufferOption))};
    return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts);
}

// Forward declarations of the parsing passes defined further down.
template <typename File>
DrainResult readHeader(Csv& csv, File& file, size_t maxExtraAttempts, size_t& rawSize);

template <typename File>
DrainResult scanLine(const Csv& csv, File& file, size_t maxExtraAttempts, size_t& rawSize);

template <typename File>
FudStatus fillBuffer(Csv& csv, File& file, size_t maxExtraAttempts, size_t rawSize);

/**
 * \brief Parse @p file into @p csv in two passes.
 *
 * The first pass (readHeader, then scanLine per line) counts the columns,
 * the lines, and the bytes of field content. Storage is then reserved once
 * and fillBuffer re-reads the file from the start to populate
 * `csv.buffer` and `csv.entries`.
 *
 * \return FudStatus::Success on success; the first non-success status of the
 *         header read, the line scan, a reservation, or the fill pass
 *         otherwise; FudStatus::Failure if numLines * numColumns would
 *         overflow size_t.
 */
FudStatus Csv::parseFromBufferedFile(Csv& csv, BufferedRegularFile& file, size_t maxExtraAttempts)
{
    size_t rawSize = 0;
    DrainResult readResult{readHeader(csv, file, maxExtraAttempts, rawSize)};
    if (readResult.status != FudStatus::Success) {
        // TODO: a Partial header is currently reported as-is; fix this up
        // with filling out the first row.
        return readResult.status;
    }

    csv.numLines = 1;
    while (readResult.status == FudStatus::Success) {
        auto lineResult = scanLine(csv, file, maxExtraAttempts, rawSize);
        readResult.status = lineResult.status;
        readResult.bytesDrained += lineResult.bytesDrained;
        if (readResult.status == FudStatus::Success || readResult.status == FudStatus::Partial) {
            csv.numLines++;
        }
    }

    // Empty and Partial end the scan without being errors (presumably this is
    // how the end of the input shows up -- confirm against readUtf8).
    if (readResult.status == FudStatus::Empty || readResult.status == FudStatus::Partial) {
        readResult.status = FudStatus::Success;
    }
    if (readResult.status != FudStatus::Success) {
        return readResult.status;
    }

    // Refuse tables whose entry count does not fit in size_t. numLines is at
    // least 1 here, so the division is safe.
    if (std::numeric_limits<size_t>::max() / csv.numLines < csv.numColumns) {
        return FudStatus::Failure;
    }

    auto reserveStatus = csv.entries.reserve(csv.numLines * csv.numColumns);
    if (reserveStatus != FudStatus::Success) {
        return reserveStatus;
    }
    reserveStatus = csv.buffer.reserve(rawSize);
    if (reserveStatus != FudStatus::Success) {
        return reserveStatus;
    }

    return fillBuffer(csv, file, maxExtraAttempts, rawSize);
}

/**
 * \brief Wrap @p file in a BufferedRegularFile backed by a 256-byte stack
 *        allocator and parse it with parseFromBufferedFile.
 */
FudStatus Csv::parseFromUnbufferedFile(Csv& csv, RegularFile&& file, size_t maxExtraAttempts)
{
    constexpr size_t BufferSize = 256;
    // The allocator must outlive bufferedFile; both live until this function returns.
    SimpleStackAllocator<BufferSize> stackAllocator{};
    auto bufferedFile{BufferedRegularFile::make(std::move(file), TextBuffer{stackAllocator})};
    return parseFromBufferedFile(csv, bufferedFile, maxExtraAttempts);
}

// NOLINTBEGIN(readability-function-cognitive-complexity)

/**
 * \brief Read the first line of @p file, counting its columns into
 *        `csv.numColumns` and its field-content bytes into @p rawSize.
 *
 * Quote handling: a quote character toggles quoted mode and is not counted
 * itself; a quote directly following a closing quote re-enters quoted mode
 * and counts as one literal quote character. Inside quotes every other
 * character counts as content, including delimiters and line endings.
 *
 * \param csv Supplies newlineDelimiter, quoteCharacter, columnDelimiter and
 *            strictUtf8; numColumns is incremented once per column found.
 * \param file Source providing readUtf8.
 * \param maxExtraAttempts Forwarded to readUtf8.
 * \param rawSize Incremented by the byte size of the header's field content.
 * \return Accumulated bytes drained with status Success once the line ending
 *         was consumed; otherwise the first non-success read status, or
 *         Utf8Invalid for an invalid character when strictUtf8 is set.
 */
template <typename File>
DrainResult readHeader(Csv& csv, File& file, size_t maxExtraAttempts, size_t& rawSize)
{
    DrainResult readResult{};
    auto lineEnding{newlineText(csv.newlineDelimiter)};
    bool endOfLine{false};
    bool maybeNewline{false};
    bool inQuote{false};
    bool sawQuote{false};

    while (not endOfLine) {
        Utf8 utf8Char{};
        auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)};
        readResult.status = drainResult.status;
        readResult.bytesDrained += drainResult.bytesDrained;
        if (readResult.status != FudStatus::Success) {
            return readResult;
        }
        if (csv.strictUtf8 && not utf8Char.valid()) {
            readResult.status = FudStatus::Utf8Invalid;
            return readResult;
        }

        if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) {
            // The first line-ending character was not followed by the second.
            // TODO: counts two bytes regardless of the current character's
            // size or role; fillBuffer appends only the current character, so
            // such input makes its final size check fail.
            rawSize += 2;
            maybeNewline = false;
        } else if (inQuote and utf8Char == csv.quoteCharacter) {
            inQuote = false;
            sawQuote = true;
        } else if (inQuote) {
            sawQuote = false;
            rawSize += utf8Char.size();
        } else if (utf8Char == csv.quoteCharacter) {
            inQuote = true;
            if (sawQuote) {
                // Doubled quote: contributes one literal quote character.
                rawSize += utf8Char.size();
                sawQuote = false;
            }
        } else if (utf8Char == csv.columnDelimiter) {
            csv.numColumns++;
        } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) {
            if (lineEnding.length() == 1) {
                csv.numColumns++;
                endOfLine = true;
            } else {
                maybeNewline = true;
            }
        } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) {
            maybeNewline = false;
            // The last column has no trailing delimiter, so the line ending
            // counts it -- for two-character endings as well as one-character
            // ones (previously omitted here).
            csv.numColumns++;
            endOfLine = true;
        } else {
            rawSize += utf8Char.size();
        }

        // Only a quote *immediately* after a closing quote is an escaped
        // quote; any other character ends that window. fillBuffer applies the
        // same rule, and the two passes must agree for rawSize to match.
        if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) {
            sawQuote = false;
        }
    }

    return readResult;
}

/**
 * \brief Scan one line after the header, adding its field-content bytes to
 *        @p rawSize without storing anything.
 *
 * Uses the same quote and line-ending rules as readHeader. Content is only
 * counted while the line's column index is below `csv.numColumns`.
 *
 * \param csv Supplies the delimiters, strictUtf8, strictColumns and the
 *            column count established by readHeader.
 * \param file Source providing readUtf8.
 * \param maxExtraAttempts Forwarded to readUtf8.
 * \param rawSize Incremented by the byte size of the line's field content.
 * \return Accumulated bytes drained with one of these statuses:
 *         - Empty if no column was completed on this line (this also
 *           replaces any read status that stopped the scan before the first
 *           column was completed);
 *         - FormatInvalid if strictColumns is set and the column count
 *           differs from `csv.numColumns`;
 *         - Utf8Invalid for an invalid character when strictUtf8 is set;
 *         - otherwise the status of the last read.
 */
template <typename File>
DrainResult scanLine(const Csv& csv, File& file, size_t maxExtraAttempts, size_t& rawSize)
{
    DrainResult readResult{};
    auto lineEnding{newlineText(csv.newlineDelimiter)};
    bool endOfLine{false};
    bool maybeNewline{false};
    bool inQuote{false};
    bool sawQuote{false};
    bool addToSize{};
    size_t numColumns{0};

    while (not endOfLine) {
        addToSize = false;
        Utf8 utf8Char{};
        auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)};
        readResult.status = drainResult.status;
        readResult.bytesDrained += drainResult.bytesDrained;
        if (readResult.status != FudStatus::Success) {
            break;
        }
        if (csv.strictUtf8 && not utf8Char.valid()) {
            readResult.status = FudStatus::Utf8Invalid;
            break;
        }

        if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) {
            // TODO: same approximation as in readHeader -- two bytes are
            // counted whatever the current character is.
            rawSize += 2;
            maybeNewline = false;
        } else if (inQuote and utf8Char == csv.quoteCharacter) {
            inQuote = false;
            sawQuote = true;
        } else if (inQuote) {
            addToSize = true;
            sawQuote = false;
        } else if (utf8Char == csv.quoteCharacter) {
            inQuote = true;
            if (sawQuote) {
                // Doubled quote: contributes one literal quote character.
                addToSize = true;
                sawQuote = false;
            }
        } else if (utf8Char == csv.columnDelimiter) {
            numColumns++;
        } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) {
            if (lineEnding.length() == 1) {
                numColumns++;
                endOfLine = true;
            } else {
                maybeNewline = true;
            }
        } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) {
            maybeNewline = false;
            // Count the last column for two-character line endings too, so
            // the result matches the one-character path and the number of
            // entries fillBuffer emits per line (previously omitted here).
            numColumns++;
            endOfLine = true;
        } else {
            addToSize = true;
        }

        // Keep the escaped-quote window one character wide, as fillBuffer does.
        if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) {
            sawQuote = false;
        }

        if (addToSize and numColumns < csv.numColumns) {
            rawSize += utf8Char.size();
        }
        if (numColumns > csv.numColumns and csv.strictColumns) {
            readResult.status = FudStatus::FormatInvalid;
            break;
        }
    }

    if (numColumns == 0) {
        readResult.status = FudStatus::Empty;
        return readResult;
    }
    if (numColumns != csv.numColumns and csv.strictColumns) {
        readResult.status = FudStatus::FormatInvalid;
        return readResult;
    }
    return readResult;
}

/**
 * \brief Second pass: rewind @p file and copy every field's content into
 *        `csv.buffer`, recording one StringView per field in `csv.entries`.
 *
 * Reads until `csv.numLines * csv.numColumns` entries have been recorded.
 * Entries are pushed with only their length set; their data pointers are
 * assigned in a final sweep over `csv.buffer`, after all appends are done.
 *
 * \param csv Supplies the parser settings and table dimensions; its buffer
 *            and entries are appended to.
 * \param file Source providing seekStart and readUtf8.
 * \param maxExtraAttempts Forwarded to readUtf8.
 * \param rawSize Content size predicted by the first pass.
 * \return Success if the table was filled and the bytes written equal
 *         @p rawSize; Failure on a size mismatch; otherwise the first
 *         non-success status from seeking, reading, or appending.
 */
template <typename File>
FudStatus fillBuffer(Csv& csv, File& file, size_t maxExtraAttempts, size_t rawSize)
{
    auto seekStatus = file.seekStart();
    if (seekStatus != FudStatus::Success) {
        return seekStatus;
    }

    auto lineEnding{newlineText(csv.newlineDelimiter)};
    size_t sizeCounter{0};
    // NOTE(review): numColumns is reset at each line end but never advanced,
    // so the `numColumns < csv.numColumns` guard below does not skip surplus
    // columns the way scanLine's accounting does -- confirm intended behaviour.
    size_t numColumns{0};
    bool maybeNewline{false};
    bool inQuote{false};
    bool sawQuote{false};
    bool addToSize{};
    bool terminateEntry{false};
    bool endLine{false};

    StringView currentEntry{};
    currentEntry.m_data = csv.buffer.data() + sizeCounter;

    size_t numEntries = csv.numLines * csv.numColumns;
    for (size_t entryIndex = 0; entryIndex < numEntries;) {
        addToSize = false;
        Utf8 utf8Char{};
        auto drainResult{file.readUtf8(utf8Char, maxExtraAttempts)};
        if (drainResult.status != FudStatus::Success) {
            return drainResult.status;
        }

        if (maybeNewline and utf8Char != Utf8{Ascii{lineEnding[1]}}) {
            addToSize = true; // TODO: this actually needs proper handling still
            maybeNewline = false;
        } else if (inQuote and utf8Char == csv.quoteCharacter) {
            inQuote = false;
            sawQuote = true;
        } else if (inQuote) {
            sawQuote = false;
            addToSize = true;
        } else if (utf8Char == csv.quoteCharacter) {
            inQuote = true;
            if (sawQuote) {
                // Doubled quote: emit one literal quote character.
                addToSize = true;
                sawQuote = false;
            }
        } else if (utf8Char == csv.columnDelimiter) {
            terminateEntry = true;
        } else if (not maybeNewline and utf8Char == Utf8{Ascii{lineEnding[0]}}) {
            if (lineEnding.length() == 1) {
                terminateEntry = true;
                endLine = true;
            } else {
                maybeNewline = true;
            }
        } else if (maybeNewline and utf8Char == Utf8{Ascii{lineEnding[1]}}) {
            maybeNewline = false;
            terminateEntry = true;
            endLine = true;
        } else {
            addToSize = true;
        }

        // Only a quote immediately after a closing quote is an escaped quote.
        if (not inQuote and sawQuote and utf8Char != csv.quoteCharacter) {
            sawQuote = false;
        }

        if (terminateEntry) {
            auto pushStatus = csv.entries.pushBack(currentEntry);
            if (pushStatus != FudStatus::Success) {
                return pushStatus;
            }
            currentEntry.m_length = 0;
            terminateEntry = false;
            entryIndex++;
            if (endLine) {
                numColumns = 0;
                endLine = false;
            }
        }

        if (addToSize and numColumns < csv.numColumns) {
            FudStatus extendStatus{FudStatus::Success};
            auto charSize = utf8Char.size();
            // NOTE(review): any template arguments on Span were not visible in
            // the reviewed text; the expressions are reproduced as seen.
            switch (charSize) {
            case 1:
                extendStatus = csv.buffer.extend(Span{utf8Char.data(), 1});
                break;
            case 2:
                extendStatus = csv.buffer.extend(Span{utf8Char.data(), 2});
                break;
            case 3:
                extendStatus = csv.buffer.extend(Span{utf8Char.data(), 3});
                break;
            case 4:
                extendStatus = csv.buffer.extend(Span{utf8Char.data(), 4});
                break;
            case 0:
            default:
                // Nothing is appended for any other size, so nothing is counted.
                charSize = 0;
                break;
            }
            if (extendStatus != FudStatus::Success) {
                return extendStatus;
            }
            sizeCounter += charSize;
            currentEntry.m_length += charSize;
        }
    }

    // Assign each entry its slice of the buffer now that appending is done.
    const auto* data = csv.buffer.data();
    for (auto& entry : csv.entries) {
        entry.m_data = data;
        data += entry.m_length;
    }

    // Both passes must agree on the amount of field content.
    if (rawSize != sizeCounter) {
        return FudStatus::Failure;
    }
    return FudStatus::Success;
}

// NOLINTEND(readability-function-cognitive-complexity)

/**
 * \brief Look up the field at (@p line, @p column); both indices are zero-based.
 *
 * \return Error{FudStatus::IndexInvalid} if @p line is not below numLines or
 *         @p column is not below numColumns; an empty Option for a
 *         zero-length field; otherwise an Option holding a view of the field.
 */
[[nodiscard]] Result<Option<StringView>, FudStatus> Csv::entry(size_t line, size_t column) const
{
    // Valid indices are [0, numLines) x [0, numColumns). The earlier `>`
    // comparison let line == numLines and column == numColumns through,
    // indexing past the table or into the following row.
    if (line >= numLines || column >= numColumns) {
        return Error{FudStatus::IndexInvalid};
    }

    StringView entry = entries[(line * numColumns) + column];
    if (entry.length() == 0) {
        return Okay{Option<StringView>{NullOpt}};
    }
    return Okay{Option<StringView>{entry}};
}

} // namespace fud