diff options
author | Spartan322 <Megacake1234@gmail.com> | 2023-11-28 11:09:26 +0100 |
---|---|---|
committer | Spartan322 <Megacake1234@gmail.com> | 2024-05-09 22:11:26 +0200 |
commit | 757114a3c5b748567b42f273c7b78ca039ae983c (patch) | |
tree | e07390b682052129c91f4b157068bcdd84ceecb4 /src/openvic-dataloader/csv | |
parent | 7211a228e68c8a6b1ad1c1c5ec68c8d720b6d2ba (diff) |
Add `deps/dryad` -> https://github.com/Spartan322/dryadadd/dryad
Add `deps/fmt` -> https://github.com/fmtlib/fmt
Add `deps/range-v3` -> https://github.com/ericniebler/range-v3
Improve parser error and warning support
Update .clang-format
Update `deps/SCsub`
Diffstat (limited to 'src/openvic-dataloader/csv')
-rw-r--r-- | src/openvic-dataloader/csv/CsvGrammar.hpp | 201 | ||||
-rw-r--r-- | src/openvic-dataloader/csv/CsvParseState.hpp | 28 | ||||
-rw-r--r-- | src/openvic-dataloader/csv/Parser.cpp | 161 |
3 files changed, 231 insertions, 159 deletions
diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp index 712bddc..5451f26 100644 --- a/src/openvic-dataloader/csv/CsvGrammar.hpp +++ b/src/openvic-dataloader/csv/CsvGrammar.hpp @@ -7,14 +7,17 @@ #include <vector> #include <openvic-dataloader/csv/LineObject.hpp> +#include <openvic-dataloader/csv/Parser.hpp> #include <lexy/callback.hpp> #include <lexy/dsl.hpp> -#include "detail/LexyLitRange.hpp" +#include "detail/dsl.hpp" // Grammar Definitions // namespace ovdl::csv::grammar { + using EncodingType = ovdl::csv::EncodingType; + template<typename T> concept ParseChars = requires() { { T::character }; @@ -51,123 +54,117 @@ namespace ovdl::csv::grammar { .map<'"'>('"'); template<ParseOptions Options> - struct StringValue { - static constexpr auto rule = [] { - // Arbitrary code points - auto c = Options.character - Options.control; - - auto back_escape = lexy::dsl::backslash_escape // - .symbol<escaped_symbols>(); + struct CsvGrammar { + struct StringValue { + static constexpr auto rule = [] { + // Arbitrary code points + auto c = Options.character - Options.control; - auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // - .template symbol<escaped_quote>(); + auto back_escape = lexy::dsl::backslash_escape // + .symbol<escaped_symbols>(); - return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); - }(); - - static constexpr auto value = lexy::as_string<std::string>; - }; + auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // + .template symbol<escaped_quote>(); - template<ParseOptions Options> - struct PlainValue { - static constexpr auto rule = [] { - if constexpr (Options.SupportStrings) { - return lexy::dsl::identifier(Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline)); - } else { - auto escape_check_char = Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline); - auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; - auto id_segment = lexy::dsl::identifier(id_check_char); - auto escape_segement = lexy::dsl::token(escape_check_char); - auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement); - auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; - return lexy::dsl::list(id_segment | escape_rule); - } - }(); - static constexpr auto value = lexy::as_string<std::string>; - }; + return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); + }(); - template<ParseOptions Options> - struct Value { - static constexpr auto rule = [] { - if constexpr (Options.SupportStrings) { - return lexy::dsl::p<StringValue<Options>> | lexy::dsl::p<PlainValue<Options>>; - } else { - return lexy::dsl::p<PlainValue<Options>>; - } - }(); - static constexpr auto value = lexy::forward<std::string>; - }; - - template<ParseOptions Options> - struct SepConst { - static constexpr auto rule = lexy::dsl::lit_b<Options.SepChar>; - static constexpr auto value = lexy::constant(1); - }; + static constexpr auto value = lexy::as_string<std::string>; + }; - template<ParseOptions Options> - struct Seperator { - static constexpr auto rule = lexy::dsl::list(lexy::dsl::p<SepConst<Options>>); - static constexpr auto value = lexy::count; - }; + struct PlainValue { + static constexpr auto rule = [] { + if constexpr (Options.SupportStrings) { + return lexy::dsl::identifier(Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline)); + } else { + auto escape_check_char = Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline); + auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; + auto id_segment = lexy::dsl::identifier(id_check_char); + auto escape_segement = lexy::dsl::token(escape_check_char); + auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement); + auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; + return lexy::dsl::list(id_segment | escape_rule); + } + }(); + static constexpr auto value = lexy::as_string<std::string>; + }; - template<ParseOptions Options> - struct LineEnd { - static constexpr auto rule = lexy::dsl::list(lexy::dsl::p<Value<Options>>, lexy::dsl::trailing_sep(lexy::dsl::p<Seperator<Options>>)); - static constexpr auto value = lexy::fold_inplace<ovdl::csv::LineObject>( - std::initializer_list<ovdl::csv::LineObject::value_type> {}, - [](ovdl::csv::LineObject& result, auto&& arg) { - if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, std::size_t>) { - // Count seperators, adds to previous value, making it a position - using position_type = ovdl::csv::LineObject::position_type; - result.emplace_back(static_cast<position_type>(arg + result.back().first), ""); + struct Value { + static constexpr auto rule = [] { + if constexpr (Options.SupportStrings) { + return lexy::dsl::p<StringValue> | lexy::dsl::p<PlainValue>; } else { - if (result.empty()) result.emplace_back(0u, LEXY_MOV(arg)); - else { - auto& [pos, value] = result.back(); - value = arg; - } + return lexy::dsl::p<PlainValue>; } - }); - }; + }(); + static constexpr auto value = lexy::forward<std::string>; + }; - template<ParseOptions Options> - struct Line { - static constexpr auto suffix_setter(ovdl::csv::LineObject& line) { - auto& [position, value] = line.back(); - if (value.empty()) { - line.set_suffix_end(position); - line.pop_back(); - } else { - line.set_suffix_end(position + 1); - } + struct SepConst { + static constexpr auto rule = lexy::dsl::lit_b<Options.SepChar>; + static constexpr auto value = lexy::constant(1); }; - static constexpr auto rule = lexy::dsl::p<LineEnd<Options>> | lexy::dsl::p<Seperator<Options>> >> lexy::dsl::opt(lexy::dsl::p<LineEnd<Options>>); - static constexpr auto value = - lexy::callback<ovdl::csv::LineObject>( - [](ovdl::csv::LineObject&& line) { - suffix_setter(line); - return LEXY_MOV(line); - }, - [](std::size_t prefix_count, ovdl::csv::LineObject&& line) { - line.set_prefix_end(prefix_count); - // position needs to be adjusted to prefix - for (auto& [position, value] : line) { - position += prefix_count; + struct Seperator { + static constexpr auto rule = lexy::dsl::list(lexy::dsl::p<SepConst>); + static constexpr auto value = lexy::count; + }; + + struct LineEnd { + static constexpr auto rule = lexy::dsl::list(lexy::dsl::p<Value>, lexy::dsl::trailing_sep(lexy::dsl::p<Seperator>)); + static constexpr auto value = lexy::fold_inplace<ovdl::csv::LineObject>( + std::initializer_list<ovdl::csv::LineObject::value_type> {}, + [](ovdl::csv::LineObject& result, auto&& arg) { + if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, std::size_t>) { + // Count seperators, adds to previous value, making it a position + using position_type = ovdl::csv::LineObject::position_type; + result.emplace_back(static_cast<position_type>(arg + result.back().first), ""); + } else { + if (result.empty()) result.emplace_back(0u, LEXY_MOV(arg)); + else { + auto& [pos, value] = result.back(); + value = arg; + } } - suffix_setter(line); - return LEXY_MOV(line); - }, - [](std::size_t suffix_count, lexy::nullopt = {}) { - return ovdl::csv::LineObject(0, {}, suffix_count + 1); }); + }; + + struct Line { + static constexpr auto suffix_setter(ovdl::csv::LineObject& line) { + auto& [position, value] = line.back(); + if (value.empty()) { + line.set_suffix_end(position); + line.pop_back(); + } else { + line.set_suffix_end(position + 1); + } + }; + + static constexpr auto rule = lexy::dsl::p<LineEnd> | lexy::dsl::p<Seperator> >> lexy::dsl::opt(lexy::dsl::p<LineEnd>); + static constexpr auto value = + lexy::callback<ovdl::csv::LineObject>( + [](ovdl::csv::LineObject&& line) { + suffix_setter(line); + return LEXY_MOV(line); + }, + [](std::size_t prefix_count, ovdl::csv::LineObject&& line) { + line.set_prefix_end(prefix_count); + // position needs to be adjusted to prefix + for (auto& [position, value] : line) { + position += prefix_count; + } + suffix_setter(line); + return LEXY_MOV(line); + }, + [](std::size_t suffix_count, lexy::nullopt = {}) { + return ovdl::csv::LineObject(0, {}, suffix_count + 1); + }); + }; }; template<ParseOptions Options> struct File { - static constexpr auto rule = lexy::dsl::terminator(lexy::dsl::eof).opt_list( - lexy::dsl::p<Line<Options>> | lexy::dsl::newline - ); + static constexpr auto rule = lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p<typename CsvGrammar<Options>::Line> | lexy::dsl::newline); static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>; }; @@ -199,7 +196,7 @@ namespace ovdl::csv::grammar { namespace ovdl::csv::grammar::windows1252 { struct windows1252_t { - static constexpr auto character = detail::lexydsl::make_range<0x01, 0xFF>(); + static constexpr auto character = dsl::make_range<0x01, 0xFF>(); static constexpr auto control = lexy::dsl::ascii::control / lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / diff --git a/src/openvic-dataloader/csv/CsvParseState.hpp b/src/openvic-dataloader/csv/CsvParseState.hpp new file mode 100644 index 0000000..2390453 --- /dev/null +++ b/src/openvic-dataloader/csv/CsvParseState.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include <openvic-dataloader/File.hpp> +#include <openvic-dataloader/ParseState.hpp> +#include <openvic-dataloader/csv/LineObject.hpp> +#include <openvic-dataloader/csv/Parser.hpp> + +#include <lexy/encoding.hpp> + +template<ovdl::csv::EncodingType> +struct LexyEncodingFrom { +}; + +template<> +struct LexyEncodingFrom<ovdl::csv::EncodingType::Windows1252> { + using encoding = lexy::default_encoding; +}; + +template<> +struct LexyEncodingFrom<ovdl::csv::EncodingType::Utf8> { + using encoding = lexy::utf8_char_encoding; +}; + +template<ovdl::csv::EncodingType Encoding> +using CsvFile = ovdl::BasicFile<typename LexyEncodingFrom<Encoding>::encoding, std::vector<ovdl::csv::LineObject>>; + +template<ovdl::csv::EncodingType Encoding> +using CsvParseState = ovdl::FileParseState<CsvFile<Encoding>>;
\ No newline at end of file diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp index 0ca3402..849ea05 100644 --- a/src/openvic-dataloader/csv/Parser.cpp +++ b/src/openvic-dataloader/csv/Parser.cpp @@ -1,47 +1,34 @@ -#include <memory> #include <vector> +#include <openvic-dataloader/File.hpp> #include <openvic-dataloader/csv/LineObject.hpp> #include <openvic-dataloader/csv/Parser.hpp> -#include <openvic-dataloader/detail/ClassExport.hpp> +#include <openvic-dataloader/detail/LexyReportError.hpp> +#include <openvic-dataloader/detail/OStreamOutputIterator.hpp> +#include <openvic-dataloader/detail/utility/Utility.hpp> #include <lexy/action/parse.hpp> #include <lexy/encoding.hpp> #include <lexy/input/buffer.hpp> #include <lexy/input/file.hpp> -#include "csv/CsvGrammar.hpp" -#include "detail/BasicBufferHandler.hpp" -#include "detail/LexyReportError.hpp" -#include "detail/OStreamOutputIterator.hpp" +#include "CsvGrammar.hpp" +#include "CsvParseState.hpp" +#include "detail/NullBuff.hpp" +#include "detail/ParseHandler.hpp" using namespace ovdl; using namespace ovdl::csv; -/// BufferHandler /// - -template<EncodingType> -struct LexyEncodingFrom { -}; - -template<> -struct LexyEncodingFrom<EncodingType::Windows1252> { - using encoding = lexy::default_encoding; -}; - -template<> -struct LexyEncodingFrom<EncodingType::Utf8> { - using encoding = lexy::utf8_char_encoding; -}; +/// ParseHandler /// template<EncodingType Encoding> -class Parser<Encoding>::BufferHandler final : public detail::BasicBufferHandler<typename LexyEncodingFrom<Encoding>::encoding> { -public: - template<typename Node, typename ErrorCallback> - std::optional<std::vector<ParseError>> parse(const ErrorCallback& callback) { - auto result = lexy::parse<Node>(this->_buffer, callback); +struct Parser<Encoding>::ParseHandler final : detail::BasicFileParseHandler<CsvParseState<Encoding>> { + template<typename Node> + std::optional<DiagnosticLogger::error_range> parse() { + auto result = lexy::parse<Node>(this->buffer(), *this->_parse_state, this->_parse_state->logger().error_callback()); if (!result) { - return result.errors(); + return this->_parse_state->logger().get_errors(); } _lines = std::move(result.value()); return std::nullopt; @@ -59,8 +46,14 @@ private: template<EncodingType Encoding> Parser<Encoding>::Parser() - : _buffer_handler(std::make_unique<BufferHandler>()) { - set_error_log_to_stderr(); + : _parse_handler(std::make_unique<ParseHandler>()) { + set_error_log_to_null(); +} + +template<EncodingType Encoding> +Parser<Encoding>::Parser(std::basic_ostream<char>& error_stream) + : _parse_handler(std::make_unique<ParseHandler>()) { + set_error_log_to(error_stream); } template<EncodingType Encoding> @@ -115,28 +108,31 @@ Parser<Encoding> Parser<Encoding>::from_file(const std::filesystem::path& path) /// template<EncodingType Encoding> template<typename... Args> -constexpr void Parser<Encoding>::_run_load_func(detail::LoadCallback<BufferHandler, Args...> auto func, Args... args) { - _warnings.clear(); - _errors.clear(); +constexpr void Parser<Encoding>::_run_load_func(detail::LoadCallback<ParseHandler, Args...> auto func, Args... args) { _has_fatal_error = false; - if (auto error = func(_buffer_handler.get(), std::forward<Args>(args)...); error) { - _has_fatal_error = error.value().type == ParseError::Type::Fatal; - _errors.push_back(error.value()); - _error_stream.get() << "Error: " << _errors.back().message << '\n'; + auto error = func(_parse_handler.get(), std::forward<Args>(args)...); + auto error_message = _parse_handler->make_error_from(error); + if (!error_message.empty()) { + _has_error = true; + _has_fatal_error = true; + _parse_handler->parse_state().logger().template create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message)); + } + if (has_error() && &_error_stream.get() != &detail::cnull) { + print_errors_to(_error_stream.get()); } } template<EncodingType Encoding> constexpr Parser<Encoding>& Parser<Encoding>::load_from_buffer(const char* data, std::size_t size) { // Type can't be deduced? - _run_load_func(std::mem_fn(&BufferHandler::load_buffer_size), data, size); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size); return *this; } template<EncodingType Encoding> constexpr Parser<Encoding>& Parser<Encoding>::load_from_buffer(const char* start, const char* end) { // Type can't be deduced? - _run_load_func(std::mem_fn(&BufferHandler::load_buffer), start, end); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end); return *this; } @@ -149,7 +145,7 @@ template<EncodingType Encoding> constexpr Parser<Encoding>& Parser<Encoding>::load_from_file(const char* path) { _file_path = path; // Type can be deduced?? - _run_load_func(std::mem_fn(&BufferHandler::load_file), path); + _run_load_func(std::mem_fn(&ParseHandler::load_file), path); return *this; } @@ -159,38 +155,34 @@ Parser<Encoding>& Parser<Encoding>::load_from_file(const std::filesystem::path& } template<EncodingType Encoding> -constexpr Parser<Encoding>& Parser<Encoding>::load_from_file(const detail::Has_c_str auto& path) { - return load_from_file(path.c_str()); -} - -template<EncodingType Encoding> bool Parser<Encoding>::parse_csv(bool handle_strings) { - if (!_buffer_handler->is_valid()) { + if (!_parse_handler->is_valid()) { return false; } - std::optional<std::vector<ParseError>> errors; - auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream }); + std::optional<Parser<Encoding>::error_range> errors; + // auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream }); if constexpr (Encoding == EncodingType::Windows1252) { if (handle_strings) - errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(report_error); + errors = _parse_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(); else - errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(report_error); + errors = _parse_handler->template parse<csv::grammar::windows1252::SemiColonFile>(); } else { if (handle_strings) - errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(report_error); + errors = _parse_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(); else - errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(report_error); + errors = _parse_handler->template parse<csv::grammar::utf8::SemiColonFile>(); } - if (errors) { - _errors.reserve(errors->size()); - for (auto& err : errors.value()) { - _has_fatal_error |= err.type == ParseError::Type::Fatal; - _errors.push_back(err); + _has_error = _parse_handler->parse_state().logger().errored(); + _has_warning = _parse_handler->parse_state().logger().warned(); + if (!errors->empty()) { + _has_fatal_error = true; + if (&_error_stream.get() != &detail::cnull) { + print_errors_to(_error_stream); } return false; } - _lines = std::move(_buffer_handler->get_lines()); + _lines = std::move(_parse_handler->get_lines()); return true; } @@ -199,5 +191,60 @@ const std::vector<csv::LineObject>& Parser<Encoding>::get_lines() const { return _lines; } +template<EncodingType Encoding> +typename Parser<Encoding>::error_range Parser<Encoding>::get_errors() const { + return _parse_handler->parse_state().logger().get_errors(); +} + +template<EncodingType Encoding> +const FilePosition Parser<Encoding>::get_error_position(const error::Error* error) const { + if (!error || !error->is_linked_in_tree()) { + return {}; + } + auto err_location = _parse_handler->parse_state().logger().location_of(error); + if (err_location.is_synthesized()) { + return {}; + } + + auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin()); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; + if (err_location.begin() < err_location.end()) { + auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor()); + result.end_line = loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; +} + +template<EncodingType Encoding> +void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const { + auto errors = get_errors(); + if (errors.empty()) return; + for (const auto error : errors) { + dryad::visit_tree( + error, + [&](const error::BufferError* buffer_error) { + stream << buffer_error->message() << '\n'; + }, + [&](const error::ParseError* parse_error) { + stream << parse_error->message() << '\n'; + }, + [&](dryad::child_visitor<error::ErrorKind> visitor, const error::Semantic* semantic) { + stream << semantic->message() << '\n'; + auto annotations = semantic->annotations(); + if (annotations.empty()) return; + for (auto annotation : annotations) { + visitor(annotation); + } + }, + [&](const error::PrimaryAnnotation* primary) { + stream << primary->message() << '\n'; + }, + [&](const error::SecondaryAnnotation* secondary) { + stream << secondary->message() << '\n'; + }); + } +} + template class ovdl::csv::Parser<EncodingType::Windows1252>; template class ovdl::csv::Parser<EncodingType::Utf8>;
\ No newline at end of file |