diff options
-rw-r--r-- | .clang-format | 6 | ||||
-rw-r--r-- | .gitmodules | 4 | ||||
-rw-r--r-- | deps/SCsub | 28 | ||||
m--------- | deps/fmt | 0 | ||||
-rw-r--r-- | include/openvic-dataloader/csv/Parser.hpp | 44 | ||||
-rw-r--r-- | src/openvic-dataloader/csv/CsvGrammar.hpp | 9 | ||||
-rw-r--r-- | src/openvic-dataloader/csv/Grammar.inc | 115 | ||||
-rw-r--r-- | src/openvic-dataloader/csv/Parser.cpp | 14 |
8 files changed, 173 insertions, 47 deletions
diff --git a/.clang-format b/.clang-format index 86fc638..bfd1ace 100644 --- a/.clang-format +++ b/.clang-format @@ -55,7 +55,9 @@ IncludeCategories: Priority: 3 - Regex: ^<lexy/ Priority: 4 - - Regex: ^"openvic-dataloader/ + - Regex: ^<fmt/ Priority: 5 - - Regex: .* + - Regex: ^"openvic-dataloader/ Priority: 6 + - Regex: .* + Priority: 7 diff --git a/.gitmodules b/.gitmodules index 0a1353b..796fcc8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,7 @@ [submodule "scripts"] path = scripts url = https://github.com/OpenVicProject/scripts +[submodule "deps/fmt"] + path = deps/fmt + url = https://github.com/fmtlib/fmt + ignore = dirty @@ -41,5 +41,31 @@ def build_lexy(env): env.Append(LIBPATH=[lexy_env.Dir("lexy/src")]) env.Prepend(LIBS=[library_name]) +def build_fmt(env): + fmt_env = env.Clone() -build_lexy(env)
\ No newline at end of file + if fmt_env.get("is_msvc", False): + fmt_env.Append(CXXFLAGS=["/std:c++20"]) + else: + fmt_env.Append(CXXFLAGS=["-std=c++20"]) + + paths = ["fmt/include", "fmt/src"] + fmt_env.Append(CPPPATH=[[fmt_env.Dir(p) for p in paths]]) + sources = env.GlobRecursive("*.cc", paths, exclude=["fmt.cc"]) + env.fmt_sources = sources + library_name = "libfmt" + env["LIBSUFFIX"] + library = fmt_env.StaticLibrary(target="fmt/src/" + library_name, source=sources) + Default(library) + + env.Append(CPPPATH=[fmt_env.Dir("fmt/include")]) + if env.get("is_msvc", False): + env.Append(CXXFLAGS=["/external:I", fmt_env.Dir("fmt/include"), "/external:W0"]) + else: + env.Append(CXXFLAGS=["-isystem", fmt_env.Dir("fmt/include")]) + env.Append(CXXFLAGS=[""]) + env.Append(LIBPATH=[fmt_env.Dir("fmt/src")]) + env.Prepend(LIBS=[library_name]) + + +build_lexy(env) +build_fmt(env)
\ No newline at end of file diff --git a/deps/fmt b/deps/fmt new file mode 160000 +Subproject f5e54359df4c26b6230fc61d38aa29458139308 diff --git a/include/openvic-dataloader/csv/Parser.hpp b/include/openvic-dataloader/csv/Parser.hpp index fffd92a..544f0b0 100644 --- a/include/openvic-dataloader/csv/Parser.hpp +++ b/include/openvic-dataloader/csv/Parser.hpp @@ -1,5 +1,11 @@ #pragma once +#include <functional> +#include <string_view> +#include <unordered_map> +#include <unordered_set> +#include <vector> + #include <openvic-dataloader/csv/LineObject.hpp> #include <openvic-dataloader/detail/BasicParser.hpp> @@ -9,9 +15,42 @@ namespace ovdl::csv { Utf8 }; + struct string_hash { + using is_transparent = void; + [[nodiscard]] size_t operator()(const char* txt) const { + return std::hash<std::string_view> {}(txt); + } + [[nodiscard]] size_t operator()(std::string_view txt) const { + return std::hash<std::string_view> {}(txt); + } + [[nodiscard]] size_t operator()(std::string& txt) const { + return std::hash<std::string> {}(txt); + } + }; + template<EncodingType Encoding = EncodingType::Windows1252> class Parser final : public detail::BasicParser { public: + struct State { + std::unordered_map<std::string, std::string, string_hash, std::equal_to<>> escape_values; + + inline bool has_value(std::string_view key) const { + return escape_values.find(key) != escape_values.end(); + } + + inline decltype(escape_values)::const_iterator find_value(std::string_view key) const { + return escape_values.find(key); + } + + inline decltype(escape_values)::const_iterator begin() const { + return escape_values.begin(); + } + + inline decltype(escape_values)::const_iterator end() const { + return escape_values.end(); + } + }; + Parser(); static Parser from_buffer(const char* data, std::size_t size); @@ -30,6 +69,10 @@ namespace ovdl::csv { bool parse_csv(bool handle_strings = false); + void add_escape_value(std::string_view key, std::string_view value); + void remove_escape_value(std::string_view key, std::string_view value); + void clear_escape_values(); + const std::vector<csv::LineObject>& get_lines() const; Parser(Parser&&); @@ -41,6 +84,7 @@ namespace ovdl::csv { class BufferHandler; std::unique_ptr<BufferHandler> _buffer_handler; std::vector<csv::LineObject> _lines; + State _parser_state; template<typename... Args> constexpr void _run_load_func(detail::LoadCallback<BufferHandler, Args...> auto func, Args... args); diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp index 8f8da82..bfae8d0 100644 --- a/src/openvic-dataloader/csv/CsvGrammar.hpp +++ b/src/openvic-dataloader/csv/CsvGrammar.hpp @@ -1,16 +1,21 @@ #pragma once #include <initializer_list> +#include <memory> #include <string> +#include <string_view> #include <tuple> #include <type_traits> #include <vector> #include <openvic-dataloader/csv/LineObject.hpp> +#include <openvic-dataloader/csv/Parser.hpp> #include <lexy/callback.hpp> #include <lexy/dsl.hpp> +#include <fmt/format.h> + #include "detail/LexyLitRange.hpp" // Grammar Definitions // @@ -20,6 +25,8 @@ namespace ovdl::csv::grammar::windows1252 { lexy::dsl::ascii::control / lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>; + constexpr auto id_head = lexy::dsl::ascii::alpha_underscore; + constexpr auto id_tail = lexy::dsl::ascii::alpha_digit_underscore; #include "Grammar.inc" } @@ -27,6 +34,8 @@ namespace ovdl::csv::grammar::windows1252 { namespace ovdl::csv::grammar::utf8 { constexpr auto character = lexy::dsl::unicode::character; constexpr auto control = lexy::dsl::unicode::control; + constexpr auto id_head = lexy::dsl::unicode::xid_start_underscore; + constexpr auto id_tail = lexy::dsl::unicode::xid_continue; #include "Grammar.inc" }
\ No newline at end of file diff --git a/src/openvic-dataloader/csv/Grammar.inc b/src/openvic-dataloader/csv/Grammar.inc index 6dd4158..7de9e81 100644 --- a/src/openvic-dataloader/csv/Grammar.inc +++ b/src/openvic-dataloader/csv/Grammar.inc @@ -4,16 +4,21 @@ // Includes to keep file errors small #include <initializer_list> +#include <memory> #include <string> +#include <string_view> #include <tuple> #include <type_traits> #include <vector> #include <openvic-dataloader/csv/LineObject.hpp> +#include <openvic-dataloader/csv/Parser.hpp> #include <lexy/callback.hpp> #include <lexy/dsl.hpp> +#include <fmt/format.h> + #include "detail/LexyLitRange.hpp" struct ParseOptions { @@ -21,32 +26,27 @@ struct ParseOptions { char SepChar; /// @brief Determines whether StringValue is supported bool SupportStrings; + /// @brief Paradox-style localization escape characters + /// @note Is ignored if SupportStrings is true + char EscapeChar; }; -#include "detail/LexyLitRange.hpp" +constexpr auto escaped_symbols = lexy::symbol_table<char> // + .map<'"'>('"') + .map<'\''>('\'') + .map<'\\'>('\\') + .map<'/'>('/') + .map<'b'>('\b') + .map<'f'>('\f') + .map<'n'>('\n') + .map<'r'>('\r') + .map<'t'>('\t'); -struct ParseOptions { - /// @brief Seperator character - char SepChar; - /// @brief Determines whether StringValue is supported - bool SupportStrings; -}; +constexpr auto escaped_quote = lexy::symbol_table<char> // + .map<'"'>('"'); +template<ParseOptions Options> struct StringValue { - static constexpr auto escaped_symbols = lexy::symbol_table<char> // - .map<'"'>('"') - .map<'\''>('\'') - .map<'\\'>('\\') - .map<'/'>('/') - .map<'b'>('\b') - .map<'f'>('\f') - .map<'n'>('\n') - .map<'r'>('\r') - .map<'t'>('\t'); - /// This doesn't actually do anything, so this might to be manually parsed if vic2's CSV parser creates a " from "" - static constexpr auto escaped_quote = lexy::symbol_table<char> // - .map<'"'>('"'); - static constexpr auto rule = [] { // Arbitrary code points auto c = character - control; @@ -57,25 +57,66 @@ struct StringValue { auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // .symbol<escaped_quote>(); - return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); + auto quotes = lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>)); + + return quotes(c, back_escape, quote_escape); }(); static constexpr auto value = lexy::as_string<std::string>; }; template<ParseOptions Options> +struct EscapeValue { + static constexpr auto rule = [] { + auto id = lexy::dsl::identifier(id_head, id_tail); + + return lexy::dsl::lit_b<Options.EscapeChar> >> + (lexy::dsl::lit_b<Options.EscapeChar> | + (id >> lexy::dsl::lit_b<Options.EscapeChar>)); + }(); + static constexpr auto value = + lexy::callback_with_state<std::string>( + [](const auto& state, auto&& lexeme) { + auto check = std::string_view { lexeme.data(), lexeme.size() }; + if (auto value = state.find_value(check); value != state.end()) + return std::string(value->second.data(), value->second.size()); + return fmt::format("${}$", check); + }, + [](auto&& lexeme) { + return fmt::format("${}$", std::string_view { lexeme.data(), lexeme.size() }); + }, + [](lexy::nullopt = {}) { + return std::string(1, Options.EscapeChar); + }, + [](const auto& state, lexy::nullopt = {}) { + return std::string(1, Options.EscapeChar); + }); +}; + +template<ParseOptions Options> struct PlainValue { static constexpr auto rule = [] { + auto min_skip = lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline; if constexpr (Options.SupportStrings) { - return lexy::dsl::identifier(character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline)); + return lexy::dsl::identifier(character - min_skip); } else { - auto escape_check_char = character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline); + auto escape_check_char = [=] { + if constexpr (Options.EscapeChar != 0) { + return character - (min_skip / lexy::dsl::lit_b<Options.EscapeChar>); + } else { + return character - min_skip; + } + }(); auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; auto id_segment = lexy::dsl::identifier(id_check_char); auto escape_segement = lexy::dsl::token(escape_check_char); - auto escape_sym = lexy::dsl::symbol<StringValue::escaped_symbols>(escape_segement); + auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement); auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; - return lexy::dsl::list(id_segment | escape_rule); + if constexpr (Options.EscapeChar != 0) { + return lexy::dsl::list(lexy::dsl::p<EscapeValue<Options>> | id_segment | escape_rule); + } else { + return lexy::dsl::list(id_segment | escape_rule); + } } }(); static constexpr auto value = lexy::as_string<std::string>; @@ -85,7 +126,7 @@ template<ParseOptions Options> struct Value { static constexpr auto rule = [] { if constexpr (Options.SupportStrings) { - return lexy::dsl::p<StringValue> | lexy::dsl::p<PlainValue<Options>>; + return lexy::dsl::p<StringValue<Options>> | lexy::dsl::p<PlainValue<Options>>; } else { return lexy::dsl::p<PlainValue<Options>>; } @@ -165,16 +206,16 @@ struct File { static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>; }; -using CommaFile = File<ParseOptions { ',' }>; -using ColonFile = File<ParseOptions { ':' }>; -using SemiColonFile = File<ParseOptions { ';' }>; -using TabFile = File<ParseOptions { '\t' }>; -using BarFile = File<ParseOptions { '|' }>; +using CommaFile = File<ParseOptions { ',', false, '$' }>; +using ColonFile = File<ParseOptions { ':', false, '$' }>; +using SemiColonFile = File<ParseOptions { ';', false, '$' }>; +using TabFile = File<ParseOptions { '\t', false, '$' }>; +using BarFile = File<ParseOptions { '|', false, '$' }>; namespace strings { - using CommaFile = File<ParseOptions { ',', true }>; - using ColonFile = File<ParseOptions { ':', true }>; - using SemiColonFile = File<ParseOptions { ';', true }>; - using TabFile = File<ParseOptions { '\t', true }>; - using BarFile = File<ParseOptions { '|', true }>; + using CommaFile = File<ParseOptions { ',', true, '$' }>; + using ColonFile = File<ParseOptions { ':', true, '$' }>; + using SemiColonFile = File<ParseOptions { ';', true, '$' }>; + using TabFile = File<ParseOptions { '\t', true, '$' }>; + using BarFile = File<ParseOptions { '|', true, '$' }>; } diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp index 14ef553..40f0037 100644 --- a/src/openvic-dataloader/csv/Parser.cpp +++ b/src/openvic-dataloader/csv/Parser.cpp @@ -38,9 +38,9 @@ struct LexyEncodingFrom<EncodingType::Utf8> { template<EncodingType Encoding> class Parser<Encoding>::BufferHandler final : public detail::BasicBufferHandler<typename LexyEncodingFrom<Encoding>::encoding> { public: - template<typename Node, typename ErrorCallback> - std::optional<std::vector<ParseError>> parse(const ErrorCallback& callback) { - auto result = lexy::parse<Node>(this->_buffer, callback); + template<typename Node, typename ParseState, typename ErrorCallback> + std::optional<std::vector<ParseError>> parse(const ParseState& state, const ErrorCallback& callback) { + auto result = lexy::parse<Node>(this->_buffer, state, callback); if (!result) { return result.errors(); } @@ -174,14 +174,14 @@ bool Parser<Encoding>::parse_csv(bool handle_strings) { auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream }); if constexpr (Encoding == EncodingType::Windows1252) { if (handle_strings) - errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(report_error); + errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(_parser_state, report_error); else - errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(report_error); + errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(_parser_state, report_error); } else { if (handle_strings) - errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(report_error); + errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(_parser_state, report_error); else - errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(report_error); + errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(_parser_state, report_error); } if (errors) { _errors.reserve(errors->size()); |