aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com> 2023-09-14 08:39:44 +0200
committer Spartan322 <Megacake1234@gmail.com> 2023-09-29 02:15:46 +0200
commit 70ee2cea9bf1c752bcb3f1e0bd9e7b00f437967e (patch)
tree 384e326485b8b19816b567515a34fe6db66a7f8d
parent 5afe363e7f48ee52fd70edea316789fcb18178dc (diff)
Add parse state escape behavior
-rw-r--r-- .clang-format 6
-rw-r--r-- .gitmodules 4
-rw-r--r-- deps/SCsub 28
m--------- deps/fmt 0
-rw-r--r-- include/openvic-dataloader/csv/Parser.hpp 44
-rw-r--r-- src/openvic-dataloader/csv/CsvGrammar.hpp 9
-rw-r--r-- src/openvic-dataloader/csv/Grammar.inc 115
-rw-r--r-- src/openvic-dataloader/csv/Parser.cpp 14
8 files changed, 173 insertions, 47 deletions
diff --git a/.clang-format b/.clang-format
index 86fc638..bfd1ace 100644
--- a/.clang-format
+++ b/.clang-format
@@ -55,7 +55,9 @@ IncludeCategories:
Priority: 3
- Regex: ^<lexy/
Priority: 4
- - Regex: ^"openvic-dataloader/
+ - Regex: ^<fmt/
Priority: 5
- - Regex: .*
+ - Regex: ^"openvic-dataloader/
Priority: 6
+ - Regex: .*
+ Priority: 7
diff --git a/.gitmodules b/.gitmodules
index 0a1353b..796fcc8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,3 +5,7 @@
[submodule "scripts"]
path = scripts
url = https://github.com/OpenVicProject/scripts
+[submodule "deps/fmt"]
+ path = deps/fmt
+ url = https://github.com/fmtlib/fmt
+ ignore = dirty
diff --git a/deps/SCsub b/deps/SCsub
index eb27dab..4c427fe 100644
--- a/deps/SCsub
+++ b/deps/SCsub
@@ -41,5 +41,31 @@ def build_lexy(env):
env.Append(LIBPATH=[lexy_env.Dir("lexy/src")])
env.Prepend(LIBS=[library_name])
+def build_fmt(env):  # Build the fmt sources under fmt/ as a static library and expose headers + library to `env`.
+ fmt_env = env.Clone()  # separate environment so the fmt-only build flags below stay out of `env`
-build_lexy(env) \ No newline at end of file
+ if fmt_env.get("is_msvc", False):
+ fmt_env.Append(CXXFLAGS=["/std:c++20"])
+ else:
+ fmt_env.Append(CXXFLAGS=["-std=c++20"])
+
+ paths = ["fmt/include", "fmt/src"]
+ fmt_env.Append(CPPPATH=[[fmt_env.Dir(p) for p in paths]])
+ sources = env.GlobRecursive("*.cc", paths, exclude=["fmt.cc"])  # NOTE(review): presumably skips fmt's C++ module unit — confirm
+ env.fmt_sources = sources
+ library_name = "libfmt" + env["LIBSUFFIX"]
+ library = fmt_env.StaticLibrary(target="fmt/src/" + library_name, source=sources)
+ Default(library)  # make the fmt library part of the default build targets
+
+ env.Append(CPPPATH=[fmt_env.Dir("fmt/include")])
+ if env.get("is_msvc", False):
+ env.Append(CXXFLAGS=["/external:I", fmt_env.Dir("fmt/include"), "/external:W0"])  # treat fmt headers as external, warning level 0
+ else:
+ env.Append(CXXFLAGS=["-isystem", fmt_env.Dir("fmt/include")])  # treat fmt headers as system headers
+ env.Append(CXXFLAGS=[""])  # NOTE(review): appends an empty flag; looks like a leftover — confirm and remove
+ env.Append(LIBPATH=[fmt_env.Dir("fmt/src")])
+ env.Prepend(LIBS=[library_name])
+
+
+build_lexy(env)
+build_fmt(env) \ No newline at end of file
diff --git a/deps/fmt b/deps/fmt
new file mode 160000
+Subproject f5e54359df4c26b6230fc61d38aa29458139308
diff --git a/include/openvic-dataloader/csv/Parser.hpp b/include/openvic-dataloader/csv/Parser.hpp
index fffd92a..544f0b0 100644
--- a/include/openvic-dataloader/csv/Parser.hpp
+++ b/include/openvic-dataloader/csv/Parser.hpp
@@ -1,5 +1,11 @@
#pragma once
+#include <functional>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
#include <openvic-dataloader/csv/LineObject.hpp>
#include <openvic-dataloader/detail/BasicParser.hpp>
@@ -9,9 +15,42 @@ namespace ovdl::csv {
Utf8
};
+ struct string_hash { // Transparent hasher: hashes const char*, std::string_view and std::string without building a temporary key
+     using is_transparent = void; // opts unordered containers into heterogeneous lookup
+     [[nodiscard]] size_t operator()(const char* txt) const {
+         return std::hash<std::string_view> {}(txt);
+     }
+     [[nodiscard]] size_t operator()(std::string_view txt) const {
+         return std::hash<std::string_view> {}(txt);
+     }
+     [[nodiscard]] size_t operator()(const std::string& txt) const { // const&: also binds const and temporary strings
+         return std::hash<std::string> {}(txt);
+     }
+ };
+
template<EncodingType Encoding = EncodingType::Windows1252>
class Parser final : public detail::BasicParser {
public:
+ struct State {
+     std::unordered_map<std::string, std::string, string_hash, std::equal_to<>> escape_values; // escape key -> stored value
+
+     bool has_value(std::string_view key) const { // true when `key` is present in escape_values
+         return find_value(key) != end();
+     }
+
+     decltype(escape_values)::const_iterator find_value(std::string_view key) const { // yields end() when `key` is absent
+         return escape_values.find(key);
+     }
+
+     decltype(escape_values)::const_iterator begin() const { // read-only iteration over escape_values
+         return escape_values.begin();
+     }
+
+     decltype(escape_values)::const_iterator end() const {
+         return escape_values.end();
+     }
+ };
+
Parser();
static Parser from_buffer(const char* data, std::size_t size);
@@ -30,6 +69,10 @@ namespace ovdl::csv {
bool parse_csv(bool handle_strings = false);
+ void add_escape_value(std::string_view key, std::string_view value);
+ void remove_escape_value(std::string_view key, std::string_view value);
+ void clear_escape_values();
+
const std::vector<csv::LineObject>& get_lines() const;
Parser(Parser&&);
@@ -41,6 +84,7 @@ namespace ovdl::csv {
class BufferHandler;
std::unique_ptr<BufferHandler> _buffer_handler;
std::vector<csv::LineObject> _lines;
+ State _parser_state;
template<typename... Args>
constexpr void _run_load_func(detail::LoadCallback<BufferHandler, Args...> auto func, Args... args);
diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp
index 8f8da82..bfae8d0 100644
--- a/src/openvic-dataloader/csv/CsvGrammar.hpp
+++ b/src/openvic-dataloader/csv/CsvGrammar.hpp
@@ -1,16 +1,21 @@
#pragma once
#include <initializer_list>
+#include <memory>
#include <string>
+#include <string_view>
#include <tuple>
#include <type_traits>
#include <vector>
#include <openvic-dataloader/csv/LineObject.hpp>
+#include <openvic-dataloader/csv/Parser.hpp>
#include <lexy/callback.hpp>
#include <lexy/dsl.hpp>
+#include <fmt/format.h>
+
#include "detail/LexyLitRange.hpp"
// Grammar Definitions //
@@ -20,6 +25,8 @@ namespace ovdl::csv::grammar::windows1252 {
lexy::dsl::ascii::control /
lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>;
+ constexpr auto id_head = lexy::dsl::ascii::alpha_underscore;
+ constexpr auto id_tail = lexy::dsl::ascii::alpha_digit_underscore;
#include "Grammar.inc"
}
@@ -27,6 +34,8 @@ namespace ovdl::csv::grammar::windows1252 {
namespace ovdl::csv::grammar::utf8 {
constexpr auto character = lexy::dsl::unicode::character;
constexpr auto control = lexy::dsl::unicode::control;
+ constexpr auto id_head = lexy::dsl::unicode::xid_start_underscore;
+ constexpr auto id_tail = lexy::dsl::unicode::xid_continue;
#include "Grammar.inc"
} \ No newline at end of file
diff --git a/src/openvic-dataloader/csv/Grammar.inc b/src/openvic-dataloader/csv/Grammar.inc
index 6dd4158..7de9e81 100644
--- a/src/openvic-dataloader/csv/Grammar.inc
+++ b/src/openvic-dataloader/csv/Grammar.inc
@@ -4,16 +4,21 @@
// Includes to keep file errors small
#include <initializer_list>
+#include <memory>
#include <string>
+#include <string_view>
#include <tuple>
#include <type_traits>
#include <vector>
#include <openvic-dataloader/csv/LineObject.hpp>
+#include <openvic-dataloader/csv/Parser.hpp>
#include <lexy/callback.hpp>
#include <lexy/dsl.hpp>
+#include <fmt/format.h>
+
#include "detail/LexyLitRange.hpp"
struct ParseOptions {
@@ -21,32 +26,27 @@ struct ParseOptions {
char SepChar;
/// @brief Determines whether StringValue is supported
bool SupportStrings;
+ /// @brief Delimiter character for Paradox-style localization escapes; 0 disables escape handling
+ /// @note Is ignored if SupportStrings is true
+ char EscapeChar;
};
-#include "detail/LexyLitRange.hpp"
+constexpr auto escaped_symbols = lexy::symbol_table<char> //
+ .map<'"'>('"')
+ .map<'\''>('\'')
+ .map<'\\'>('\\')
+ .map<'/'>('/')
+ .map<'b'>('\b')
+ .map<'f'>('\f')
+ .map<'n'>('\n')
+ .map<'r'>('\r')
+ .map<'t'>('\t');
-struct ParseOptions {
- /// @brief Seperator character
- char SepChar;
- /// @brief Determines whether StringValue is supported
- bool SupportStrings;
-};
+constexpr auto escaped_quote = lexy::symbol_table<char> //
+ .map<'"'>('"');
+template<ParseOptions Options>
struct StringValue {
- static constexpr auto escaped_symbols = lexy::symbol_table<char> //
- .map<'"'>('"')
- .map<'\''>('\'')
- .map<'\\'>('\\')
- .map<'/'>('/')
- .map<'b'>('\b')
- .map<'f'>('\f')
- .map<'n'>('\n')
- .map<'r'>('\r')
- .map<'t'>('\t');
- /// This doesn't actually do anything, so this might to be manually parsed if vic2's CSV parser creates a " from ""
- static constexpr auto escaped_quote = lexy::symbol_table<char> //
- .map<'"'>('"');
-
static constexpr auto rule = [] {
// Arbitrary code points
auto c = character - control;
@@ -57,25 +57,66 @@ struct StringValue {
auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) //
.symbol<escaped_quote>();
- return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape);
+ auto quotes = lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>));
+
+ return quotes(c, back_escape, quote_escape);
}();
static constexpr auto value = lexy::as_string<std::string>;
};
template<ParseOptions Options>
+struct EscapeValue { // Matches `<Esc><Esc>` or `<Esc>identifier<Esc>`, where <Esc> is Options.EscapeChar
+    static constexpr auto rule = [] {
+        auto id = lexy::dsl::identifier(id_head, id_tail);
+
+        return lexy::dsl::lit_b<Options.EscapeChar> >>
+            (lexy::dsl::lit_b<Options.EscapeChar> |
+                (id >> lexy::dsl::lit_b<Options.EscapeChar>));
+    }();
+    static constexpr auto value =
+        lexy::callback_with_state<std::string>(
+            [](const auto& state, auto&& lexeme) { // identifier + parse state: substitute the stored value when the key is known
+                auto check = std::string_view { lexeme.data(), lexeme.size() };
+                if (auto value = state.find_value(check); value != state.end())
+                    return std::string(value->second.data(), value->second.size());
+                return fmt::format("{0}{1}{0}", Options.EscapeChar, check); // unknown key: re-emit it wrapped in the configured delimiter, not a hard-coded '$'
+            },
+            [](auto&& lexeme) { // identifier without parse state: re-emit it wrapped in the configured delimiter
+                return fmt::format("{0}{1}{0}", Options.EscapeChar, std::string_view { lexeme.data(), lexeme.size() });
+            },
+            [](lexy::nullopt = {}) { // no identifier value: yield a single escape character
+                return std::string(1, Options.EscapeChar);
+            },
+            [](const auto& state, lexy::nullopt = {}) { // same as above, when a parse state is passed
+                return std::string(1, Options.EscapeChar);
+            });
+};
+
+template<ParseOptions Options>
struct PlainValue {
static constexpr auto rule = [] {
+ auto min_skip = lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline;
if constexpr (Options.SupportStrings) {
- return lexy::dsl::identifier(character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline));
+ return lexy::dsl::identifier(character - min_skip);
} else {
- auto escape_check_char = character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline);
+ auto escape_check_char = [=] {
+ if constexpr (Options.EscapeChar != 0) {
+ return character - (min_skip / lexy::dsl::lit_b<Options.EscapeChar>);
+ } else {
+ return character - min_skip;
+ }
+ }();
auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>;
auto id_segment = lexy::dsl::identifier(id_check_char);
auto escape_segement = lexy::dsl::token(escape_check_char);
- auto escape_sym = lexy::dsl::symbol<StringValue::escaped_symbols>(escape_segement);
+ auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement);
auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym;
- return lexy::dsl::list(id_segment | escape_rule);
+ if constexpr (Options.EscapeChar != 0) {
+ return lexy::dsl::list(lexy::dsl::p<EscapeValue<Options>> | id_segment | escape_rule);
+ } else {
+ return lexy::dsl::list(id_segment | escape_rule);
+ }
}
}();
static constexpr auto value = lexy::as_string<std::string>;
@@ -85,7 +126,7 @@ template<ParseOptions Options>
struct Value {
static constexpr auto rule = [] {
if constexpr (Options.SupportStrings) {
- return lexy::dsl::p<StringValue> | lexy::dsl::p<PlainValue<Options>>;
+ return lexy::dsl::p<StringValue<Options>> | lexy::dsl::p<PlainValue<Options>>;
} else {
return lexy::dsl::p<PlainValue<Options>>;
}
@@ -165,16 +206,16 @@ struct File {
static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>;
};
-using CommaFile = File<ParseOptions { ',' }>;
-using ColonFile = File<ParseOptions { ':' }>;
-using SemiColonFile = File<ParseOptions { ';' }>;
-using TabFile = File<ParseOptions { '\t' }>;
-using BarFile = File<ParseOptions { '|' }>;
+using CommaFile = File<ParseOptions { ',', false, '$' }>;
+using ColonFile = File<ParseOptions { ':', false, '$' }>;
+using SemiColonFile = File<ParseOptions { ';', false, '$' }>;
+using TabFile = File<ParseOptions { '\t', false, '$' }>;
+using BarFile = File<ParseOptions { '|', false, '$' }>;
namespace strings {
- using CommaFile = File<ParseOptions { ',', true }>;
- using ColonFile = File<ParseOptions { ':', true }>;
- using SemiColonFile = File<ParseOptions { ';', true }>;
- using TabFile = File<ParseOptions { '\t', true }>;
- using BarFile = File<ParseOptions { '|', true }>;
+ using CommaFile = File<ParseOptions { ',', true, '$' }>;
+ using ColonFile = File<ParseOptions { ':', true, '$' }>;
+ using SemiColonFile = File<ParseOptions { ';', true, '$' }>;
+ using TabFile = File<ParseOptions { '\t', true, '$' }>;
+ using BarFile = File<ParseOptions { '|', true, '$' }>;
}
diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp
index 14ef553..40f0037 100644
--- a/src/openvic-dataloader/csv/Parser.cpp
+++ b/src/openvic-dataloader/csv/Parser.cpp
@@ -38,9 +38,9 @@ struct LexyEncodingFrom<EncodingType::Utf8> {
template<EncodingType Encoding>
class Parser<Encoding>::BufferHandler final : public detail::BasicBufferHandler<typename LexyEncodingFrom<Encoding>::encoding> {
public:
- template<typename Node, typename ErrorCallback>
- std::optional<std::vector<ParseError>> parse(const ErrorCallback& callback) {
- auto result = lexy::parse<Node>(this->_buffer, callback);
+ template<typename Node, typename ParseState, typename ErrorCallback>
+ std::optional<std::vector<ParseError>> parse(const ParseState& state, const ErrorCallback& callback) {
+ auto result = lexy::parse<Node>(this->_buffer, state, callback);
if (!result) {
return result.errors();
}
@@ -174,14 +174,14 @@ bool Parser<Encoding>::parse_csv(bool handle_strings) {
auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream });
if constexpr (Encoding == EncodingType::Windows1252) {
if (handle_strings)
- errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(_parser_state, report_error);
else
- errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(_parser_state, report_error);
} else {
if (handle_strings)
- errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(_parser_state, report_error);
else
- errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(_parser_state, report_error);
}
if (errors) {
_errors.reserve(errors->size());