Add parse state escape behavior

author: Spartan322 <Megacake1234@gmail.com> 2023-09-14 08:39:44 +0200
committer: Spartan322 <Megacake1234@gmail.com> 2023-09-29 02:15:46 +0200
commit: 70ee2cea9bf1c752bcb3f1e0bd9e7b00f437967e (patch)
tree: 384e326485b8b19816b567515a34fe6db66a7f8d
parent: 5afe363e7f48ee52fd70edea316789fcb18178dc (diff)
8 files changed, 173 insertions, 47 deletions
diff --git a/.clang-format b/.clang-format
index 86fc638..bfd1ace 100644
--- a/.clang-format
+++ b/.clang-format
@@ -55,7 +55,9 @@ IncludeCategories:
     Priority: 3
   - Regex: ^<lexy/
     Priority: 4
-  - Regex: ^"openvic-dataloader/
+  - Regex: ^<fmt/
     Priority: 5
-  - Regex: .*
+  - Regex: ^"openvic-dataloader/
     Priority: 6
+  - Regex: .*
+    Priority: 7
diff --git a/.gitmodules b/.gitmodules
index 0a1353b..796fcc8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,3 +5,7 @@
 [submodule "scripts"]
 	path = scripts
 	url = https://github.com/OpenVicProject/scripts
+[submodule "deps/fmt"]
+	path = deps/fmt
+	url = https://github.com/fmtlib/fmt
+	ignore = dirty
diff --git a/deps/SCsub b/deps/SCsub
index eb27dab..4c427fe 100644
--- a/deps/SCsub
+++ b/deps/SCsub
@@ -41,5 +41,31 @@ def build_lexy(env):
     env.Append(LIBPATH=[lexy_env.Dir("lexy/src")])
     env.Prepend(LIBS=[library_name])
 
+def build_fmt(env):
+    fmt_env = env.Clone()
 
-build_lexy(env)
-\ No newline at end of file
+    if fmt_env.get("is_msvc", False):
+        fmt_env.Append(CXXFLAGS=["/std:c++20"])
+    else:
+        fmt_env.Append(CXXFLAGS=["-std=c++20"])
+    
+    paths = ["fmt/include", "fmt/src"]
+    fmt_env.Append(CPPPATH=[[fmt_env.Dir(p) for p in paths]])
+    sources = env.GlobRecursive("*.cc", paths, exclude=["fmt.cc"])
+    env.fmt_sources = sources
+    library_name = "libfmt" + env["LIBSUFFIX"]
+    library = fmt_env.StaticLibrary(target="fmt/src/" + library_name, source=sources)
+    Default(library)
+
+    env.Append(CPPPATH=[fmt_env.Dir("fmt/include")])
+    if env.get("is_msvc", False):
+        env.Append(CXXFLAGS=["/external:I", fmt_env.Dir("fmt/include"), "/external:W0"])
+    else:
+        env.Append(CXXFLAGS=["-isystem", fmt_env.Dir("fmt/include")])
+    env.Append(CXXFLAGS=[""])
+    env.Append(LIBPATH=[fmt_env.Dir("fmt/src")])
+    env.Prepend(LIBS=[library_name])
+
+
+build_lexy(env)
+build_fmt(env)
+\ No newline at end of file
diff --git a/deps/fmt b/deps/fmt
new file mode 160000
+Subproject f5e54359df4c26b6230fc61d38aa294581393084
diff --git a/include/openvic-dataloader/csv/Parser.hpp b/include/openvic-dataloader/csv/Parser.hpp
index fffd92a..544f0b0 100644
--- a/include/openvic-dataloader/csv/Parser.hpp
+++ b/include/openvic-dataloader/csv/Parser.hpp
@@ -1,5 +1,11 @@
 #pragma once
 
+#include <functional>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
 #include <openvic-dataloader/csv/LineObject.hpp>
 #include <openvic-dataloader/detail/BasicParser.hpp>
 
@@ -9,9 +15,42 @@ namespace ovdl::csv {
 		Utf8
 	};
 
+	struct string_hash { // transparent hasher: together with std::equal_to<> allows lookup by string_view/const char* keys
+		using is_transparent = void;
+		[[nodiscard]] size_t operator()(const char* txt) const {
+			return std::hash<std::string_view> {}(txt);
+		}
+		[[nodiscard]] size_t operator()(std::string_view txt) const {
+			return std::hash<std::string_view> {}(txt);
+		}
+		[[nodiscard]] size_t operator()(const std::string& txt) const { // const&: also binds const strings and temporaries
+			return std::hash<std::string> {}(txt);
+		}
+	};
+
 	template<EncodingType Encoding = EncodingType::Windows1252>
 	class Parser final : public detail::BasicParser {
 	public:
+		struct State {
+			std::unordered_map<std::string, std::string, string_hash, std::equal_to<>> escape_values;
+
+			inline bool has_value(std::string_view key) const {
+				return escape_values.find(key) != escape_values.end();
+			}
+
+			inline decltype(escape_values)::const_iterator find_value(std::string_view key) const {
+				return escape_values.find(key);
+			}
+
+			inline decltype(escape_values)::const_iterator begin() const {
+				return escape_values.begin();
+			}
+
+			inline decltype(escape_values)::const_iterator end() const {
+				return escape_values.end();
+			}
+		};
+
 		Parser();
 
 		static Parser from_buffer(const char* data, std::size_t size);
@@ -30,6 +69,10 @@ namespace ovdl::csv {
 
 		bool parse_csv(bool handle_strings = false);
 
+		void add_escape_value(std::string_view key, std::string_view value);
+		void remove_escape_value(std::string_view key, std::string_view value);
+		void clear_escape_values();
+
 		const std::vector<csv::LineObject>& get_lines() const;
 
 		Parser(Parser&&);
@@ -41,6 +84,7 @@ namespace ovdl::csv {
 		class BufferHandler;
 		std::unique_ptr<BufferHandler> _buffer_handler;
 		std::vector<csv::LineObject> _lines;
+		State _parser_state;
 
 		template<typename... Args>
 		constexpr void _run_load_func(detail::LoadCallback<BufferHandler, Args...> auto func, Args... args);
diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp
index 8f8da82..bfae8d0 100644
--- a/src/openvic-dataloader/csv/CsvGrammar.hpp
+++ b/src/openvic-dataloader/csv/CsvGrammar.hpp
@@ -1,16 +1,21 @@
 #pragma once
 
 #include <initializer_list>
+#include <memory>
 #include <string>
+#include <string_view>
 #include <tuple>
 #include <type_traits>
 #include <vector>
 
 #include <openvic-dataloader/csv/LineObject.hpp>
+#include <openvic-dataloader/csv/Parser.hpp>
 
 #include <lexy/callback.hpp>
 #include <lexy/dsl.hpp>
 
+#include <fmt/format.h>
+
 #include "detail/LexyLitRange.hpp"
 
 // Grammar Definitions //
@@ -20,6 +25,8 @@ namespace ovdl::csv::grammar::windows1252 {
 		lexy::dsl::ascii::control /
 		lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
 		lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>;
+	constexpr auto id_head = lexy::dsl::ascii::alpha_underscore;
+	constexpr auto id_tail = lexy::dsl::ascii::alpha_digit_underscore;
 
 #include "Grammar.inc"
 }
@@ -27,6 +34,8 @@ namespace ovdl::csv::grammar::windows1252 {
 namespace ovdl::csv::grammar::utf8 {
 	constexpr auto character = lexy::dsl::unicode::character;
 	constexpr auto control = lexy::dsl::unicode::control;
+	constexpr auto id_head = lexy::dsl::unicode::xid_start_underscore;
+	constexpr auto id_tail = lexy::dsl::unicode::xid_continue;
 
 #include "Grammar.inc"
 }
 \ No newline at end of file
diff --git a/src/openvic-dataloader/csv/Grammar.inc b/src/openvic-dataloader/csv/Grammar.inc
index 6dd4158..7de9e81 100644
--- a/src/openvic-dataloader/csv/Grammar.inc
+++ b/src/openvic-dataloader/csv/Grammar.inc
@@ -4,16 +4,21 @@
 
 // Includes to keep file errors small
 #include <initializer_list>
+#include <memory>
 #include <string>
+#include <string_view>
 #include <tuple>
 #include <type_traits>
 #include <vector>
 
 #include <openvic-dataloader/csv/LineObject.hpp>
+#include <openvic-dataloader/csv/Parser.hpp>
 
 #include <lexy/callback.hpp>
 #include <lexy/dsl.hpp>
 
+#include <fmt/format.h>
+
 #include "detail/LexyLitRange.hpp"
 
 struct ParseOptions {
@@ -21,32 +26,27 @@ struct ParseOptions {
 	char SepChar;
 	/// @brief Determines whether StringValue is supported
 	bool SupportStrings;
+	/// @brief Paradox-style localization escape character; 0 disables escape handling
+	/// @note Is ignored if SupportStrings is true
+	char EscapeChar;
 };
 
-#include "detail/LexyLitRange.hpp"
+constexpr auto escaped_symbols = lexy::symbol_table<char> //
+									 .map<'"'>('"')
+									 .map<'\''>('\'')
+									 .map<'\\'>('\\')
+									 .map<'/'>('/')
+									 .map<'b'>('\b')
+									 .map<'f'>('\f')
+									 .map<'n'>('\n')
+									 .map<'r'>('\r')
+									 .map<'t'>('\t');
 
-struct ParseOptions {
-	/// @brief Seperator character
-	char SepChar;
-	/// @brief Determines whether StringValue is supported
-	bool SupportStrings;
-};
+constexpr auto escaped_quote = lexy::symbol_table<char> //
+								   .map<'"'>('"');
 
+template<ParseOptions Options>
 struct StringValue {
-	static constexpr auto escaped_symbols = lexy::symbol_table<char> //
-												.map<'"'>('"')
-												.map<'\''>('\'')
-												.map<'\\'>('\\')
-												.map<'/'>('/')
-												.map<'b'>('\b')
-												.map<'f'>('\f')
-												.map<'n'>('\n')
-												.map<'r'>('\r')
-												.map<'t'>('\t');
-	/// This doesn't actually do anything, so this might to be manually parsed if vic2's CSV parser creates a " from ""
-	static constexpr auto escaped_quote = lexy::symbol_table<char> //
-											  .map<'"'>('"');
-
 	static constexpr auto rule = [] {
 		// Arbitrary code points
 		auto c = character - control;
@@ -57,25 +57,66 @@ struct StringValue {
 		auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) //
 								.symbol<escaped_quote>();
 
-		return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape);
+		auto quotes = lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>));
+
+		return quotes(c, back_escape, quote_escape);
 	}();
 
 	static constexpr auto value = lexy::as_string<std::string>;
 };
 
 template<ParseOptions Options>
+struct EscapeValue { // matches <esc><esc> (a literal escape char) or <esc>identifier<esc> (a key looked up in the parse state)
+	static constexpr auto rule = [] {
+		auto id = lexy::dsl::identifier(id_head, id_tail);
+
+		return lexy::dsl::lit_b<Options.EscapeChar> >>
+			   (lexy::dsl::lit_b<Options.EscapeChar> |
+				   (id >> lexy::dsl::lit_b<Options.EscapeChar>));
+	}();
+	static constexpr auto value =
+		lexy::callback_with_state<std::string>(
+			[](const auto& state, auto&& lexeme) {
+				auto check = std::string_view { lexeme.data(), lexeme.size() };
+				if (auto value = state.find_value(check); value != state.end())
+					return std::string(value->second.data(), value->second.size());
+				return fmt::format("{0}{1}{0}", Options.EscapeChar, check); // unknown key: re-emit with the configured escape char
+			},
+			[](auto&& lexeme) {
+				return fmt::format("{0}{1}{0}", Options.EscapeChar, std::string_view { lexeme.data(), lexeme.size() }); // no state: re-emit as written
+			},
+			[](lexy::nullopt = {}) {
+				return std::string(1, Options.EscapeChar);
+			},
+			[](const auto& state, lexy::nullopt = {}) {
+				return std::string(1, Options.EscapeChar);
+			});
+};
+
+template<ParseOptions Options>
 struct PlainValue {
 	static constexpr auto rule = [] {
+		auto min_skip = lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline;
 		if constexpr (Options.SupportStrings) {
-			return lexy::dsl::identifier(character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline));
+			return lexy::dsl::identifier(character - min_skip);
 		} else {
-			auto escape_check_char = character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline);
+			auto escape_check_char = [=] {
+				if constexpr (Options.EscapeChar != 0) {
+					return character - (min_skip / lexy::dsl::lit_b<Options.EscapeChar>);
+				} else {
+					return character - min_skip;
+				}
+			}();
 			auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>;
 			auto id_segment = lexy::dsl::identifier(id_check_char);
 			auto escape_segement = lexy::dsl::token(escape_check_char);
-			auto escape_sym = lexy::dsl::symbol<StringValue::escaped_symbols>(escape_segement);
+			auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement);
 			auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym;
-			return lexy::dsl::list(id_segment | escape_rule);
+			if constexpr (Options.EscapeChar != 0) {
+				return lexy::dsl::list(lexy::dsl::p<EscapeValue<Options>> | id_segment | escape_rule);
+			} else {
+				return lexy::dsl::list(id_segment | escape_rule);
+			}
 		}
 	}();
 	static constexpr auto value = lexy::as_string<std::string>;
@@ -85,7 +126,7 @@ template<ParseOptions Options>
 struct Value {
 	static constexpr auto rule = [] {
 		if constexpr (Options.SupportStrings) {
-			return lexy::dsl::p<StringValue> | lexy::dsl::p<PlainValue<Options>>;
+			return lexy::dsl::p<StringValue<Options>> | lexy::dsl::p<PlainValue<Options>>;
 		} else {
 			return lexy::dsl::p<PlainValue<Options>>;
 		}
@@ -165,16 +206,16 @@ struct File {
 	static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>;
 };
 
-using CommaFile = File<ParseOptions { ',' }>;
-using ColonFile = File<ParseOptions { ':' }>;
-using SemiColonFile = File<ParseOptions { ';' }>;
-using TabFile = File<ParseOptions { '\t' }>;
-using BarFile = File<ParseOptions { '|' }>;
+using CommaFile = File<ParseOptions { ',', false, '$' }>;
+using ColonFile = File<ParseOptions { ':', false, '$' }>;
+using SemiColonFile = File<ParseOptions { ';', false, '$' }>;
+using TabFile = File<ParseOptions { '\t', false, '$' }>;
+using BarFile = File<ParseOptions { '|', false, '$' }>;
 
 namespace strings {
-	using CommaFile = File<ParseOptions { ',', true }>;
-	using ColonFile = File<ParseOptions { ':', true }>;
-	using SemiColonFile = File<ParseOptions { ';', true }>;
-	using TabFile = File<ParseOptions { '\t', true }>;
-	using BarFile = File<ParseOptions { '|', true }>;
+	using CommaFile = File<ParseOptions { ',', true, '$' }>;
+	using ColonFile = File<ParseOptions { ':', true, '$' }>;
+	using SemiColonFile = File<ParseOptions { ';', true, '$' }>;
+	using TabFile = File<ParseOptions { '\t', true, '$' }>;
+	using BarFile = File<ParseOptions { '|', true, '$' }>;
 }
diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp
index 14ef553..40f0037 100644
--- a/src/openvic-dataloader/csv/Parser.cpp
+++ b/src/openvic-dataloader/csv/Parser.cpp
@@ -38,9 +38,9 @@ struct LexyEncodingFrom<EncodingType::Utf8> {
 template<EncodingType Encoding>
 class Parser<Encoding>::BufferHandler final : public detail::BasicBufferHandler<typename LexyEncodingFrom<Encoding>::encoding> {
 public:
-	template<typename Node, typename ErrorCallback>
-	std::optional<std::vector<ParseError>> parse(const ErrorCallback& callback) {
-		auto result = lexy::parse<Node>(this->_buffer, callback);
+	template<typename Node, typename ParseState, typename ErrorCallback>
+	std::optional<std::vector<ParseError>> parse(const ParseState& state, const ErrorCallback& callback) {
+		auto result = lexy::parse<Node>(this->_buffer, state, callback);
 		if (!result) {
 			return result.errors();
 		}
@@ -174,14 +174,14 @@ bool Parser<Encoding>::parse_csv(bool handle_strings) {
 	auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream });
 	if constexpr (Encoding == EncodingType::Windows1252) {
 		if (handle_strings)
-			errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(report_error);
+			errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(_parser_state, report_error);
 		else
-			errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(report_error);
+			errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(_parser_state, report_error);
 	} else {
 		if (handle_strings)
-			errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(report_error);
+			errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(_parser_state, report_error);
 		else
-			errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(report_error);
+			errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(_parser_state, report_error);
 	}
 	if (errors) {
 		_errors.reserve(errors->size());
author	Spartan322 <Megacake1234@gmail.com>	2023-09-14 08:39:44 +0200
committer	Spartan322 <Megacake1234@gmail.com>	2023-09-29 02:15:46 +0200
commit	70ee2cea9bf1c752bcb3f1e0bd9e7b00f437967e (patch)
tree	384e326485b8b19816b567515a34fe6db66a7f8d
parent	5afe363e7f48ee52fd70edea316789fcb18178dc (diff)