aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader
diff options
context:
space:
mode:
Diffstat (limited to 'src/openvic-dataloader')
-rw-r--r-- src/openvic-dataloader/csv/CsvGrammar.hpp | 9
-rw-r--r-- src/openvic-dataloader/csv/Grammar.inc | 115
-rw-r--r-- src/openvic-dataloader/csv/Parser.cpp | 14
3 files changed, 94 insertions, 44 deletions
diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp
index 8f8da82..bfae8d0 100644
--- a/src/openvic-dataloader/csv/CsvGrammar.hpp
+++ b/src/openvic-dataloader/csv/CsvGrammar.hpp
@@ -1,16 +1,21 @@
#pragma once
#include <initializer_list>
+#include <memory>
#include <string>
+#include <string_view>
#include <tuple>
#include <type_traits>
#include <vector>
#include <openvic-dataloader/csv/LineObject.hpp>
+#include <openvic-dataloader/csv/Parser.hpp>
#include <lexy/callback.hpp>
#include <lexy/dsl.hpp>
+#include <fmt/format.h>
+
#include "detail/LexyLitRange.hpp"
// Grammar Definitions //
@@ -20,6 +25,8 @@ namespace ovdl::csv::grammar::windows1252 {
lexy::dsl::ascii::control /
lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>;
+ // Character classes for the first (id_head) and following (id_tail) characters of an
+ // identifier; EscapeValue in Grammar.inc passes both to lexy::dsl::identifier.
+ // The Windows-1252 grammar uses the ASCII letter / digit / underscore classes.
+ constexpr auto id_head = lexy::dsl::ascii::alpha_underscore;
+ constexpr auto id_tail = lexy::dsl::ascii::alpha_digit_underscore;
#include "Grammar.inc"
}
@@ -27,6 +34,8 @@ namespace ovdl::csv::grammar::windows1252 {
namespace ovdl::csv::grammar::utf8 {
constexpr auto character = lexy::dsl::unicode::character;
constexpr auto control = lexy::dsl::unicode::control;
+ // Character classes for the first (id_head) and following (id_tail) characters of an
+ // identifier; EscapeValue in Grammar.inc passes both to lexy::dsl::identifier.
+ // The UTF-8 grammar uses the Unicode XID classes (start class also admits underscore).
+ constexpr auto id_head = lexy::dsl::unicode::xid_start_underscore;
+ constexpr auto id_tail = lexy::dsl::unicode::xid_continue;
#include "Grammar.inc"
} \ No newline at end of file
diff --git a/src/openvic-dataloader/csv/Grammar.inc b/src/openvic-dataloader/csv/Grammar.inc
index 6dd4158..7de9e81 100644
--- a/src/openvic-dataloader/csv/Grammar.inc
+++ b/src/openvic-dataloader/csv/Grammar.inc
@@ -4,16 +4,21 @@
// Includes to keep file errors small
#include <initializer_list>
+#include <memory>
#include <string>
+#include <string_view>
#include <tuple>
#include <type_traits>
#include <vector>
#include <openvic-dataloader/csv/LineObject.hpp>
+#include <openvic-dataloader/csv/Parser.hpp>
#include <lexy/callback.hpp>
#include <lexy/dsl.hpp>
+#include <fmt/format.h>
+
#include "detail/LexyLitRange.hpp"
struct ParseOptions {
@@ -21,32 +26,27 @@ struct ParseOptions {
char SepChar;
/// @brief Determines whether StringValue is supported
bool SupportStrings;
+ /// @brief Paradox-style localization escape character (e.g. '$'); 0 disables escape handling
+ /// @note Is ignored if SupportStrings is true
+ char EscapeChar;
};
-#include "detail/LexyLitRange.hpp"
+/// Escape table: maps the character that follows a backslash to the character it denotes.
+/// PlainValue hands it to lexy::dsl::symbol as the body of its `\`-prefixed escape rule.
+constexpr auto escaped_symbols = lexy::symbol_table<char> //
+ .map<'"'>('"')
+ .map<'\''>('\'')
+ .map<'\\'>('\\')
+ .map<'/'>('/')
+ .map<'b'>('\b')
+ .map<'f'>('\f')
+ .map<'n'>('\n')
+ .map<'r'>('\r')
+ .map<'t'>('\t');
-struct ParseOptions {
- /// @brief Seperator character
- char SepChar;
- /// @brief Determines whether StringValue is supported
- bool SupportStrings;
-};
+/// Single-entry table mapping `"` to `"`; referenced by StringValue's quote_escape,
+/// an escape that is introduced by a `"` character.
+/// NOTE(review): an earlier comment on this table stated that it has no practical effect
+/// when parsing `""` - confirm before relying on it to collapse doubled quotes.
+constexpr auto escaped_quote = lexy::symbol_table<char> //
+ .map<'"'>('"');
+template<ParseOptions Options>
struct StringValue {
- static constexpr auto escaped_symbols = lexy::symbol_table<char> //
- .map<'"'>('"')
- .map<'\''>('\'')
- .map<'\\'>('\\')
- .map<'/'>('/')
- .map<'b'>('\b')
- .map<'f'>('\f')
- .map<'n'>('\n')
- .map<'r'>('\r')
- .map<'t'>('\t');
- /// This doesn't actually do anything, so this might to be manually parsed if vic2's CSV parser creates a " from ""
- static constexpr auto escaped_quote = lexy::symbol_table<char> //
- .map<'"'>('"');
-
static constexpr auto rule = [] {
// Arbitrary code points
auto c = character - control;
@@ -57,25 +57,66 @@ struct StringValue {
auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) //
.symbol<escaped_quote>();
- return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape);
+ auto quotes = lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>));
+
+ return quotes(c, back_escape, quote_escape);
}();
static constexpr auto value = lexy::as_string<std::string>;
};
+/// Parses a Paradox-style localization escape inside a plain value.
+///
+/// Grammar: Options.EscapeChar followed by either a second EscapeChar, or an
+/// identifier (id_head, then id_tail characters) terminated by EscapeChar.
+///
+/// Produced value (always a std::string):
+///  - identifier with a parse state: the identifier is looked up through
+///    state.find_value(); a hit yields the mapped text, a miss yields the
+///    identifier wrapped in EscapeChar again.
+///  - identifier without a parse state: the identifier wrapped in EscapeChar.
+///  - no lexeme (the lexy::nullopt overloads; presumably the doubled-EscapeChar
+///    form, which produces no value): a single EscapeChar.
template<ParseOptions Options>
+struct EscapeValue {
+	static constexpr auto rule = [] {
+		auto id = lexy::dsl::identifier(id_head, id_tail);
+
+		return lexy::dsl::lit_b<Options.EscapeChar> >>
+			   (lexy::dsl::lit_b<Options.EscapeChar> |
+				   (id >> lexy::dsl::lit_b<Options.EscapeChar>));
+	}();
+	static constexpr auto value =
+		lexy::callback_with_state<std::string>(
+			[](const auto& state, auto&& lexeme) {
+				auto check = std::string_view { lexeme.data(), lexeme.size() };
+				if (auto value = state.find_value(check); value != state.end())
+					return std::string(value->second.data(), value->second.size());
+				// Unknown key: re-wrap with the configured escape character, not a hard-coded '$'.
+				return fmt::format("{0}{1}{0}", Options.EscapeChar, check);
+			},
+			[](auto&& lexeme) {
+				// No state to consult: re-wrap with the configured escape character.
+				return fmt::format("{0}{1}{0}", Options.EscapeChar, std::string_view { lexeme.data(), lexeme.size() });
+			},
+			[](lexy::nullopt = {}) {
+				return std::string(1, Options.EscapeChar);
+			},
+			[](const auto& state, lexy::nullopt = {}) {
+				return std::string(1, Options.EscapeChar);
+			});
+};
+
+template<ParseOptions Options>
struct PlainValue {
static constexpr auto rule = [] {
+ auto min_skip = lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline;
if constexpr (Options.SupportStrings) {
- return lexy::dsl::identifier(character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline));
+ return lexy::dsl::identifier(character - min_skip);
} else {
- auto escape_check_char = character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline);
+ auto escape_check_char = [=] {
+ if constexpr (Options.EscapeChar != 0) {
+ return character - (min_skip / lexy::dsl::lit_b<Options.EscapeChar>);
+ } else {
+ return character - min_skip;
+ }
+ }();
auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>;
auto id_segment = lexy::dsl::identifier(id_check_char);
auto escape_segement = lexy::dsl::token(escape_check_char);
- auto escape_sym = lexy::dsl::symbol<StringValue::escaped_symbols>(escape_segement);
+ auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement);
auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym;
- return lexy::dsl::list(id_segment | escape_rule);
+ if constexpr (Options.EscapeChar != 0) {
+ return lexy::dsl::list(lexy::dsl::p<EscapeValue<Options>> | id_segment | escape_rule);
+ } else {
+ return lexy::dsl::list(id_segment | escape_rule);
+ }
}
}();
static constexpr auto value = lexy::as_string<std::string>;
@@ -85,7 +126,7 @@ template<ParseOptions Options>
struct Value {
static constexpr auto rule = [] {
if constexpr (Options.SupportStrings) {
- return lexy::dsl::p<StringValue> | lexy::dsl::p<PlainValue<Options>>;
+ return lexy::dsl::p<StringValue<Options>> | lexy::dsl::p<PlainValue<Options>>;
} else {
return lexy::dsl::p<PlainValue<Options>>;
}
@@ -165,16 +206,16 @@ struct File {
static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>;
};
-using CommaFile = File<ParseOptions { ',' }>;
-using ColonFile = File<ParseOptions { ':' }>;
-using SemiColonFile = File<ParseOptions { ';' }>;
-using TabFile = File<ParseOptions { '\t' }>;
-using BarFile = File<ParseOptions { '|' }>;
+// File grammars without string support, one per separator character; '$' is the escape character.
+using CommaFile = File<ParseOptions { ',', false, '$' }>;
+using ColonFile = File<ParseOptions { ':', false, '$' }>;
+using SemiColonFile = File<ParseOptions { ';', false, '$' }>;
+using TabFile = File<ParseOptions { '\t', false, '$' }>;
+using BarFile = File<ParseOptions { '|', false, '$' }>;
namespace strings {
- using CommaFile = File<ParseOptions { ',', true }>;
- using ColonFile = File<ParseOptions { ':', true }>;
- using SemiColonFile = File<ParseOptions { ';', true }>;
- using TabFile = File<ParseOptions { '\t', true }>;
- using BarFile = File<ParseOptions { '|', true }>;
+ // Same separators with SupportStrings enabled.
+ using CommaFile = File<ParseOptions { ',', true, '$' }>;
+ using ColonFile = File<ParseOptions { ':', true, '$' }>;
+ using SemiColonFile = File<ParseOptions { ';', true, '$' }>;
+ using TabFile = File<ParseOptions { '\t', true, '$' }>;
+ using BarFile = File<ParseOptions { '|', true, '$' }>;
}
diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp
index 14ef553..40f0037 100644
--- a/src/openvic-dataloader/csv/Parser.cpp
+++ b/src/openvic-dataloader/csv/Parser.cpp
@@ -38,9 +38,9 @@ struct LexyEncodingFrom<EncodingType::Utf8> {
template<EncodingType Encoding>
class Parser<Encoding>::BufferHandler final : public detail::BasicBufferHandler<typename LexyEncodingFrom<Encoding>::encoding> {
public:
- template<typename Node, typename ErrorCallback>
- std::optional<std::vector<ParseError>> parse(const ErrorCallback& callback) {
- auto result = lexy::parse<Node>(this->_buffer, callback);
+ template<typename Node, typename ParseState, typename ErrorCallback>
+ std::optional<std::vector<ParseError>> parse(const ParseState& state, const ErrorCallback& callback) {
+ auto result = lexy::parse<Node>(this->_buffer, state, callback);
if (!result) {
return result.errors();
}
@@ -174,14 +174,14 @@ bool Parser<Encoding>::parse_csv(bool handle_strings) {
auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream });
if constexpr (Encoding == EncodingType::Windows1252) {
if (handle_strings)
- errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(_parser_state, report_error);
else
- errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::windows1252::SemiColonFile>(_parser_state, report_error);
} else {
if (handle_strings)
- errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(_parser_state, report_error);
else
- errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(report_error);
+ errors = _buffer_handler->template parse<csv::grammar::utf8::SemiColonFile>(_parser_state, report_error);
}
if (errors) {
_errors.reserve(errors->size());