diff options
author | Spartan322 <Megacake1234@gmail.com> | 2024-05-09 16:06:02 +0200 |
---|---|---|
committer | Spartan322 <Megacake1234@gmail.com> | 2024-06-18 01:31:12 +0200 |
commit | b0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch) | |
tree | f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/v2script/SimpleGrammar.hpp | |
parent | 7b521d6023113372cf6b02e562828273c4040f0e (diff) |
Add runtime encoding detection and conversion (branch: fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
Add manually-specified encoding fallback
Add default system encoding fallback
Add error recovery to v2script
Add unknown encoding detection warning
Remove csv::Parser templating
Fix lua files dropping data
Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
Remove exclusive reliance on lexy::default_encoding for v2script
Move internal concepts to src/openvic-detail/InternalConcepts.hpp
Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
Move openvic-dataloader/AbstractSyntaxTree.hpp to src
Move DiagnosticLogger.hpp to src
Move File.hpp to src
Move openvic-dataloader/detail/utility files to openvic-dataloader/detail
Add ovdl::utility::type_concat
Add ovdl::utility::type_prepend
Add ovdl::utility::is_instance_of
Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/v2script/SimpleGrammar.hpp')
-rw-r--r-- | src/openvic-dataloader/v2script/SimpleGrammar.hpp | 307 |
1 files changed, 229 insertions, 78 deletions
diff --git a/src/openvic-dataloader/v2script/SimpleGrammar.hpp b/src/openvic-dataloader/v2script/SimpleGrammar.hpp index 37e295f..d42ce07 100644 --- a/src/openvic-dataloader/v2script/SimpleGrammar.hpp +++ b/src/openvic-dataloader/v2script/SimpleGrammar.hpp @@ -5,10 +5,22 @@ #include <lexy/callback.hpp> #include <lexy/dsl.hpp> +#include <lexy/dsl/any.hpp> #include <lexy/dsl/identifier.hpp> +#include <lexy/dsl/option.hpp> +#include <lexy/dsl/peek.hpp> +#include <lexy/dsl/punctuator.hpp> +#include <lexy/dsl/recover.hpp> +#include <lexy/dsl/scan.hpp> #include <lexy/dsl/symbol.hpp> - -#include "ParseState.hpp" +#include <lexy/dsl/unicode.hpp> +#include <lexy/encoding.hpp> +#include <lexy/input/base.hpp> +#include <lexy/input/buffer.hpp> +#include <lexy/lexeme.hpp> + +#include "detail/Convert.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" // Grammar Definitions // @@ -23,17 +35,28 @@ */ namespace ovdl::v2script::grammar { template<typename T> - constexpr auto construct = dsl::construct<ast::ParseState, T>; + constexpr auto construct = dsl::construct<T>; template<typename T, bool DisableEmpty = false, typename ListType = ast::AssignStatementList> - constexpr auto construct_list = dsl::construct_list<ast::ParseState, T, ListType, DisableEmpty>; + constexpr auto construct_list = dsl::construct_list<T, ListType, DisableEmpty>; + + struct ConvertErrorHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) { + state.logger().warning("invalid character value '{}' found.", static_cast<int>(reader.peek())) // + .primary(BasicNodeLocation { reader.position() }, "here") + .finish(); + } + }; + + template<typename String> + constexpr auto convert_as_string = convert::convert_as_string<String, ConvertErrorHandler>; struct ParseOptions { /// @brief Makes string parsing avoid string escapes bool NoStringEscape; }; - static constexpr ParseOptions NoStringEscapeOption = ParseOptions { true }; - static constexpr 
ParseOptions StringEscapeOption = ParseOptions { false }; + static constexpr auto NoStringEscapeOption = ParseOptions { true }; + static constexpr auto StringEscapeOption = ParseOptions { false }; /* REQUIREMENTS: DAT-630 */ static constexpr auto whitespace_specifier = lexy::dsl::ascii::blank / lexy::dsl::ascii::newline; @@ -50,24 +73,28 @@ namespace ovdl::v2script::grammar { ascii / lexy::dsl::lit_b<0x8A> / lexy::dsl::lit_b<0x8C> / lexy::dsl::lit_b<0x8E> / lexy::dsl::lit_b<0x92> / lexy::dsl::lit_b<0x97> / lexy::dsl::lit_b<0x9A> / lexy::dsl::lit_b<0x9C> / - dsl::make_range<0x9E, 0x9F>() / - dsl::make_range<0xC0, 0xD6>() / - dsl::make_range<0xD8, 0xF6>() / - dsl::make_range<0xF8, 0xFF>(); + dsl::lit_b_range<0x9E, 0x9F> / + dsl::lit_b_range<0xC0, 0xD6> / + dsl::lit_b_range<0xD8, 0xF6> / + dsl::lit_b_range<0xF8, 0xFF>; static constexpr auto windows_1251_data_specifier_additions = - dsl::make_range<0x80, 0x81>() / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / + dsl::lit_b_range<0x80, 0x81> / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D> / lexy::dsl::lit_b<0x9F> / - dsl::make_range<0xA1, 0xA3>() / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> / + dsl::lit_b_range<0xA1, 0xA3> / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> / lexy::dsl::lit_b<0xAF> / - dsl::make_range<0xB2, 0xB4>() / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> / - dsl::make_range<0xBC, 0xBF>() / + dsl::lit_b_range<0xB2, 0xB4> / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> / + dsl::lit_b_range<0xBC, 0xBF> / lexy::dsl::lit_b<0xD7> / lexy::dsl::lit_b<0xF7>; static constexpr auto data_specifier = windows_1252_data_specifier / windows_1251_data_specifier_additions; static constexpr auto data_char_class = LEXY_CHAR_CLASS("DataSpecifier", data_specifier); + static constexpr auto utf_data_specifier = lexy::dsl::unicode::xid_continue / 
LEXY_ASCII_ONE_OF("+:@%&'-."); + + static constexpr auto utf_char_class = LEXY_CHAR_CLASS("DataSpecifier", utf_data_specifier); + static constexpr auto escaped_symbols = lexy::symbol_table<char> // .map<'"'>('"') .map<'\''>('\'') @@ -79,50 +106,121 @@ namespace ovdl::v2script::grammar { .map<'r'>('\r') .map<'t'>('\t'); - static constexpr auto id = lexy::dsl::identifier(data_char_class); + static constexpr auto id = lexy::dsl::identifier(ascii); template<ParseOptions Options> struct SimpleGrammar { struct StatementListBlock; - struct Identifier { - static constexpr auto rule = lexy::dsl::identifier(data_char_class); - static constexpr auto value = dsl::callback<ast::IdentifierValue*>( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create<ast::IdentifierValue>(ovdl::NodeLocation::make_from(lexeme.begin(), lexeme.end()), value); - }); + struct Identifier : lexy::scan_production<ast::IdentifierValue*>, + lexy::token_production { + + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + using char_type = typename encoding::char_type; + + std::basic_string<char_type> value_result; + + auto content_begin = scanner.position(); + do { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + if (lexy::scan_result<lexy::lexeme<Reader>> ascii_result; scanner.branch(ascii_result, lexy::dsl::identifier(ascii))) { + value_result.append(ascii_result.value().begin(), ascii_result.value().end()); + continue; + } + + char_type char_array[] { *scanner.position(), char_type {} }; + auto input = lexy::range_input(&char_array[0], &char_array[1]); + auto reader = input.reader(); + convert::map_value val = convert::try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + 
ConvertErrorHandler::on_invalid_character(state, reader); + continue; + } + + if (!val.is_pass()) { + // non-pass characters are not valid ascii and are mapped to utf8 values + value_result.append(val._value); + scanner.parse(data_char_class); + } else { + break; + } + } else { + auto lexeme_result = scanner.template parse<lexy::lexeme<Reader>>(lexy::dsl::identifier(utf_char_class)); + if (lexeme_result) { + value_result.append(lexeme_result.value().begin(), lexeme_result.value().size()); + break; + } + } + } while (scanner); + auto content_end = scanner.position(); + + if (value_result.empty()) { + return lexy::scan_failed; + } + + auto value = state.ast().intern(value_result); + return state.ast().template create<ast::IdentifierValue>(ovdl::NodeLocation::make_from(content_begin, content_end), value); + } + + static constexpr auto rule = dsl::peek(data_char_class, utf_char_class) >> lexy::dsl::scan; }; /* REQUIREMENTS: * DAT-633 * DAT-634 */ - struct StringExpression { - static constexpr auto rule = [] { - if constexpr (Options.NoStringEscape) { - auto c = dsl::make_range<0x20, 0xFF>() / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>; - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c); - } else { - // Arbitrary code points that aren't control characters. - auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control; - - // Escape sequences start with a backlash. - // They either map one of the symbols, - // or a Unicode code point of the form uXXXX. 
- auto escape = lexy::dsl::backslash_escape // - .symbol<escaped_symbols>(); - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c, escape); - } - }(); - - static constexpr auto value = - lexy::as_string<std::string> >> - dsl::callback<ast::StringValue*>( - [](ast::ParseState& state, const char* begin, auto&& str, const char* end) { - auto value = state.ast().intern(str.data(), str.length()); - return state.ast().create<ast::StringValue>(ovdl::NodeLocation::make_from(begin, end), value); - }); + struct StringExpression : lexy::scan_production<ast::StringValue*>, + lexy::token_production { + + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + if constexpr (Options.NoStringEscape) { + auto c = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + return dsl::lit_b_range<0x20, 0xFF> / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>; + } else { + return -lexy::dsl::unicode::control; + } + }(); + return lexy::dsl::quoted(c); + } else { + // Arbitrary code points that aren't control characters. + auto c = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control; + } else { + return -lexy::dsl::unicode::control; + } + }(); + + // Escape sequences start with a backlash. + // They either map one of the symbols, + // or a Unicode code point of the form uXXXX. 
+ auto escape = lexy::dsl::backslash_escape // + .symbol<escaped_symbols>(); + return lexy::dsl::quoted(c, escape); + } + }(); + + auto begin = scanner.position(); + lexy::scan_result<std::string> str_result; + scanner.parse(str_result, rule); + if (!scanner || !str_result) + return lexy::scan_failed; + auto end = scanner.position(); + auto str = str_result.value(); + auto value = state.ast().intern(str.data(), str.size()); + return state.ast().template create<ast::StringValue>(ovdl::NodeLocation::make_from(begin, end), value); + } + + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open()) >> lexy::dsl::scan; + static constexpr auto value = convert_as_string<std::string> >> lexy::forward<ast::StringValue*>; }; /* REQUIREMENTS: DAT-638 */ @@ -132,59 +230,112 @@ namespace ovdl::v2script::grammar { }; struct SimpleAssignmentStatement { - static constexpr auto rule = - dsl::p<Identifier> >> - (lexy::dsl::equal_sign >> - (lexy::dsl::p<ValueExpression> | lexy::dsl::recurse_branch<StatementListBlock>)); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto value_expression = lexy::dsl::p<ValueExpression>; + auto statement_list_expression = lexy::dsl::recurse_branch<StatementListBlock>; + + auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace); + auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover); + + auto identifier = + dsl::p<Identifier> >> + (lexy::dsl::equal_sign >> rhs_try); + + auto recover = lexy::dsl::recover(identifier).limit(right_brace); + return lexy::dsl::try_(identifier, recover); + }(); static constexpr auto value = construct<ast::AssignStatement>; }; /* REQUIREMENTS: DAT-639 */ struct AssignmentStatement { - static constexpr auto rule = - dsl::p<Identifier> >> + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto value_expression = lexy::dsl::p<ValueExpression>; + auto 
statement_list_expression = lexy::dsl::recurse_branch<StatementListBlock>; + + auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace); + auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover); + + auto identifier = + dsl::p<Identifier> >> (lexy::dsl::equal_sign >> - (lexy::dsl::p<ValueExpression> | lexy::dsl::recurse_branch<StatementListBlock>) | - lexy::dsl::else_ >> lexy::dsl::return_) | - dsl::p<StringExpression> | - lexy::dsl::recurse_branch<StatementListBlock>; + rhs_try | + lexy::dsl::else_ >> lexy::dsl::return_); + + auto string_expression = dsl::p<StringExpression>; + auto statement_list = lexy::dsl::recurse_branch<StatementListBlock>; + + return identifier | string_expression | statement_list; + }(); static constexpr auto value = dsl::callback<ast::Statement*>( - [](ast::ParseState& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { - return state.ast().create<ast::AssignStatement>(pos, name, initializer); + [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); }, - [](ast::ParseState& state, const char* pos, ast::Value* left, lexy::nullopt = {}) { - return state.ast().create<ast::ValueStatement>(pos, left); + [](detail::IsParseState auto& state, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); }, - [](ast::ParseState& state, ast::Value* left) { - return state.ast().create<ast::ValueStatement>(state.ast().location_of(left), left); + [](detail::IsParseState auto& state, bool&, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); + }, + [](detail::IsParseState auto& state, bool&, bool&, const 
char* pos, ast::Value* name) { + return state.ast().template create<ast::ValueStatement>(pos, name); + }, + [](detail::IsParseState auto& state, const char* pos, ast::Value* left, lexy::nullopt = {}) { + return state.ast().template create<ast::ValueStatement>(pos, left); + }, + [](detail::IsParseState auto& state, bool&, const char* pos, ast::Value* left, lexy::nullopt = {}) { + return state.ast().template create<ast::ValueStatement>(pos, left); + }, + [](detail::IsParseState auto& state, ast::Value* left) -> ast::ValueStatement* { + if (left == nullptr) return nullptr; + return state.ast().template create<ast::ValueStatement>(state.ast().location_of(left), left); + }, + [](detail::IsParseState auto& state, bool&, ast::Value* left) -> ast::ValueStatement* { + if (left == nullptr) return nullptr; + return state.ast().template create<ast::ValueStatement>(state.ast().location_of(left), left); }); }; /* REQUIREMENTS: DAT-640 */ struct StatementListBlock { - static constexpr auto rule = - dsl::curly_bracketed( - (lexy::dsl::opt(lexy::dsl::list(lexy::dsl::recurse_branch<AssignmentStatement>)) + - lexy::dsl::opt(lexy::dsl::semicolon))); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto assign_statement = lexy::dsl::recurse_branch<AssignmentStatement>; + + auto assign_try = lexy::dsl::try_(assign_statement); + auto assign_opt = lexy::dsl::opt(lexy::dsl::list(assign_try)); + + auto curly_bracket = dsl::curly_bracketed(assign_opt + lexy::dsl::opt(lexy::dsl::semicolon)); + + return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace)); + }(); static constexpr auto value = lexy::as_list<ast::StatementList> >> dsl::callback<ast::ListValue*>( - [](ast::ParseState& state, const char* begin, auto&& list, const char* end) { + [](detail::IsParseState auto& state, const char* begin, auto&& list, const char* end) { if constexpr (std::is_same_v<std::decay_t<decltype(list)>, lexy::nullopt>) { - return 
state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); } else { - return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); } }, - [](ast::ParseState& state, const char* begin, auto&& list, auto&& semicolon, const char* end) { + [](detail::IsParseState auto& state, const char* begin, auto&& list, auto&& semicolon, const char* end) { if constexpr (std::is_same_v<std::decay_t<decltype(list)>, lexy::nullopt>) { - return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); } else { - return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); } + }, + [](detail::IsParseState auto& state, lexy::nullopt fail = {}) { + return fail; }); }; }; @@ -198,22 +349,20 @@ namespace ovdl::v2script::grammar { template<ParseOptions Options> using SAssignStatement = typename SimpleGrammar<Options>::SimpleAssignmentStatement; - template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::ParseState, ast::IdentifierValue, Keyword>> + template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::IdentifierValue, Keyword>> using keyword_rule = dsl::keyword_rule< - ast::ParseState, id, ast::AssignStatement, Keyword, Production, Value>; - template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::ParseState, ast::IdentifierValue, Keyword>> + template<ovdl::detail::string_literal Keyword, auto Production, auto Value = 
dsl::default_kw_value<ast::IdentifierValue, Keyword>> using fkeyword_rule = dsl::fkeyword_rule< - ast::ParseState, id, ast::AssignStatement, Keyword, Production, Value>; template<ParseOptions Options> - struct File { + struct BasicFile { // Allow arbitrary spaces between individual tokens. static constexpr auto whitespace = whitespace_specifier | comment_specifier; @@ -223,4 +372,6 @@ namespace ovdl::v2script::grammar { static constexpr auto value = lexy::as_list<ast::StatementList> >> construct<ast::FileTree>; }; + + using File = BasicFile<NoStringEscapeOption>; } |