From b0c3ba3f91926b0c95625bdbf4aab69269130b13 Mon Sep 17 00:00:00 2001 From: Spartan322 Date: Thu, 9 May 2024 10:06:02 -0400 Subject: Add runtime encoding detection and conversion Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng Add manually-specified encoding fallback Add default system encoding fallback Add error recovery to v2script Add unknown encoding detection warning Remove csv::Parser templating Fix lua files dropping data Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0 Remove exclusive reliance on lexy::default_encoding for v2script Move internal concepts to src/openvic-detail/InternalConcepts.hpp Move contents of DetectUtf8.hpp to src/detail/Detect.hpp Move openvic-dataloader/AbstractSyntaxTree.hpp to src Move DiagnosticLogger.hpp to src Move File.hpp to src Move openvic-dataloader/detail/utlity files to openvic-dataloader/detail Add ovdl::utility::type_concat Add ovdl::utility::type_prepend Add ovdl::utility::is_instance_of Overhaul parse error messages --- deps/lexy | 2 +- include/openvic-dataloader/AbstractSyntaxTree.hpp | 107 ---- include/openvic-dataloader/DiagnosticLogger.hpp | 395 ------------- include/openvic-dataloader/Error.hpp | 71 ++- include/openvic-dataloader/File.hpp | 69 --- include/openvic-dataloader/NodeLocation.hpp | 68 ++- include/openvic-dataloader/ParseData.hpp | 11 - include/openvic-dataloader/ParseError.hpp | 20 - include/openvic-dataloader/ParseState.hpp | 120 ---- include/openvic-dataloader/ParseWarning.hpp | 10 - include/openvic-dataloader/Parser.hpp | 3 - include/openvic-dataloader/csv/LineObject.hpp | 2 +- include/openvic-dataloader/csv/Parser.hpp | 41 +- include/openvic-dataloader/detail/Concepts.hpp | 49 ++ include/openvic-dataloader/detail/Constexpr.hpp | 15 + include/openvic-dataloader/detail/Encoding.hpp | 13 + include/openvic-dataloader/detail/ErrorRange.hpp | 10 + .../detail/LexyFwdDeclaration.hpp | 8 - .../openvic-dataloader/detail/LexyReportError.hpp | 
107 ---- .../detail/OStreamOutputIterator.hpp | 1 - include/openvic-dataloader/detail/Utility.hpp | 89 +++ .../openvic-dataloader/detail/utility/Concepts.hpp | 45 -- .../detail/utility/Constexpr.hpp | 15 - .../detail/utility/ErrorRange.hpp | 10 - .../detail/utility/PointerHash.hpp | 23 - .../openvic-dataloader/detail/utility/SelfType.hpp | 28 - .../openvic-dataloader/detail/utility/TypeName.hpp | 52 -- .../openvic-dataloader/detail/utility/Utility.hpp | 38 -- .../v2script/AbstractSyntaxTree.hpp | 28 +- include/openvic-dataloader/v2script/Parser.hpp | 30 +- src/headless/main.cpp | 24 +- src/openvic-dataloader/AbstractSyntaxTree.cpp | 2 +- src/openvic-dataloader/AbstractSyntaxTree.hpp | 89 +++ src/openvic-dataloader/DiagnosticLogger.cpp | 5 +- src/openvic-dataloader/DiagnosticLogger.hpp | 492 ++++++++++++++++ src/openvic-dataloader/File.cpp | 12 +- src/openvic-dataloader/File.hpp | 139 +++++ src/openvic-dataloader/NodeLocation.cpp | 26 - src/openvic-dataloader/ParseState.hpp | 105 ++++ src/openvic-dataloader/csv/CsvGrammar.hpp | 244 ++++---- src/openvic-dataloader/csv/CsvParseState.hpp | 26 +- src/openvic-dataloader/csv/Parser.cpp | 182 +++--- src/openvic-dataloader/detail/Convert.hpp | 577 +++++++++++++++++++ src/openvic-dataloader/detail/Detect.cpp | 351 ++++++++++++ src/openvic-dataloader/detail/Detect.hpp | 627 +++++++++++++++++++++ src/openvic-dataloader/detail/DetectUtf8.hpp | 53 -- src/openvic-dataloader/detail/Errors.hpp | 25 - src/openvic-dataloader/detail/InternalConcepts.hpp | 127 +++++ src/openvic-dataloader/detail/ParseHandler.cpp | 347 ++++++++++++ src/openvic-dataloader/detail/ParseHandler.hpp | 199 ++++--- src/openvic-dataloader/detail/Warnings.hpp | 9 +- src/openvic-dataloader/detail/dsl.hpp | 194 ++++++- .../v2script/AbstractSyntaxTree.cpp | 53 +- src/openvic-dataloader/v2script/EventGrammar.hpp | 8 +- .../v2script/LuaDefinesGrammar.hpp | 133 +++-- .../v2script/ModifierGrammar.hpp | 14 +- src/openvic-dataloader/v2script/ParseState.hpp | 19 +- 
src/openvic-dataloader/v2script/Parser.cpp | 195 ++++--- src/openvic-dataloader/v2script/SimpleGrammar.hpp | 307 +++++++--- 59 files changed, 4177 insertions(+), 1887 deletions(-) delete mode 100644 include/openvic-dataloader/AbstractSyntaxTree.hpp delete mode 100644 include/openvic-dataloader/DiagnosticLogger.hpp delete mode 100644 include/openvic-dataloader/File.hpp delete mode 100644 include/openvic-dataloader/ParseData.hpp delete mode 100644 include/openvic-dataloader/ParseError.hpp delete mode 100644 include/openvic-dataloader/ParseState.hpp delete mode 100644 include/openvic-dataloader/ParseWarning.hpp create mode 100644 include/openvic-dataloader/detail/Concepts.hpp create mode 100644 include/openvic-dataloader/detail/Constexpr.hpp create mode 100644 include/openvic-dataloader/detail/Encoding.hpp create mode 100644 include/openvic-dataloader/detail/ErrorRange.hpp delete mode 100644 include/openvic-dataloader/detail/LexyFwdDeclaration.hpp delete mode 100644 include/openvic-dataloader/detail/LexyReportError.hpp create mode 100644 include/openvic-dataloader/detail/Utility.hpp delete mode 100644 include/openvic-dataloader/detail/utility/Concepts.hpp delete mode 100644 include/openvic-dataloader/detail/utility/Constexpr.hpp delete mode 100644 include/openvic-dataloader/detail/utility/ErrorRange.hpp delete mode 100644 include/openvic-dataloader/detail/utility/PointerHash.hpp delete mode 100644 include/openvic-dataloader/detail/utility/SelfType.hpp delete mode 100644 include/openvic-dataloader/detail/utility/TypeName.hpp delete mode 100644 include/openvic-dataloader/detail/utility/Utility.hpp create mode 100644 src/openvic-dataloader/AbstractSyntaxTree.hpp create mode 100644 src/openvic-dataloader/DiagnosticLogger.hpp create mode 100644 src/openvic-dataloader/File.hpp delete mode 100644 src/openvic-dataloader/NodeLocation.cpp create mode 100644 src/openvic-dataloader/ParseState.hpp create mode 100644 src/openvic-dataloader/detail/Convert.hpp create mode 100644 
src/openvic-dataloader/detail/Detect.cpp create mode 100644 src/openvic-dataloader/detail/Detect.hpp delete mode 100644 src/openvic-dataloader/detail/DetectUtf8.hpp delete mode 100644 src/openvic-dataloader/detail/Errors.hpp create mode 100644 src/openvic-dataloader/detail/InternalConcepts.hpp create mode 100644 src/openvic-dataloader/detail/ParseHandler.cpp diff --git a/deps/lexy b/deps/lexy index 721bf9b..1e5d99f 160000 --- a/deps/lexy +++ b/deps/lexy @@ -1 +1 @@ -Subproject commit 721bf9b2a4f3a10cdcb51888eeec873bbc5c7b1a +Subproject commit 1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0 diff --git a/include/openvic-dataloader/AbstractSyntaxTree.hpp b/include/openvic-dataloader/AbstractSyntaxTree.hpp deleted file mode 100644 index c6453e3..0000000 --- a/include/openvic-dataloader/AbstractSyntaxTree.hpp +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -namespace ovdl { - struct AbstractSyntaxTree : SymbolIntern { - symbol_type intern(const char* str, std::size_t length); - symbol_type intern(std::string_view str); - const char* intern_cstr(const char* str, std::size_t length); - const char* intern_cstr(std::string_view str); - symbol_interner_type& symbol_interner(); - const symbol_interner_type& symbol_interner() const; - - protected: - symbol_interner_type _symbol_interner; - }; - - template - concept IsAst = - std::derived_from && - requires( - T t, - const T ct, - const typename T::node_type* node, - NodeLocation loc // - ) { - requires IsFile; - typename T::root_node_type; - typename T::node_type; - requires std::derived_from; - { t.set_location(node, loc) } -> std::same_as; - { t.location_of(node) } -> std::same_as; - { t.root() } -> std::same_as; - { ct.root() } -> std::same_as; - { t.file() } -> std::same_as; - { ct.file() } -> std::same_as; - }; - - template RootNodeT> - struct BasicAbstractSyntaxTree : AbstractSyntaxTree { - 
using file_type = FileT; - using root_node_type = RootNodeT; - using node_type = typename file_type::node_type; - - explicit BasicAbstractSyntaxTree(file_type&& file) : _file(std::move(file)) {} - explicit BasicAbstractSyntaxTree(lexy::buffer&& buffer) : _file(std::move(buffer)) {} - - void set_location(const node_type* n, NodeLocation loc) { - _file.set_location(n, loc); - } - - NodeLocation location_of(const node_type* n) const { - return _file.location_of(n); - } - - root_node_type* root() { - return _tree.root(); - } - - const root_node_type* root() const { - return _tree.root(); - } - - file_type& file() { - return _file; - } - - const file_type& file() const { - return _file; - } - - template - T* create(NodeLocation loc, Args&&... args) { - auto node = _tree.template create(DRYAD_FWD(args)...); - set_location(node, loc); - return node; - } - - template - T* create(const char* begin, const char* end, Args&&... args) { - return create(NodeLocation::make_from(begin, end), DRYAD_FWD(args)...); - } - - void set_root(root_node_type* node) { - _tree.set_root(node); - } - - protected: - dryad::tree _tree; - file_type _file; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/DiagnosticLogger.hpp b/include/openvic-dataloader/DiagnosticLogger.hpp deleted file mode 100644 index bd8f9cc..0000000 --- a/include/openvic-dataloader/DiagnosticLogger.hpp +++ /dev/null @@ -1,395 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include - -namespace ovdl { - struct DiagnosticLogger : SymbolIntern { - using AnnotationKind = lexy_ext::annotation_kind; - using DiagnosticKind = lexy_ext::diagnostic_kind; - - using error_range = detail::error_range; - - explicit operator bool() const; - bool errored() const; - bool 
warned() const; - - NodeLocation location_of(const error::Error* error) const; - - template Logger> - struct ErrorCallback { - ErrorCallback(Logger& logger) : _logger(&logger) {} - - struct sink_t { - using return_type = std::size_t; - - template - void operator()(lexy::error_context const& context, lexy::error_for const& error) { - using Reader = lexy::input_reader; - error::Error* result; - - std::string production_name = context.production(); - auto left_strip = production_name.find_first_of('<'); - if (left_strip != std::string::npos) { - auto right_strip = production_name.find_first_of('>', left_strip); - if (right_strip != std::string::npos) { - production_name.erase(left_strip, right_strip - left_strip + 1); - } - } - - auto production = _logger.intern_cstr(production_name); - if constexpr (std::is_same_v) { - auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); - NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1); - auto message = _logger.intern_cstr(fmt::format("expected '{}'", string.data())); - result = _logger.template create(loc, message, production); - } else if constexpr (std::is_same_v) { - auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); - NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1); - auto message = _logger.intern_cstr(fmt::format("expected keyword '{}'", string.data())); - result = _logger.template create(loc, message, production); - } else if constexpr (std::is_same_v) { - auto message = _logger.intern_cstr(fmt::format("expected {}", error.name())); - result = _logger.template create(error.position(), message, production); - } else { - NodeLocation loc = NodeLocation::make_from(error.begin(), error.end()); - auto message = _logger.intern_cstr(error.message()); - result = _logger.template create(loc, message, production); - } - - _logger.insert(result); - - _count++; - } - - std::size_t finish() && { - return 
_count; - } - - Logger& _logger; - std::size_t _count; - }; - - constexpr auto sink() const { - return sink_t { *_logger, 0 }; - } - - mutable Logger* _logger; - }; - - template - T* create(NodeLocation loc, Args&&... args) { - using node_creator = dryad::node_creator; - T* result = _tree.create(DRYAD_FWD(args)...); - _map.insert(result, loc); - return result; - } - - template - T* create() { - using node_creator = dryad::node_creator; - T* result = _tree.create(); - return result; - } - - protected: - bool _errored = false; - bool _warned = false; - dryad::node_map _map; - dryad::tree _tree; - - symbol_interner_type _symbol_interner; - - void insert(error::Error* root) { - _tree.root()->insert_back(root); - } - - public: - symbol_type intern(const char* str, std::size_t length) { - return _symbol_interner.intern(str, length); - } - symbol_type intern(std::string_view str) { - return intern(str.data(), str.size()); - } - const char* intern_cstr(const char* str, std::size_t length) { - return intern(str, length).c_str(_symbol_interner); - } - const char* intern_cstr(std::string_view str) { - return intern_cstr(str.data(), str.size()); - } - symbol_interner_type& symbol_interner() { - return _symbol_interner; - } - const symbol_interner_type& symbol_interner() const { - return _symbol_interner; - } - }; - - template - struct BasicDiagnosticLogger : DiagnosticLogger { - using file_type = FileT; - - template - using format_str = fmt::basic_format_string...>; - - explicit BasicDiagnosticLogger(const file_type& file) - : _file(&file) { - _tree.set_root(_tree.create()); - } - - struct Writer; - - template - Writer error(format_str fmt, Args&&... args) { - return log(DiagnosticKind::error, fmt, std::forward(args)...); - } - - template - Writer warning(format_str fmt, Args&&... args) { - return log(DiagnosticKind::warning, fmt, std::forward(args)...); - } - - template - Writer note(format_str fmt, Args&&... 
args) { - return log(DiagnosticKind::note, fmt, std::forward(args)...); - } - - template - Writer info(format_str fmt, Args&&... args) { - return log(DiagnosticKind::info, fmt, std::forward(args)...); - } - - template - Writer debug(format_str fmt, Args&&... args) { - return log(DiagnosticKind::debug, fmt, std::forward(args)...); - } - - template - Writer fixit(format_str fmt, Args&&... args) { - return log(DiagnosticKind::fixit, fmt, std::forward(args)...); - } - - template - Writer help(format_str fmt, Args&&... args) { - return log(DiagnosticKind::help, fmt, std::forward(args)...); - } - - Writer error(std::string_view sv) { - return log(DiagnosticKind::error, fmt::runtime(sv)); - } - - Writer warning(std::string_view sv) { - return log(DiagnosticKind::warning, fmt::runtime(sv)); - } - - Writer note(std::string_view sv) { - return log(DiagnosticKind::note, fmt::runtime(sv)); - } - - Writer info(std::string_view sv) { - return log(DiagnosticKind::info, fmt::runtime(sv)); - } - - Writer debug(std::string_view sv) { - return log(DiagnosticKind::debug, fmt::runtime(sv)); - } - - Writer fixit(std::string_view sv) { - return log(DiagnosticKind::fixit, fmt::runtime(sv)); - } - - Writer help(std::string_view sv) { - return log(DiagnosticKind::help, fmt::runtime(sv)); - } - - auto error_callback() { - return ErrorCallback(*this); - } - - template - static void _write_to_buffer(const CharT* s, std::streamsize n, void* output_str) { - auto* output = reinterpret_cast*>(output_str); - output->append(s, n); - } - - template - auto make_callback_stream(std::basic_string& output) { - return detail::make_callback_stream(&_write_to_buffer, reinterpret_cast(&output)); - } - - template - detail::OStreamOutputIterator make_ostream_iterator(std::basic_ostream& stream) { - return detail::OStreamOutputIterator { stream }; - } - - struct Writer { - template - [[nodiscard]] Writer& primary(NodeLocation loc, format_str fmt, Args&&... 
args) { - return annotation(AnnotationKind::primary, loc, fmt, std::forward(args)...); - } - - template - [[nodiscard]] Writer& secondary(NodeLocation loc, format_str fmt, Args&&... args) { - return annotation(AnnotationKind::secondary, loc, fmt, std::forward(args)...); - } - - [[nodiscard]] Writer& primary(NodeLocation loc, std::string_view sv) { - return annotation(AnnotationKind::primary, loc, fmt::runtime(sv)); - } - - [[nodiscard]] Writer& secondary(NodeLocation loc, std::string_view sv) { - return annotation(AnnotationKind::secondary, loc, fmt::runtime(sv)); - } - - void finish() {} - - template - [[nodiscard]] Writer& annotation(AnnotationKind kind, NodeLocation loc, format_str fmt, Args&&... args) { - auto begin_loc = lexy::get_input_location(_file->buffer(), loc.begin()); - - std::basic_string output; - auto stream = _logger.make_callback_stream(output); - auto iter = _logger.make_ostream_iterator(stream); - - _impl.write_empty_annotation(iter); - _impl.write_annotation(iter, kind, begin_loc, loc.end(), - [&](auto out, lexy::visualization_options) { - return lexy::_detail::write_str(out, fmt::format(fmt, std::forward(args)...).c_str()); - }); - - error::Annotation* annotation; - auto message = _logger.intern_cstr(output); - switch (kind) { - case AnnotationKind::primary: - annotation = _logger.create(loc, message); - break; - case AnnotationKind::secondary: - annotation = _logger.create(loc, message); - break; - default: detail::unreachable(); - } - _semantic->push_back(annotation); - return *this; - } - - private: - Writer(BasicDiagnosticLogger& logger, const file_type* file, error::Semantic* semantic) - : _file(file), - _impl(file->buffer(), { lexy::visualize_fancy }), - _logger(logger), - _semantic(semantic) {} - - const file_type* _file; - lexy_ext::diagnostic_writer> _impl; - BasicDiagnosticLogger& _logger; - error::Semantic* _semantic; - - friend BasicDiagnosticLogger; - }; - - using diagnostic_writer = lexy_ext::diagnostic_writer>; - - template T, 
typename... Args> - void log_with_impl(diagnostic_writer& impl, T* error, DiagnosticKind kind, format_str fmt, Args&&... args) { - std::basic_string output; - auto stream = make_callback_stream(output); - auto iter = make_ostream_iterator(stream); - - impl.write_message(iter, kind, - [&](auto out, lexy::visualization_options) { - return lexy::_detail::write_str(out, fmt::format(fmt, std::forward(args)...).c_str()); - }); - impl.write_path(iter, _file->path()); - - auto message = intern_cstr(output); - error->_set_message(message); - insert(error); - } - - template T, typename... Args> - void log_with_error(T* error, DiagnosticKind kind, format_str fmt, Args&&... args) { - auto impl = diagnostic_writer { _file->buffer() }; - log_with_impl(impl, error, kind, fmt, std::forward(args)...); - } - - template T, typename... Args> - void create_log(DiagnosticKind kind, format_str fmt, Args&&... args) { - log_with_error(create(), kind, fmt, std::forward(args)...); - } - - template - Writer log(DiagnosticKind kind, format_str fmt, Args&&... 
args) { - error::Semantic* semantic; - - switch (kind) { - case DiagnosticKind::error: - semantic = create(); - break; - case DiagnosticKind::warning: - semantic = create(); - break; - case DiagnosticKind::info: - semantic = create(); - break; - case DiagnosticKind::debug: - semantic = create(); - break; - case DiagnosticKind::fixit: - semantic = create(); - break; - case DiagnosticKind::help: - semantic = create(); - break; - default: detail::unreachable(); - } - - Writer result(*this, _file, semantic); - - log_with_impl(result._impl, semantic, kind, fmt, std::forward(args)...); - - if (kind == DiagnosticKind::error) - _errored = true; - if (kind == DiagnosticKind::warning) - _warned = true; - - return result; - } - - error_range get_errors() const { - return _tree.root()->errors(); - } - - private: - const file_type* _file; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/Error.hpp b/include/openvic-dataloader/Error.hpp index 726079c..a2e13fe 100644 --- a/include/openvic-dataloader/Error.hpp +++ b/include/openvic-dataloader/Error.hpp @@ -3,14 +3,13 @@ #include #include -#include -#include +#include #include #include namespace ovdl { - template + template struct BasicDiagnosticLogger; } @@ -40,6 +39,10 @@ namespace ovdl::error { FirstSemantic = SemanticError, LastSemantic = SemanticHelp, + // Annotated Error // + FirstAnnotatedError = FirstParseError, + LastAnnotatedError = LastSemantic, + PrimaryAnnotation, SecondaryAnnotation, @@ -59,15 +62,15 @@ namespace ovdl::error { } struct Error : dryad::abstract_node_all { - std::string_view message() const { return _message; } + const char* message() const { return _message; } protected: DRYAD_ABSTRACT_NODE_CTOR(Error); void _set_message(const char* message) { _message = message; } - const char* _message; + const char* _message = ""; - template + template friend struct ovdl::BasicDiagnosticLogger; }; @@ -98,7 +101,30 @@ namespace ovdl::error { explicit BufferError(dryad::node_ctor ctor) : 
node_base(ctor) {} }; - struct ParseError : dryad::abstract_node_range { + struct Annotation : dryad::abstract_node_range { + protected: + explicit Annotation(dryad::node_ctor ctor, ErrorKind kind, const char* message) : node_base(ctor, kind) { + _set_message(message); + } + }; + + struct AnnotatedError : dryad::abstract_node_range, ErrorKind::FirstAnnotatedError, ErrorKind::LastAnnotatedError> { + DRYAD_CHILD_NODE_RANGE_GETTER(Annotation, annotations, nullptr, this->node_after(_last_annotation)); + + void push_back(Annotation* annotation); + void push_back(AnnotationList p_annotations); + + protected: + explicit AnnotatedError(dryad::node_ctor ctor, ErrorKind kind) : node_base(ctor, kind) { + insert_child_list_after(nullptr, AnnotationList {}); + _last_annotation = nullptr; + } + + private: + Annotation* _last_annotation; + }; + + struct ParseError : dryad::abstract_node_range { std::string_view production_name() const { return _production_name; } protected: @@ -116,8 +142,10 @@ namespace ovdl::error { template struct _ParseError_t : dryad::basic_node { + using base_node = dryad::basic_node; + explicit _ParseError_t(dryad::node_ctor ctor, const char* message, const char* production_name) - : dryad::basic_node(ctor, message, production_name) {} + : base_node(ctor, message, production_name) {} }; using ExpectedLiteral = _ParseError_t; @@ -125,30 +153,21 @@ namespace ovdl::error { using ExpectedCharClass = _ParseError_t; using GenericParseError = _ParseError_t; - struct Semantic : dryad::abstract_node_range, ErrorKind::FirstSemantic, ErrorKind::LastSemantic> { - DRYAD_CHILD_NODE_RANGE_GETTER(Annotation, annotations, nullptr, this->node_after(_last_annotation)); - - void push_back(Annotation* annotation); - void push_back(AnnotationList p_annotations); - + struct Semantic : dryad::abstract_node_range { protected: explicit Semantic(dryad::node_ctor ctor, ErrorKind kind) : node_base(ctor, kind) {}; explicit Semantic(dryad::node_ctor ctor, ErrorKind kind, const char* 
message) : node_base(ctor, kind) { - insert_child_list_after(nullptr, AnnotationList {}); _set_message(message); }; explicit Semantic(dryad::node_ctor ctor, ErrorKind kind, const char* message, AnnotationList annotations) : node_base(ctor, kind) { - insert_child_list_after(nullptr, annotations); + push_back(annotations); _set_message(message); }; - - private: - Error* _last_annotation; }; template @@ -172,13 +191,6 @@ namespace ovdl::error { using SemanticFixit = _SemanticError_t; using SemanticHelp = _SemanticError_t; - struct Annotation : dryad::abstract_node_range { - protected: - explicit Annotation(dryad::node_ctor ctor, ErrorKind kind, const char* message) : node_base(ctor, kind) { - _set_message(message); - } - }; - template struct _Annotation_t : dryad::basic_node { explicit _Annotation_t(dryad::node_ctor ctor, const char* message) @@ -188,12 +200,13 @@ namespace ovdl::error { using PrimaryAnnotation = _Annotation_t; using SecondaryAnnotation = _Annotation_t; - inline void Semantic::push_back(Annotation* annotation) { - insert_child_after(annotations().end().deref(), annotation); + inline void AnnotatedError::push_back(Annotation* annotation) { + insert_child_after(_last_annotation, annotation); _last_annotation = annotation; } - inline void Semantic::push_back(AnnotationList p_annotations) { + inline void AnnotatedError::push_back(AnnotationList p_annotations) { + if (p_annotations.empty()) return; insert_child_list_after(annotations().end().deref(), p_annotations); _last_annotation = *p_annotations.end(); } diff --git a/include/openvic-dataloader/File.hpp b/include/openvic-dataloader/File.hpp deleted file mode 100644 index caa4a0a..0000000 --- a/include/openvic-dataloader/File.hpp +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include - -#include -#include - -#include - -namespace ovdl { - template - concept IsEncoding = requires(T t) { - typename T::char_type; - typename T::int_type; - { T::template is_secondary_char_type() } -> std::same_as; - { 
T::eof() } -> std::same_as; - { T::to_int_type(typename T::char_type {}) } -> std::same_as; - }; - - struct File { - explicit File(const char* path); - - const char* path() const noexcept; - - protected: - const char* _path; - }; - - template - concept IsFile = - std::derived_from && IsEncoding && - requires(T t, const typename T::node_type* node, NodeLocation location) { - { t.buffer() } -> std::same_as&>; - { t.set_location(node, location) } -> std::same_as; - { t.location_of(node) } -> std::same_as; - }; - - template - struct BasicFile : File { - using encoding_type = EncodingT; - using node_type = NodeT; - - explicit BasicFile(const char* path, lexy::buffer&& buffer) - : File(path), - _buffer(static_cast&&>(buffer)) {} - - explicit BasicFile(lexy::buffer&& buffer) - : File(""), - _buffer(static_cast&&>(buffer)) {} - - const lexy::buffer& buffer() const { - return _buffer; - } - - void set_location(const node_type* n, NodeLocation loc) { - _map.insert(n, loc); - } - - NodeLocation location_of(const node_type* n) const { - auto result = _map.lookup(n); - DRYAD_ASSERT(result != nullptr, "every Node should have a NodeLocation"); - return *result; - } - - protected: - lexy::buffer _buffer; - dryad::node_map _map; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/NodeLocation.hpp b/include/openvic-dataloader/NodeLocation.hpp index 117560b..ced79e6 100644 --- a/include/openvic-dataloader/NodeLocation.hpp +++ b/include/openvic-dataloader/NodeLocation.hpp @@ -3,28 +3,68 @@ #include namespace ovdl { - struct NodeLocation { - const char* _begin = nullptr; - const char* _end = nullptr; + template + struct BasicNodeLocation { + using char_type = CharT; - NodeLocation(); - NodeLocation(const char* pos); - NodeLocation(const char* begin, const char* end); + const char_type* _begin = nullptr; + const char_type* _end = nullptr; - NodeLocation(const NodeLocation&) noexcept; - NodeLocation& operator=(const NodeLocation&); + BasicNodeLocation() = default; 
+ BasicNodeLocation(const char_type* pos) : _begin(pos), + _end(pos) {} + BasicNodeLocation(const char_type* begin, const char_type* end) : _begin(begin), + _end(end) {} - NodeLocation(NodeLocation&&); - NodeLocation& operator=(NodeLocation&&); + BasicNodeLocation(const BasicNodeLocation&) noexcept = default; + BasicNodeLocation& operator=(const BasicNodeLocation&) = default; - const char* begin() const; - const char* end() const; + BasicNodeLocation(BasicNodeLocation&&) = default; + BasicNodeLocation& operator=(BasicNodeLocation&&) = default; - bool is_synthesized() const; + template + void set_from(const BasicNodeLocation& other) { + if constexpr (sizeof(CharT) <= sizeof(OtherCharT)) { + _begin = reinterpret_cast(other.begin()); + if (other.begin() == other.end()) + _end = _begin; + else + _end = reinterpret_cast(other.end()) + (sizeof(OtherCharT) - sizeof(CharT)); + } else { + _begin = reinterpret_cast(other.begin()); + if (other.end() - other.begin() <= 0) { + _end = reinterpret_cast(other.begin()); + } else { + _end = reinterpret_cast(other.end() - (sizeof(CharT) - sizeof(OtherCharT))); + } + } + } - static NodeLocation make_from(const char* begin, const char* end); + template + BasicNodeLocation(const BasicNodeLocation& other) { + set_from(other); + } + + template + BasicNodeLocation& operator=(const BasicNodeLocation& other) { + set_from(other); + return *this; + } + + const char_type* begin() const { return _begin; } + const char_type* end() const { return _end; } + + bool is_synthesized() const { return _begin == nullptr && _end == nullptr; } + + static BasicNodeLocation make_from(const char_type* begin, const char_type* end) { + end++; + if (begin >= end) return BasicNodeLocation(begin); + return BasicNodeLocation(begin, end); + } }; + using NodeLocation = BasicNodeLocation; + struct FilePosition { std::uint32_t start_line = std::uint32_t(-1), end_line = std::uint32_t(-1), start_column = std::uint32_t(-1), end_column = std::uint32_t(-1); diff --git 
a/include/openvic-dataloader/ParseData.hpp b/include/openvic-dataloader/ParseData.hpp deleted file mode 100644 index 8bec7d2..0000000 --- a/include/openvic-dataloader/ParseData.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace ovdl { - struct ParseData { - const std::string production_name; - const unsigned int context_start_line; - const unsigned int context_start_column; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/ParseError.hpp b/include/openvic-dataloader/ParseError.hpp deleted file mode 100644 index 9e4541e..0000000 --- a/include/openvic-dataloader/ParseError.hpp +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include - -#include - -namespace ovdl { - struct ParseError { - const enum class Type : unsigned char { - Recoverable, - Fatal - } type; - const std::string message; - const int error_value; - const ParseData parse_data; - const unsigned int start_line; - const unsigned int start_column; - }; - -} \ No newline at end of file diff --git a/include/openvic-dataloader/ParseState.hpp b/include/openvic-dataloader/ParseState.hpp deleted file mode 100644 index 5655606..0000000 --- a/include/openvic-dataloader/ParseState.hpp +++ /dev/null @@ -1,120 +0,0 @@ -#pragma once - -#include - -#include -#include - -#include - -namespace ovdl { - template - concept IsParseState = requires( - T t, - const T ct, - typename T::ast_type::file_type&& file, - lexy::buffer&& buffer, - const char* path // - ) { - requires IsAst; - requires std::derived_from; - { T { std::move(file) } } -> std::same_as; - { T { std::move(buffer) } } -> std::same_as; - { T { path, std::move(buffer) } } -> std::same_as; - { t.ast() } -> std::same_as; - { ct.ast() } -> std::same_as; - { t.logger() } -> std::same_as; - { ct.logger() } -> std::same_as; - }; - - template - struct ParseState { - using ast_type = AstT; - using diagnostic_logger_type = BasicDiagnosticLogger; - - ParseState(typename ast_type::file_type&& file) - : _ast { 
std::move(file) }, - _logger { _ast.file() } {} - - ParseState(lexy::buffer&& buffer) - : ParseState(typename ast_type::file_type { std::move(buffer) }) {} - - ParseState(const char* path, lexy::buffer&& buffer) - : ParseState(typename ast_type::file_type { path, std::move(buffer) }) {} - - ast_type& ast() { - return _ast; - } - - const ast_type& ast() const { - return _ast; - } - - diagnostic_logger_type& logger() { - return _logger; - } - - const diagnostic_logger_type& logger() const { - return _logger; - } - - private: - ast_type _ast; - diagnostic_logger_type _logger; - }; - - template - concept IsFileParseState = requires( - T t, - const T ct, - typename T::file_type&& file, - lexy::buffer&& buffer, - const char* path // - ) { - requires IsFile; - requires std::derived_from; - { T { std::move(file) } } -> std::same_as; - { T { std::move(buffer) } } -> std::same_as; - { T { path, std::move(buffer) } } -> std::same_as; - { t.file() } -> std::same_as; - { ct.file() } -> std::same_as; - { t.logger() } -> std::same_as; - { ct.logger() } -> std::same_as; - }; - - template - struct FileParseState { - using file_type = FileT; - using diagnostic_logger_type = BasicDiagnosticLogger; - - FileParseState(file_type&& file) - : _file { std::move(file) }, - _logger { file } {} - - FileParseState(lexy::buffer&& buffer) - : FileParseState(file_type { std::move(buffer) }) {} - - FileParseState(const char* path, lexy::buffer&& buffer) - : FileParseState(file_type { path, std::move(buffer) }) {} - - file_type& file() { - return _file; - } - - const file_type& file() const { - return _file; - } - - diagnostic_logger_type& logger() { - return _logger; - } - - const diagnostic_logger_type& logger() const { - return _logger; - } - - private: - file_type _file; - diagnostic_logger_type _logger; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/ParseWarning.hpp b/include/openvic-dataloader/ParseWarning.hpp deleted file mode 100644 index 307599f..0000000 --- 
a/include/openvic-dataloader/ParseWarning.hpp +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -namespace ovdl { - struct ParseWarning { - const std::string message; - const int warning_value; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/Parser.hpp b/include/openvic-dataloader/Parser.hpp index b885f3d..ba390e7 100644 --- a/include/openvic-dataloader/Parser.hpp +++ b/include/openvic-dataloader/Parser.hpp @@ -3,9 +3,6 @@ #include #include -#include -#include - namespace ovdl::detail { struct BasicParser { BasicParser(); diff --git a/include/openvic-dataloader/csv/LineObject.hpp b/include/openvic-dataloader/csv/LineObject.hpp index ca632cd..c839be2 100644 --- a/include/openvic-dataloader/csv/LineObject.hpp +++ b/include/openvic-dataloader/csv/LineObject.hpp @@ -13,7 +13,7 @@ #include #include -#include +#include namespace ovdl::csv { /// LineObject should be able to recognize the differences between: diff --git a/include/openvic-dataloader/csv/Parser.hpp b/include/openvic-dataloader/csv/Parser.hpp index 06e7251..35421c8 100644 --- a/include/openvic-dataloader/csv/Parser.hpp +++ b/include/openvic-dataloader/csv/Parser.hpp @@ -1,41 +1,38 @@ #pragma once #include +#include #include +#include #include #include -#include -#include +#include +#include +#include #include namespace ovdl::csv { - enum class EncodingType { - Windows1252, - Utf8 - }; - - template class Parser final : public detail::BasicParser { public: Parser(); Parser(std::basic_ostream& error_stream); - static Parser from_buffer(const char* data, std::size_t size); - static Parser from_buffer(const char* start, const char* end); - static Parser from_string(const std::string_view string); - static Parser from_file(const char* path); - static Parser from_file(const std::filesystem::path& path); + static Parser from_buffer(const char* data, std::size_t size, std::optional encoding_fallback = std::nullopt); + static Parser from_buffer(const char* start, const char* end, 
std::optional encoding_fallback = std::nullopt); + static Parser from_string(const std::string_view string, std::optional encoding_fallback = std::nullopt); + static Parser from_file(const char* path, std::optional encoding_fallback = std::nullopt); + static Parser from_file(const std::filesystem::path& path, std::optional encoding_fallback = std::nullopt); - constexpr Parser& load_from_buffer(const char* data, std::size_t size); - constexpr Parser& load_from_buffer(const char* start, const char* end); - constexpr Parser& load_from_string(const std::string_view string); - Parser& load_from_file(const char* path); - Parser& load_from_file(const std::filesystem::path& path); + constexpr Parser& load_from_buffer(const char* data, std::size_t size, std::optional encoding_fallback = std::nullopt); + constexpr Parser& load_from_buffer(const char* start, const char* end, std::optional encoding_fallback = std::nullopt); + constexpr Parser& load_from_string(const std::string_view string, std::optional encoding_fallback = std::nullopt); + Parser& load_from_file(const char* path, std::optional encoding_fallback = std::nullopt); + Parser& load_from_file(const std::filesystem::path& path, std::optional encoding_fallback = std::nullopt); - constexpr Parser& load_from_file(const detail::HasCstr auto& path) { - return load_from_file(path.c_str()); + constexpr Parser& load_from_file(const detail::HasCstr auto& path, std::optional encoding_fallback = std::nullopt) { + return load_from_file(path.c_str(), encoding_fallback); } bool parse_csv(bool handle_strings = false); @@ -57,12 +54,8 @@ namespace ovdl::csv { private: class ParseHandler; std::unique_ptr _parse_handler; - std::vector _lines; template constexpr void _run_load_func(detail::LoadCallback auto func, Args... 
args); }; - - using Windows1252Parser = Parser; - using Utf8Parser = Parser; } \ No newline at end of file diff --git a/include/openvic-dataloader/detail/Concepts.hpp b/include/openvic-dataloader/detail/Concepts.hpp new file mode 100644 index 0000000..79e04a9 --- /dev/null +++ b/include/openvic-dataloader/detail/Concepts.hpp @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include +#include + +namespace ovdl { + struct File; + namespace detail { + enum class buffer_error : std::uint8_t; + } +} + +namespace ovdl::detail { + template + concept any_of = std::disjunction_v...>; + + template + concept HasCstr = + requires(T t) { + { t.c_str() } -> std::same_as; + }; + + template + concept HasPath = requires(T& t) { + { t.path() } -> std::same_as; + }; + + template + concept LoadCallback = + requires(T&& t, Self&& self, Args&&... args) { + { std::invoke(std::forward(t), std::forward(self), std::forward(args)...) } -> std::same_as; + }; + + template + concept IsEncoding = requires(T t) { + typename T::char_type; + typename T::int_type; + { T::template is_secondary_char_type() } -> std::same_as; + { T::eof() } -> std::same_as; + { T::to_int_type(typename T::char_type {}) } -> std::same_as; + }; + + template + concept Invocable_R = std::invocable && requires(Args&&... args) { + { invoke(forward(args)...) 
} -> std::convertible_to; + }; +} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/Constexpr.hpp b/include/openvic-dataloader/detail/Constexpr.hpp new file mode 100644 index 0000000..49479c5 --- /dev/null +++ b/include/openvic-dataloader/detail/Constexpr.hpp @@ -0,0 +1,15 @@ +#pragma once + +// THANK YOU APPLE FOR YOUR UTTER DISREGARD FOR C++20 + +#if __cpp_lib_optional >= 202106L +#define OVDL_OPTIONAL_CONSTEXPR constexpr +#else +#define OVDL_OPTIONAL_CONSTEXPR inline +#endif + +#if __cpp_lib_constexpr_vector >= 201907L +#define OVDL_VECTOR_CONSTEXPR constexpr +#else +#define OVDL_VECTOR_CONSTEXPR inline +#endif \ No newline at end of file diff --git a/include/openvic-dataloader/detail/Encoding.hpp b/include/openvic-dataloader/detail/Encoding.hpp new file mode 100644 index 0000000..12a0524 --- /dev/null +++ b/include/openvic-dataloader/detail/Encoding.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace ovdl::detail { + enum class Encoding : std::int8_t { + Unknown, + Ascii, + Utf8, + Windows1251, + Windows1252 + }; +} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/ErrorRange.hpp b/include/openvic-dataloader/detail/ErrorRange.hpp new file mode 100644 index 0000000..7d5ca13 --- /dev/null +++ b/include/openvic-dataloader/detail/ErrorRange.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include + +#include + +namespace ovdl::detail { + template + using error_range = decltype(std::declval()->errors()); +} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/LexyFwdDeclaration.hpp b/include/openvic-dataloader/detail/LexyFwdDeclaration.hpp deleted file mode 100644 index 554c88d..0000000 --- a/include/openvic-dataloader/detail/LexyFwdDeclaration.hpp +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -namespace lexy { - struct default_encoding; - - template - struct buffer; -} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/LexyReportError.hpp 
b/include/openvic-dataloader/detail/LexyReportError.hpp deleted file mode 100644 index 3c32bd1..0000000 --- a/include/openvic-dataloader/detail/LexyReportError.hpp +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include "openvic-dataloader/detail/utility/Concepts.hpp" - -#include - -namespace ovdl::detail { - template - struct _ReportError { - OutputIterator _iter; - lexy::visualization_options _opts; - const char* _path; - - struct _sink { - OutputIterator _iter; - lexy::visualization_options _opts; - const char* _path; - std::size_t _count; - std::vector _errors; - - using return_type = std::vector; - - template - void operator()(const lexy::error_context& context, const lexy::error& error) { - _iter = lexy_ext::_detail::write_error(_iter, context, error, _opts, _path); - ++_count; - - // Convert the context location and error location into line/column information. - auto context_location = lexy::get_input_location(context.input(), context.position()); - auto location = lexy::get_input_location(context.input(), error.position(), context_location.anchor()); - - std::basic_stringstream message; - - // Write the main annotation. 
- if constexpr (std::is_same_v) { - auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); - - message << "expected '" << string.data() << '\''; - } else if constexpr (std::is_same_v) { - auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); - - message << "expected keyword '" << string.data() << '\''; - } else if constexpr (std::is_same_v) { - message << "expected " << error.name(); - } else { - message << error.message(); - } - - _errors.push_back( - ParseError { - ParseError::Type::Fatal, // TODO: distinguish recoverable errors from fatal errors - std::move(message.str()), - 0, // TODO: implement proper error codes - ParseData { - context.production(), - context_location.line_nr(), - context_location.column_nr(), - }, - location.line_nr(), - location.column_nr(), - }); - } - - return_type finish() && { - if (_count != 0) - *_iter++ = '\n'; - return _errors; - } - }; - constexpr auto sink() const { - return _sink { _iter, _opts, _path, 0 }; - } - - /// Specifies a path that will be printed alongside the diagnostic. - constexpr _ReportError path(const char* path) const { - return { _iter, _opts, path }; - } - - constexpr _ReportError path(const detail::HasCstr auto& path_object) const { - return path(path_object.c_str()); - } - - /// Specifies an output iterator where the errors are written to. - template - constexpr _ReportError to(OI out) const { - return { out, _opts, _path }; - } - - /// Overrides visualization options. 
- constexpr _ReportError opts(lexy::visualization_options opts) const { - return { _iter, opts, _path }; - } - }; - - constexpr auto ReporError = _ReportError {}; -} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/OStreamOutputIterator.hpp b/include/openvic-dataloader/detail/OStreamOutputIterator.hpp index 8f120c7..81f6c89 100644 --- a/include/openvic-dataloader/detail/OStreamOutputIterator.hpp +++ b/include/openvic-dataloader/detail/OStreamOutputIterator.hpp @@ -1,6 +1,5 @@ #pragma once -#include #include namespace ovdl::detail { diff --git a/include/openvic-dataloader/detail/Utility.hpp b/include/openvic-dataloader/detail/Utility.hpp new file mode 100644 index 0000000..8d9e159 --- /dev/null +++ b/include/openvic-dataloader/detail/Utility.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace ovdl::detail { + [[noreturn]] inline void unreachable() { + // Uses compiler specific extensions if possible. + // Even if no extension is used, undefined behavior is still raised by + // an empty function body and the noreturn attribute. 
+#ifdef __GNUC__ // GCC, Clang, ICC + __builtin_unreachable(); +#elif defined(_MSC_VER) // MSVC + __assume(false); +#endif + } + + template + requires std::is_enum_v + constexpr std::underlying_type_t to_underlying(EnumT e) { + return static_cast>(e); + } + + template + requires std::is_enum_v + constexpr EnumT from_underlying(std::underlying_type_t ut) { + return static_cast(ut); + } + + template + struct TypeRegister { + using tuple_type = std::tuple; + using variant_type = std::variant; + + template + struct _id_getter { + static constexpr std::uint32_t type_id() { + static_assert(any_of, "Cannot query an non-registered type"); + + if constexpr (std::is_same_v) return 0; + else return 1 + TypeRegister::template _id_getter::type_id(); + }; + }; + + template + static constexpr std::uint32_t type_id() { + + return _id_getter::type_id(); + } + + template + using type_by_id = std::tuple_element_t; + }; + + template + struct type_concat; + + template typename TT, typename... TTs> + struct type_concat, Ts...> { + using type = TT; + }; + + template + using type_concat_t = type_concat::type; + + template + struct type_prepend; + + template typename TT, typename... 
TTs> + struct type_prepend, Ts...> { + using type = TT; + }; + + template + using type_prepend_t = type_prepend::type; + + template typename Template> + struct is_instance_of : std::false_type {}; + + template typename Template> + struct is_instance_of, Template> : std::true_type {}; + + template typename Template> + static constexpr auto is_instance_of_v = is_instance_of::value; +} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/utility/Concepts.hpp b/include/openvic-dataloader/detail/utility/Concepts.hpp deleted file mode 100644 index 0ba91cc..0000000 --- a/include/openvic-dataloader/detail/utility/Concepts.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace ovdl { - struct NodeLocation; - struct File; - namespace detail { - enum class buffer_error : std::uint8_t; - } -} - -namespace ovdl::detail { - template - concept any_of = (std::same_as || ...); - - template - concept HasCstr = - requires(T t) { - { t.c_str() } -> std::same_as; - }; - - template - concept HasPath = requires(T& t) { - { t.path() } -> std::same_as; - }; - - template - concept LoadCallback = - requires(T&& t, Self&& self, Args&&... args) { - { std::invoke(std::forward(t), std::forward(self), std::forward(args)...) 
} -> std::same_as; - }; - - template - concept IsEncoding = requires(T t) { - typename T::char_type; - typename T::int_type; - { T::template is_secondary_char_type() } -> std::same_as; - { T::eof() } -> std::same_as; - { T::to_int_type(typename T::char_type {}) } -> std::same_as; - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/utility/Constexpr.hpp b/include/openvic-dataloader/detail/utility/Constexpr.hpp deleted file mode 100644 index 49479c5..0000000 --- a/include/openvic-dataloader/detail/utility/Constexpr.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -// THANK YOU APPLE FOR YOUR UTTER DISREGARD FOR C++20 - -#if __cpp_lib_optional >= 202106L -#define OVDL_OPTIONAL_CONSTEXPR constexpr -#else -#define OVDL_OPTIONAL_CONSTEXPR inline -#endif - -#if __cpp_lib_constexpr_vector >= 201907L -#define OVDL_VECTOR_CONSTEXPR constexpr -#else -#define OVDL_VECTOR_CONSTEXPR inline -#endif \ No newline at end of file diff --git a/include/openvic-dataloader/detail/utility/ErrorRange.hpp b/include/openvic-dataloader/detail/utility/ErrorRange.hpp deleted file mode 100644 index 7d5ca13..0000000 --- a/include/openvic-dataloader/detail/utility/ErrorRange.hpp +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -#include - -namespace ovdl::detail { - template - using error_range = decltype(std::declval()->errors()); -} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/utility/PointerHash.hpp b/include/openvic-dataloader/detail/utility/PointerHash.hpp deleted file mode 100644 index c0d28bc..0000000 --- a/include/openvic-dataloader/detail/utility/PointerHash.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once - -#include - -namespace ovdl::detail { - /* hash any pointer */ - template - struct PointerHash { - using type = T; - using ptr_type = T*; - using const_type = const T; - using const_ptr_type = const T*; - using const_ptr_const_type = const const_ptr_type; - constexpr std::size_t operator()(const_ptr_const_type 
pointer) const { - auto addr = reinterpret_cast(pointer); -#if SIZE_MAX < UINTPTR_MAX - /* size_t is not large enough to hold the pointer’s memory address */ - addr %= SIZE_MAX; /* truncate the address so it is small enough to fit in a size_t */ -#endif - return addr; - } - }; -} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/utility/SelfType.hpp b/include/openvic-dataloader/detail/utility/SelfType.hpp deleted file mode 100644 index 5209700..0000000 --- a/include/openvic-dataloader/detail/utility/SelfType.hpp +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include - -namespace ovdl::detail { -#if !defined(_MSC_VER) -#pragma GCC diagnostic push -#pragma clang diagnostic ignored "-Wunknown-warning-option" -#pragma GCC diagnostic ignored "-Wnon-template-friend" -#endif - template - struct Reader { - friend auto adl_GetSelfType(Reader); - }; - - template - struct Writer { - friend auto adl_GetSelfType(Reader) { return U {}; } - }; -#if !defined(_MSC_VER) -#pragma GCC diagnostic pop -#endif - - inline void adl_GetSelfType() {} - - template - using Read = std::remove_pointer_t {}))>; -} diff --git a/include/openvic-dataloader/detail/utility/TypeName.hpp b/include/openvic-dataloader/detail/utility/TypeName.hpp deleted file mode 100644 index 1a34a0f..0000000 --- a/include/openvic-dataloader/detail/utility/TypeName.hpp +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace ovdl::detail { - - template - constexpr auto substring_as_array(std::string_view str, std::index_sequence) { - return std::array { str[Idxs]... 
}; - } - - template - constexpr auto type_name_array() { -#if defined(__clang__) - constexpr auto prefix = std::string_view { "[T = " }; - constexpr auto suffix = std::string_view { "]" }; - constexpr auto function = std::string_view { __PRETTY_FUNCTION__ }; -#elif defined(__GNUC__) - constexpr auto prefix = std::string_view { "with T = " }; - constexpr auto suffix = std::string_view { "]" }; - constexpr auto function = std::string_view { __PRETTY_FUNCTION__ }; -#elif defined(_MSC_VER) - constexpr auto prefix = std::string_view { "type_name_array<" }; - constexpr auto suffix = std::string_view { ">(void)" }; - constexpr auto function = std::string_view { __FUNCSIG__ }; -#else -#error Unsupported compiler -#endif - - constexpr auto start = function.find(prefix) + prefix.size(); - constexpr auto end = function.rfind(suffix); - - static_assert(start < end); - - constexpr auto name = function.substr(start, (end - start)); - return substring_as_array(name, std::make_index_sequence {}); - } - - template - struct type_name_holder { - static inline constexpr auto value = type_name_array(); - }; - - template - constexpr auto type_name() -> std::string_view { - constexpr auto& value = type_name_holder::value; - return std::string_view { value.data(), value.size() }; - } -} \ No newline at end of file diff --git a/include/openvic-dataloader/detail/utility/Utility.hpp b/include/openvic-dataloader/detail/utility/Utility.hpp deleted file mode 100644 index 138a029..0000000 --- a/include/openvic-dataloader/detail/utility/Utility.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include -#include - -#include "openvic-dataloader/detail/utility/TypeName.hpp" - -namespace ovdl::detail { - [[noreturn]] inline void unreachable() { - // Uses compiler specific extensions if possible. - // Even if no extension is used, undefined behavior is still raised by - // an empty function body and the noreturn attribute. 
-#ifdef __GNUC__ // GCC, Clang, ICC - __builtin_unreachable(); -#elif defined(_MSC_VER) // MSVC - __assume(false); -#endif - } - - template - constexpr std::string_view get_kind_name() { - constexpr auto name = type_name(); - - return name; - } - - template - requires std::is_enum_v - constexpr std::underlying_type_t to_underlying(EnumT e) { - return static_cast>(e); - } - - template - requires std::is_enum_v - constexpr EnumT from_underlying(std::underlying_type_t ut) { - return static_cast(ut); - } -} \ No newline at end of file diff --git a/include/openvic-dataloader/v2script/AbstractSyntaxTree.hpp b/include/openvic-dataloader/v2script/AbstractSyntaxTree.hpp index 27dbfcb..29e7866 100644 --- a/include/openvic-dataloader/v2script/AbstractSyntaxTree.hpp +++ b/include/openvic-dataloader/v2script/AbstractSyntaxTree.hpp @@ -1,12 +1,10 @@ #pragma once -#include #include -#include -#include #include -#include +#include +#include #include #include @@ -82,37 +80,34 @@ namespace ovdl::v2script::ast { }; struct FlatValue : dryad::abstract_node_range { - AbstractSyntaxTree::symbol_type value() const { + SymbolIntern::symbol_type value() const { return _value; } - const char* value(const AbstractSyntaxTree::symbol_interner_type& symbols) const { + const char* value(const SymbolIntern::symbol_interner_type& symbols) const { return _value.c_str(symbols); } protected: - explicit FlatValue(dryad::node_ctor ctor, NodeKind kind, AbstractSyntaxTree::symbol_type value) + explicit FlatValue(dryad::node_ctor ctor, NodeKind kind, SymbolIntern::symbol_type value) : node_base(ctor, kind), _value(value) {} protected: - AbstractSyntaxTree::symbol_type _value; + SymbolIntern::symbol_type _value; }; struct IdentifierValue : dryad::basic_node { - explicit IdentifierValue(dryad::node_ctor ctor, AbstractSyntaxTree::symbol_type value) : node_base(ctor, value) {} + explicit IdentifierValue(dryad::node_ctor ctor, SymbolIntern::symbol_type value) : node_base(ctor, value) {} }; struct StringValue : 
dryad::basic_node { - explicit StringValue(dryad::node_ctor ctor, AbstractSyntaxTree::symbol_type value) : node_base(ctor, value) {} + explicit StringValue(dryad::node_ctor ctor, SymbolIntern::symbol_type value) : node_base(ctor, value) {} }; struct ListValue : dryad::basic_node> { explicit ListValue(dryad::node_ctor ctor, StatementList statements); - explicit ListValue(dryad::node_ctor ctor, AssignStatementList statements) - : node_base(ctor) { - insert_child_list_after(nullptr, statements); - } + explicit ListValue(dryad::node_ctor ctor, AssignStatementList statements); explicit ListValue(dryad::node_ctor ctor) : ListValue(ctor, StatementList {}) { } @@ -171,10 +166,7 @@ namespace ovdl::v2script::ast { struct FileTree : dryad::basic_node> { explicit FileTree(dryad::node_ctor ctor, StatementList statements); - explicit FileTree(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) { - insert_child_list_after(nullptr, statements); - } - + explicit FileTree(dryad::node_ctor ctor, AssignStatementList statements); explicit FileTree(dryad::node_ctor ctor) : FileTree(ctor, StatementList {}) { } diff --git a/include/openvic-dataloader/v2script/Parser.hpp b/include/openvic-dataloader/v2script/Parser.hpp index f9f0ce8..1f6b158 100644 --- a/include/openvic-dataloader/v2script/Parser.hpp +++ b/include/openvic-dataloader/v2script/Parser.hpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,8 +11,9 @@ #include #include #include -#include -#include +#include +#include +#include #include #include @@ -25,20 +27,20 @@ namespace ovdl::v2script { Parser(); Parser(std::basic_ostream& error_stream); - static Parser from_buffer(const char* data, std::size_t size); - static Parser from_buffer(const char* start, const char* end); - static Parser from_string(const std::string_view string); - static Parser from_file(const char* path); - static Parser from_file(const std::filesystem::path& path); + static Parser from_buffer(const char* 
data, std::size_t size, std::optional encoding_fallback = std::nullopt); + static Parser from_buffer(const char* start, const char* end, std::optional encoding_fallback = std::nullopt); + static Parser from_string(const std::string_view string, std::optional encoding_fallback = std::nullopt); + static Parser from_file(const char* path, std::optional encoding_fallback = std::nullopt); + static Parser from_file(const std::filesystem::path& path, std::optional encoding_fallback = std::nullopt); - constexpr Parser& load_from_buffer(const char* data, std::size_t size); - constexpr Parser& load_from_buffer(const char* start, const char* end); - constexpr Parser& load_from_string(const std::string_view string); - Parser& load_from_file(const char* path); - Parser& load_from_file(const std::filesystem::path& path); + constexpr Parser& load_from_buffer(const char* data, std::size_t size, std::optional encoding_fallback = std::nullopt); + constexpr Parser& load_from_buffer(const char* start, const char* end, std::optional encoding_fallback = std::nullopt); + constexpr Parser& load_from_string(const std::string_view string, std::optional encoding_fallback = std::nullopt); + Parser& load_from_file(const char* path, std::optional encoding_fallback = std::nullopt); + Parser& load_from_file(const std::filesystem::path& path, std::optional encoding_fallback = std::nullopt); - constexpr Parser& load_from_file(const detail::HasCstr auto& path) { - return load_from_file(path.c_str()); + constexpr Parser& load_from_file(const detail::HasCstr auto& path, std::optional encoding_fallback = std::nullopt) { + return load_from_file(path.c_str(), encoding_fallback); } bool simple_parse(); diff --git a/src/headless/main.cpp b/src/headless/main.cpp index 7279a6e..0ad6115 100644 --- a/src/headless/main.cpp +++ b/src/headless/main.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -41,9 +41,8 @@ bool insenitive_trim_eq(std::string_view lhs, std::string_view 
rhs) { [](char a, char b) { return std::tolower(a) == std::tolower(b); }); } -template int print_csv(const std::string_view path) { - auto parser = ovdl::csv::Parser(std::cerr); + auto parser = ovdl::csv::Parser(std::cerr); parser.load_from_file(path); if (parser.has_error()) { return 1; @@ -73,12 +72,11 @@ int print_lua(const std::string_view path, VisualizationType visual_type) { return 1; } - parser.lua_defines_parse(); - if (parser.has_error()) { + if (!parser.lua_defines_parse()) { return 2; } - if (parser.has_warning()) { + if (parser.has_error() || parser.has_warning()) { parser.print_errors_to(std::cerr); } @@ -97,12 +95,11 @@ int print_v2script_simple(const std::string_view path, VisualizationType visual_ return 1; } - parser.simple_parse(); - if (parser.has_error()) { + if (!parser.simple_parse()) { return 2; } - if (parser.has_warning()) { + if (parser.has_error() || parser.has_warning()) { parser.print_errors_to(std::cerr); } @@ -139,23 +136,18 @@ int main(int argc, char** argv) { return print_lua(args[1], type); } return print_v2script_simple(args[1], type); - case 4: - if (insenitive_trim_eq(args[1], "csv") && insenitive_trim_eq(args[2], "utf")) - return print_csv(args[3]); - goto default_jump; case 3: if (insenitive_trim_eq(args[1], "csv")) - return print_csv(args[2]); + return print_csv(args[2]); if (insenitive_trim_eq(args[1], "lua")) return print_lua(args[2], type); [[fallthrough]]; default: - default_jump: std::fprintf(stderr, "usage: %s \n", args[0].c_str()); std::fprintf(stderr, "usage: %s list \n", args[0].c_str()); std::fprintf(stderr, "usage: %s native \n", args[0].c_str()); std::fprintf(stderr, "usage: %s lua \n", args[0].c_str()); - std::fprintf(stderr, "usage: %s csv [utf] \n", args[0].c_str()); + std::fprintf(stderr, "usage: %s csv \n", args[0].c_str()); return EXIT_FAILURE; } diff --git a/src/openvic-dataloader/AbstractSyntaxTree.cpp b/src/openvic-dataloader/AbstractSyntaxTree.cpp index 11a90dc..d6f58f7 100644 --- 
a/src/openvic-dataloader/AbstractSyntaxTree.cpp +++ b/src/openvic-dataloader/AbstractSyntaxTree.cpp @@ -1,4 +1,4 @@ -#include +#include "AbstractSyntaxTree.hpp" using namespace ovdl; diff --git a/src/openvic-dataloader/AbstractSyntaxTree.hpp b/src/openvic-dataloader/AbstractSyntaxTree.hpp new file mode 100644 index 0000000..a5b8886 --- /dev/null +++ b/src/openvic-dataloader/AbstractSyntaxTree.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "detail/InternalConcepts.hpp" + +namespace ovdl { + struct AbstractSyntaxTree : SymbolIntern { + symbol_type intern(const char* str, std::size_t length); + symbol_type intern(std::string_view str); + const char* intern_cstr(const char* str, std::size_t length); + const char* intern_cstr(std::string_view str); + symbol_interner_type& symbol_interner(); + const symbol_interner_type& symbol_interner() const; + + protected: + symbol_interner_type _symbol_interner; + }; + + template RootNodeT> + struct BasicAbstractSyntaxTree : AbstractSyntaxTree { + using file_type = FileT; + using root_node_type = RootNodeT; + using node_type = typename file_type::node_type; + + explicit BasicAbstractSyntaxTree(file_type&& file) : _file { std::move(file) } {} + + template + explicit BasicAbstractSyntaxTree(lexy::buffer&& buffer) : _file { std::move(buffer) } {} + + void set_location(const node_type* n, NodeLocation loc) { + _file.set_location(n, loc); + } + + NodeLocation location_of(const node_type* n) const { + return _file.location_of(n); + } + + root_node_type* root() { + return _tree.root(); + } + + const root_node_type* root() const { + return _tree.root(); + } + + file_type& file() { + return _file; + } + + const file_type& file() const { + return _file; + } + + template + T* create(NodeLocation loc, Args&&... 
args) { + auto node = _tree.template create(DRYAD_FWD(args)...); + set_location(node, loc); + return node; + } + + template + T* create(const char* begin, const char* end, Args&&... args) { + return create(NodeLocation::make_from(begin, end), DRYAD_FWD(args)...); + } + + void set_root(root_node_type* node) { + _tree.set_root(node); + } + + protected: + dryad::tree _tree; + file_type _file; + }; +} \ No newline at end of file diff --git a/src/openvic-dataloader/DiagnosticLogger.cpp b/src/openvic-dataloader/DiagnosticLogger.cpp index aae3dcb..9fe5e93 100644 --- a/src/openvic-dataloader/DiagnosticLogger.cpp +++ b/src/openvic-dataloader/DiagnosticLogger.cpp @@ -1,4 +1,4 @@ -#include +#include "DiagnosticLogger.hpp" using namespace ovdl; @@ -9,8 +9,7 @@ DiagnosticLogger::operator bool() const { bool DiagnosticLogger::errored() const { return _errored; } bool DiagnosticLogger::warned() const { return _warned; } - NodeLocation DiagnosticLogger::location_of(const error::Error* error) const { auto result = _map.lookup(error); - return result ? *result : NodeLocation{}; + return result ? 
*result : NodeLocation {}; } \ No newline at end of file diff --git a/src/openvic-dataloader/DiagnosticLogger.hpp b/src/openvic-dataloader/DiagnosticLogger.hpp new file mode 100644 index 0000000..2a655a9 --- /dev/null +++ b/src/openvic-dataloader/DiagnosticLogger.hpp @@ -0,0 +1,492 @@ +#pragma once + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace ovdl { + template + struct BasicDiagnosticLogger; + + struct DiagnosticLogger : SymbolIntern { + using AnnotationKind = lexy_ext::annotation_kind; + using DiagnosticKind = lexy_ext::diagnostic_kind; + + using error_range = detail::error_range; + + explicit operator bool() const; + bool errored() const; + bool warned() const; + + NodeLocation location_of(const error::Error* error) const; + + template Logger> + struct ErrorCallback { + ErrorCallback(Logger& logger) : _logger(&logger) {} + + struct sink_t { + using return_type = std::size_t; + + template + void operator()(lexy::error_context const& context, lexy::error_for const& error) { + using Reader = lexy::input_reader; + using Encoding = typename Reader::encoding; + using char_type = typename Encoding::char_type; + error::Error* result; + + std::string production_name = context.production(); + auto left_strip = production_name.find_first_of('<'); + if (left_strip != std::string::npos) { + auto right_strip = production_name.find_first_of('>', left_strip); + if (right_strip != std::string::npos) { + production_name.erase(left_strip, right_strip - left_strip + 1); + } + } + + auto context_location = lexy::get_input_location(context.input(), context.position()); + auto location = lexy::get_input_location(context.input(), error.position(), context_location.anchor()); + + if constexpr (detail::is_instance_of_v) { + 
lexy_ext::diagnostic_writer impl { context.input() }; + + BasicNodeLocation loc = [&] { + if constexpr (std::is_same_v) { + return BasicNodeLocation::make_from(error.position(), error.position() + error.index() + 1); + } else if constexpr (std::is_same_v) { + return BasicNodeLocation::make_from(error.position(), error.end()); + } else if constexpr (std::is_same_v) { + return BasicNodeLocation::make_from(error.position(), error.position() + 1); + } else { + return BasicNodeLocation::make_from(error.position(), error.end()); + } + }(); + + auto writer = _logger.template parse_error(impl, loc, production_name.c_str()); + if (location.line_nr() != context_location.line_nr()) + writer.secondary(BasicNodeLocation { context.position(), lexy::_detail::next(context.position()) }, "beginning here").finish(); + + if constexpr (std::is_same_v) { + auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); + writer.primary(loc, "expected '{}'", string.data()) + .finish(); + } else if constexpr (std::is_same_v) { + auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); + writer.primary(loc, "expected keyword '{}'", string.data()) + .finish(); + } else if constexpr (std::is_same_v) { + writer.primary(loc, "expected {}", error.name()) + .finish(); + } else { + writer.primary(loc, error.message()) + .finish(); + } + result = writer.error(); + } else { + auto production = _logger.intern_cstr(production_name); + if constexpr (std::is_same_v) { + auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); + NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1); + auto message = _logger.intern_cstr(fmt::format("expected '{}'", string.data())); + result = _logger.template create(loc, message, production); + } else if constexpr (std::is_same_v) { + auto string = lexy::_detail::make_literal_lexeme(error.string(), error.length()); + NodeLocation loc = 
NodeLocation::make_from(context.position(), error.position() - 1); + auto message = _logger.intern_cstr(fmt::format("expected keyword '{}'", string.data())); + result = _logger.template create(loc, message, production); + } else if constexpr (std::is_same_v) { + auto message = _logger.intern_cstr(fmt::format("expected {}", error.name())); + result = _logger.template create(error.position(), message, production); + } else { + NodeLocation loc = NodeLocation::make_from(error.begin(), error.end()); + auto message = _logger.intern_cstr(error.message()); + result = _logger.template create(loc, message, production); + } + } + _logger.insert(result); + + _count++; + } + + std::size_t finish() && { + return _count; + } + + Logger& _logger; + std::size_t _count; + }; + + constexpr auto sink() const { + return sink_t { *_logger, 0 }; + } + + mutable Logger* _logger; + }; + + template + T* create(BasicNodeLocation loc, Args&&... args) { + using node_creator = dryad::node_creator; + T* result = _tree.create(DRYAD_FWD(args)...); + _map.insert(result, loc); + return result; + } + + template + T* create() { + using node_creator = dryad::node_creator; + T* result = _tree.create(); + return result; + } + + error_range get_errors() const { + return _tree.root()->errors(); + } + + protected: + bool _errored = false; + bool _warned = false; + dryad::node_map _map; + dryad::tree _tree; + + symbol_interner_type _symbol_interner; + + void insert(error::Error* root) { + _tree.root()->insert_back(root); + } + + public: + symbol_type intern(const char* str, std::size_t length) { + return _symbol_interner.intern(str, length); + } + symbol_type intern(std::string_view str) { + return intern(str.data(), str.size()); + } + const char* intern_cstr(const char* str, std::size_t length) { + return intern(str, length).c_str(_symbol_interner); + } + const char* intern_cstr(std::string_view str) { + return intern_cstr(str.data(), str.size()); + } + symbol_interner_type& symbol_interner() { + return 
_symbol_interner; + } + const symbol_interner_type& symbol_interner() const { + return _symbol_interner; + } + }; + + template + struct BasicDiagnosticLogger : DiagnosticLogger { + using parse_state_type = ParseState; + using file_type = typename parse_state_type::file_type; + + template + using format_str = fmt::basic_format_string...>; + + explicit BasicDiagnosticLogger(const file_type& file) + : _file(&file) { + _tree.set_root(_tree.create()); + } + + struct Writer; + + template + Writer error(format_str fmt, Args&&... args) { + return log(DiagnosticKind::error, fmt, std::forward(args)...); + } + + template + Writer warning(format_str fmt, Args&&... args) { + return log(DiagnosticKind::warning, fmt, std::forward(args)...); + } + + template + Writer note(format_str fmt, Args&&... args) { + return log(DiagnosticKind::note, fmt, std::forward(args)...); + } + + template + Writer info(format_str fmt, Args&&... args) { + return log(DiagnosticKind::info, fmt, std::forward(args)...); + } + + template + Writer debug(format_str fmt, Args&&... args) { + return log(DiagnosticKind::debug, fmt, std::forward(args)...); + } + + template + Writer fixit(format_str fmt, Args&&... args) { + return log(DiagnosticKind::fixit, fmt, std::forward(args)...); + } + + template + Writer help(format_str fmt, Args&&... 
args) { + return log(DiagnosticKind::help, fmt, std::forward(args)...); + } + + Writer error(std::string_view sv) { + return log(DiagnosticKind::error, fmt::runtime(sv)); + } + + Writer warning(std::string_view sv) { + return log(DiagnosticKind::warning, fmt::runtime(sv)); + } + + Writer note(std::string_view sv) { + return log(DiagnosticKind::note, fmt::runtime(sv)); + } + + Writer info(std::string_view sv) { + return log(DiagnosticKind::info, fmt::runtime(sv)); + } + + Writer debug(std::string_view sv) { + return log(DiagnosticKind::debug, fmt::runtime(sv)); + } + + Writer fixit(std::string_view sv) { + return log(DiagnosticKind::fixit, fmt::runtime(sv)); + } + + Writer help(std::string_view sv) { + return log(DiagnosticKind::help, fmt::runtime(sv)); + } + + auto error_callback() { + return ErrorCallback(*this); + } + + template + static void _write_to_buffer(const CharT* s, std::streamsize n, void* output_str) { + auto* output = reinterpret_cast*>(output_str); + output->append(s, n); + } + + template + auto make_callback_stream(std::basic_string& output) { + return detail::make_callback_stream(&_write_to_buffer, reinterpret_cast(&output)); + } + + template + detail::OStreamOutputIterator make_ostream_iterator(std::basic_ostream& stream) { + return detail::OStreamOutputIterator { stream }; + } + + struct Writer { + template + [[nodiscard]] Writer& primary(BasicNodeLocation loc, format_str fmt, Args&&... args) { + return annotation(AnnotationKind::primary, loc, fmt, std::forward(args)...); + } + + template + [[nodiscard]] Writer& secondary(BasicNodeLocation loc, format_str fmt, Args&&... 
args) { + return annotation(AnnotationKind::secondary, loc, fmt, std::forward(args)...); + } + + template + [[nodiscard]] Writer& primary(BasicNodeLocation loc, const char* sv) { + return annotation(AnnotationKind::primary, loc, fmt::runtime(sv)); + } + + template + [[nodiscard]] Writer& secondary(BasicNodeLocation loc, const char* sv) { + return annotation(AnnotationKind::secondary, loc, fmt::runtime(sv)); + } + + void finish() {} + + template + [[nodiscard]] Writer& annotation(AnnotationKind kind, BasicNodeLocation loc, format_str fmt, Args&&... args) { + std::basic_string output; + + _file.visit_buffer([&](auto&& buffer) { + using char_type = typename std::decay_t::encoding::char_type; + + BasicNodeLocation converted_loc = loc; + + auto begin_loc = lexy::get_input_location(buffer, converted_loc.begin()); + + auto stream = _logger.make_callback_stream(output); + auto iter = _logger.make_ostream_iterator(stream); + + lexy_ext::diagnostic_writer _impl { buffer, { lexy::visualize_fancy } }; + _impl.write_empty_annotation(iter); + _impl.write_annotation(iter, kind, begin_loc, converted_loc.end(), + [&](auto out, lexy::visualization_options) { + return lexy::_detail::write_str(out, fmt::format(fmt, std::forward(args)...).c_str()); + }); + }); + + error::Annotation* annotation; + auto message = _logger.intern_cstr(output); + switch (kind) { + case AnnotationKind::primary: + annotation = _logger.create(loc, message); + break; + case AnnotationKind::secondary: + annotation = _logger.create(loc, message); + break; + default: detail::unreachable(); + } + _annotated->push_back(annotation); + return *this; + } + + error::AnnotatedError* error() { + return _annotated; + } + + private: + Writer(BasicDiagnosticLogger& logger, const file_type& file, error::AnnotatedError* annotated) + : _file(file), + _logger(logger), + _annotated(annotated) {} + + const file_type& _file; + BasicDiagnosticLogger& _logger; + error::AnnotatedError* _annotated; + + friend BasicDiagnosticLogger; + 
}; + + template T, typename Buffer, typename... Args> + void log_with_impl(lexy_ext::diagnostic_writer& impl, T* error, DiagnosticKind kind, format_str fmt, Args&&... args) { + std::basic_string output; + auto stream = make_callback_stream(output); + auto iter = make_ostream_iterator(stream); + + impl.write_message(iter, kind, + [&](auto out, lexy::visualization_options) { + return lexy::_detail::write_str(out, fmt::format(fmt, std::forward(args)...).c_str()); + }); + impl.write_path(iter, file().path()); + + auto message = intern_cstr(output); + error->_set_message(message); + if (!error->is_linked_in_tree()) + insert(error); + } + + template + Writer parse_error(lexy_ext::diagnostic_writer& impl, NodeLocation loc, const char* production_name) { + std::basic_string output; + auto stream = make_callback_stream(output); + auto iter = make_ostream_iterator(stream); + + impl.write_message(iter, DiagnosticKind::error, + [&](auto out, lexy::visualization_options) { + return lexy::_detail::write_str(out, fmt::format("while parsing {}", production_name).c_str()); + }); + impl.write_path(iter, file().path()); + + auto production = intern_cstr(production_name); + auto message = intern_cstr(output); + auto* error = [&] { + if constexpr (std::is_same_v) { + return create(loc, message, production); + } else if constexpr (std::is_same_v) { + return create(loc, message, production); + } else if constexpr (std::is_same_v) { + return create(loc, message, production); + } else { + return create(loc, message, production); + } + }(); + + Writer result(*this, file(), error); + _errored = true; + + return result; + } + + template T, typename... Args> + void log_with_error(T* error, DiagnosticKind kind, format_str fmt, Args&&... args) { + file().visit_buffer( + [&](auto&& buffer) { + lexy_ext::diagnostic_writer impl { buffer }; + log_with_impl(impl, error, kind, fmt, std::forward(args)...); + }); + } + + template T, typename... 
Args> + void create_log(DiagnosticKind kind, format_str fmt, Args&&... args) { + log_with_error(create(), kind, fmt, std::forward(args)...); + } + + template + Writer log(DiagnosticKind kind, format_str fmt, Args&&... args) { + error::Semantic* semantic; + + switch (kind) { + case DiagnosticKind::error: + semantic = create(); + break; + case DiagnosticKind::warning: + semantic = create(); + break; + case DiagnosticKind::info: + semantic = create(); + break; + case DiagnosticKind::debug: + semantic = create(); + break; + case DiagnosticKind::fixit: + semantic = create(); + break; + case DiagnosticKind::help: + semantic = create(); + break; + default: detail::unreachable(); + } + + Writer result(*this, file(), semantic); + + file().visit_buffer([&](auto&& buffer) { + lexy_ext::diagnostic_writer impl { buffer }; + log_with_impl(impl, semantic, kind, fmt, std::forward(args)...); + }); + + if (kind == DiagnosticKind::error) + _errored = true; + if (kind == DiagnosticKind::warning) + _warned = true; + + return result; + } + + const auto& file() const { + return *_file; + } + + private: + const file_type* _file; + }; +} \ No newline at end of file diff --git a/src/openvic-dataloader/File.cpp b/src/openvic-dataloader/File.cpp index 9b27bf0..e4d3773 100644 --- a/src/openvic-dataloader/File.cpp +++ b/src/openvic-dataloader/File.cpp @@ -1,4 +1,10 @@ -#include +#include "File.hpp" + +#include + +#include + +#include using namespace ovdl; @@ -6,4 +12,8 @@ File::File(const char* path) : _path(path) {} const char* File::path() const noexcept { return _path; +} + +bool File::is_valid() const noexcept { + return _buffer.index() != 0 && !_buffer.valueless_by_exception() && visit_buffer([](auto&& buffer) { return buffer.data() != nullptr; }); } \ No newline at end of file diff --git a/src/openvic-dataloader/File.hpp b/src/openvic-dataloader/File.hpp new file mode 100644 index 0000000..90fcb11 --- /dev/null +++ b/src/openvic-dataloader/File.hpp @@ -0,0 +1,139 @@ +#pragma once + 
+#include +#include // IWYU pragma: keep +#include +#include + +#include +#include + +#include +#include + +#include + +namespace ovdl { + struct File { + using buffer_ids = detail::TypeRegister< + lexy::buffer, + lexy::buffer, + lexy::buffer, + lexy::buffer, + lexy::buffer, + lexy::buffer>; + + explicit File(const char* path); + + const char* path() const noexcept; + + bool is_valid() const noexcept; + + template + constexpr bool is_buffer() const { + return buffer_ids::type_id>() + 1 == _buffer.index(); + } + + template + lexy::buffer* try_get_buffer_as() { + return std::get_if>(&_buffer); + } + + template + const lexy::buffer* try_get_buffer_as() const { + return std::get_if>(&_buffer); + } + + template + lexy::buffer& get_buffer_as() { + assert((is_buffer())); + return *std::get_if>(&_buffer); + } + + template + const lexy::buffer& get_buffer_as() const { + assert((is_buffer())); + return *std::get_if>(&_buffer); + } + +#define SWITCH_LIST \ + X(1) \ + X(2) \ + X(3) \ + X(4) \ + X(5) \ + X(6) + +#define X(NUM) \ + case NUM: \ + return visitor(std::get(_buffer)); + + template + decltype(auto) visit_buffer(Visitor&& visitor) { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } + + template + Return visit_buffer(Visitor&& visitor) { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } + + template + decltype(auto) visit_buffer(Visitor&& visitor) const { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } + + template + Return visit_buffer(Visitor&& visitor) const { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } +#undef X +#undef SWITCH_LIST + + protected: + const char* _path; + detail::type_prepend_t _buffer; + }; + + template + struct BasicFile : File { + using node_type = NodeT; + + template + explicit BasicFile(const char* path, lexy::buffer&& buffer) + : File(path) { + _buffer = static_cast&&>(buffer); 
+ } + + template + explicit BasicFile(lexy::buffer&& buffer) + : File("") { + _buffer = static_cast&&>(buffer); + } + + void set_location(const node_type* n, NodeLocation loc) { + _map.insert(n, loc); + } + + NodeLocation location_of(const node_type* n) const { + auto result = _map.lookup(n); + DRYAD_ASSERT(result != nullptr, "every Node should have a NodeLocation"); + return *result; + } + + protected: + dryad::node_map _map; + }; +} \ No newline at end of file diff --git a/src/openvic-dataloader/NodeLocation.cpp b/src/openvic-dataloader/NodeLocation.cpp deleted file mode 100644 index 9e4f669..0000000 --- a/src/openvic-dataloader/NodeLocation.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include - -using namespace ovdl; - -NodeLocation::NodeLocation() = default; -NodeLocation::NodeLocation(const char* pos) : _begin(pos), - _end(pos) {} -NodeLocation::NodeLocation(const char* begin, const char* end) : _begin(begin), - _end(end) {} - -NodeLocation::NodeLocation(const NodeLocation&) noexcept = default; -NodeLocation& NodeLocation::operator=(const NodeLocation&) = default; - -NodeLocation::NodeLocation(NodeLocation&&) = default; -NodeLocation& NodeLocation::operator=(NodeLocation&&) = default; - -const char* NodeLocation::begin() const { return _begin; } -const char* NodeLocation::end() const { return _end; } - -bool NodeLocation::is_synthesized() const { return _begin == nullptr && _end == nullptr; } - -NodeLocation NodeLocation::make_from(const char* begin, const char* end) { - end++; - if (begin >= end) return NodeLocation(begin); - return NodeLocation(begin, end); -} diff --git a/src/openvic-dataloader/ParseState.hpp b/src/openvic-dataloader/ParseState.hpp new file mode 100644 index 0000000..806829c --- /dev/null +++ b/src/openvic-dataloader/ParseState.hpp @@ -0,0 +1,105 @@ +#pragma once + +#include + +#include + +#include +#include + +#include + +#include "DiagnosticLogger.hpp" +#include "detail/InternalConcepts.hpp" + +namespace ovdl { + struct BasicParseState { + 
explicit BasicParseState(detail::Encoding encoding = detail::Encoding::Unknown) : _encoding(encoding) {} + + detail::Encoding encoding() const { + return _encoding; + } + + protected: + detail::Encoding _encoding; + }; + + template + struct ParseState : BasicParseState { + using ast_type = AstT; + using file_type = typename ast_type::file_type; + using diagnostic_logger_type = BasicDiagnosticLogger; + + ParseState(typename ast_type::file_type&& file, detail::Encoding encoding) + : _ast { std::move(file) }, + _logger { this->ast().file() }, + BasicParseState(encoding) {} + + template + ParseState(lexy::buffer&& buffer, detail::Encoding encoding) + : ParseState(typename ast_type::file_type { std::move(buffer) }, encoding) {} + + template + ParseState(const char* path, lexy::buffer&& buffer, detail::Encoding encoding) + : ParseState(typename ast_type::file_type { path, std::move(buffer) }, encoding) {} + + ast_type& ast() { + return _ast; + } + + const ast_type& ast() const { + return _ast; + } + + diagnostic_logger_type& logger() { + return _logger; + } + + const diagnostic_logger_type& logger() const { + return _logger; + } + + private: + ast_type _ast; + diagnostic_logger_type _logger; + }; + + template + struct FileParseState : BasicParseState { + using file_type = FileT; + using diagnostic_logger_type = BasicDiagnosticLogger; + + FileParseState(file_type&& file, detail::Encoding encoding) + : _file { std::move(file) }, + _logger { this->file() }, + BasicParseState(encoding) {} + + template + FileParseState(lexy::buffer&& buffer, detail::Encoding encoding) + : FileParseState(file_type { std::move(buffer) }, encoding) {} + + template + FileParseState(const char* path, lexy::buffer&& buffer, detail::Encoding encoding) + : FileParseState(file_type { path, std::move(buffer) }, encoding) {} + + file_type& file() { + return _file; + } + + const file_type& file() const { + return _file; + } + + diagnostic_logger_type& logger() { + return _logger; + } + + const 
diagnostic_logger_type& logger() const { + return _logger; + } + + private: + file_type _file; + diagnostic_logger_type _logger; + }; +} \ No newline at end of file diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp index 5451f26..19aee54 100644 --- a/src/openvic-dataloader/csv/CsvGrammar.hpp +++ b/src/openvic-dataloader/csv/CsvGrammar.hpp @@ -9,22 +9,20 @@ #include #include +#include #include +#include #include +#include +#include +#include +#include "detail/Convert.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" // Grammar Definitions // namespace ovdl::csv::grammar { - using EncodingType = ovdl::csv::EncodingType; - - template - concept ParseChars = requires() { - { T::character }; - { T::control }; - }; - - template struct ParseOptions { /// @brief Seperator character char SepChar; @@ -33,12 +31,34 @@ namespace ovdl::csv::grammar { /// @brief Paradox-style localization escape characters /// @note Is ignored if SupportStrings is true char EscapeChar; + }; - static constexpr auto parse_chars = T {}; - static constexpr auto character = parse_chars.character; - static constexpr auto control = parse_chars.control; + struct ConvertErrorHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) { + state.logger().warning("invalid character value '{}' found", static_cast(reader.peek())) // + .primary(BasicNodeLocation { reader.position() }, "here") + .finish(); + } }; + constexpr bool IsUtf8(auto encoding) { + return std::same_as, lexy::utf8_char_encoding>; + } + + template + constexpr auto convert_as_string = convert::convert_as_string< + String, + ConvertErrorHandler>; + + constexpr auto ansi_character = lexy::dsl::ascii::character / dsl::lit_b_range<0x80, 0xFF>; + constexpr auto ansi_control = + lexy::dsl::ascii::control / + lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / + lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>; + + 
constexpr auto utf_character = lexy::dsl::unicode::character; + constexpr auto utf_control = lexy::dsl::unicode::control; + constexpr auto escaped_symbols = lexy::symbol_table // .map<'"'>('"') .map<'\''>('\'') @@ -55,38 +75,95 @@ namespace ovdl::csv::grammar { template struct CsvGrammar { - struct StringValue { - static constexpr auto rule = [] { - // Arbitrary code points - auto c = Options.character - Options.control; + struct StringValue : lexy::scan_production, + lexy::token_production { + + template + static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsFileParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + // Arbitrary code points + auto c = [] { + if constexpr (std::same_as || std::same_as) { + return ansi_character - ansi_control; + } else { + return utf_character - utf_control; + } + }(); - auto back_escape = lexy::dsl::backslash_escape // - .symbol(); + auto back_escape = lexy::dsl::backslash_escape // + .symbol(); - auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // - .template symbol(); + auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // + .template symbol(); - return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); - }(); + return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); + }(); + + lexy::scan_result str_result = scanner.template parse(rule); + if (!scanner || !str_result) + return lexy::scan_failed; + return str_result.value(); + } - static constexpr auto value = lexy::as_string; + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::lit_c<'"'>) >> lexy::dsl::scan; + + static constexpr auto value = convert_as_string >> lexy::forward; }; - struct PlainValue { - static constexpr auto rule = [] { + struct PlainValue : lexy::scan_production, + 
lexy::token_production { + + template + static constexpr auto _escape_check = character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline); + + template + static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsFileParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + constexpr auto character = [] { + if constexpr (std::same_as || std::same_as) { + return ansi_character; + } else { + return utf_character; + } + }(); + + if constexpr (Options.SupportStrings) { + return lexy::dsl::identifier(character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline)); + } else { + auto escape_check_char = _escape_check; + auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; + auto id_segment = lexy::dsl::identifier(id_check_char); + auto escape_segement = lexy::dsl::token(escape_check_char); + auto escape_sym = lexy::dsl::symbol(escape_segement); + auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; + return lexy::dsl::list(id_segment | escape_rule); + } + }(); + if constexpr (Options.SupportStrings) { - return lexy::dsl::identifier(Options.character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline)); + auto lexeme_result = scanner.template parse>(rule); + if (!scanner || !lexeme_result) + return lexy::scan_failed; + return std::string { lexeme_result.value().begin(), lexeme_result.value().end() }; } else { - auto escape_check_char = Options.character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline); - auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; - auto id_segment = lexy::dsl::identifier(id_check_char); - auto escape_segement = lexy::dsl::token(escape_check_char); - auto escape_sym = lexy::dsl::symbol(escape_segement); - auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; - return lexy::dsl::list(id_segment | escape_rule); + lexy::scan_result str_result = scanner.template parse(rule); + if (!scanner || !str_result) + return lexy::scan_failed; + return str_result.value(); } - 
}(); - static constexpr auto value = lexy::as_string; + } + + static constexpr auto rule = + dsl::peek( + _escape_check, + _escape_check) >> + lexy::dsl::scan; + + static constexpr auto value = convert_as_string >> lexy::forward; }; struct Value { @@ -114,17 +191,17 @@ namespace ovdl::csv::grammar { static constexpr auto rule = lexy::dsl::list(lexy::dsl::p, lexy::dsl::trailing_sep(lexy::dsl::p)); static constexpr auto value = lexy::fold_inplace( std::initializer_list {}, - [](ovdl::csv::LineObject& result, auto&& arg) { - if constexpr (std::is_same_v, std::size_t>) { - // Count seperators, adds to previous value, making it a position - using position_type = ovdl::csv::LineObject::position_type; - result.emplace_back(static_cast(arg + result.back().first), ""); + [](ovdl::csv::LineObject& result, std::size_t&& arg) { + // Count seperators, adds to previous value, making it a position + using position_type = ovdl::csv::LineObject::position_type; + result.emplace_back(static_cast(arg + result.back().first), ""); + }, + [](ovdl::csv::LineObject& result, std::string&& arg) { + if (result.empty()) { + result.emplace_back(0u, LEXY_MOV(arg)); } else { - if (result.empty()) result.emplace_back(0u, LEXY_MOV(arg)); - else { - auto& [pos, value] = result.back(); - value = arg; - } + auto& [pos, value] = result.back(); + value = LEXY_MOV(arg); } }); }; @@ -169,74 +246,17 @@ namespace ovdl::csv::grammar { static constexpr auto value = lexy::as_list>; }; - template - using CommaFile = File { ',', false, '$' }>; - template - using ColonFile = File { ':', false, '$' }>; - template - using SemiColonFile = File { ';', false, '$' }>; - template - using TabFile = File { '\t', false, '$' }>; - template - using BarFile = File { '|', false, '$' }>; - - namespace strings { - template - using CommaFile = File { ',', true, '$' }>; - template - using ColonFile = File { ':', true, '$' }>; - template - using SemiColonFile = File { ';', true, '$' }>; - template - using TabFile = File { '\t', 
true, '$' }>; - template - using BarFile = File { '|', true, '$' }>; - } -} - -namespace ovdl::csv::grammar::windows1252 { - struct windows1252_t { - static constexpr auto character = dsl::make_range<0x01, 0xFF>(); - static constexpr auto control = - lexy::dsl::ascii::control / - lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / - lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>; - }; - - using CommaFile = CommaFile; - using ColonFile = ColonFile; - using SemiColonFile = SemiColonFile; - using TabFile = TabFile; - using BarFile = BarFile; - - namespace strings { - using CommaFile = grammar::strings::CommaFile; - using ColonFile = grammar::strings::ColonFile; - using SemiColonFile = grammar::strings::SemiColonFile; - using TabFile = grammar::strings::TabFile; - using BarFile = grammar::strings::BarFile; - - } -} - -namespace ovdl::csv::grammar::utf8 { - struct unicode_t { - static constexpr auto character = lexy::dsl::unicode::character; - static constexpr auto control = lexy::dsl::unicode::control; - }; - - using CommaFile = CommaFile; - using ColonFile = ColonFile; - using SemiColonFile = SemiColonFile; - using TabFile = TabFile; - using BarFile = BarFile; + using CommaFile = File; + using ColonFile = File; + using SemiColonFile = File; + using TabFile = File; + using BarFile = File; namespace strings { - using CommaFile = grammar::strings::CommaFile; - using ColonFile = grammar::strings::ColonFile; - using SemiColonFile = grammar::strings::SemiColonFile; - using TabFile = grammar::strings::TabFile; - using BarFile = grammar::strings::BarFile; - + using CommaFile = File; + using ColonFile = File; + using SemiColonFile = File; + using TabFile = File; + using BarFile = File; } } \ No newline at end of file diff --git a/src/openvic-dataloader/csv/CsvParseState.hpp b/src/openvic-dataloader/csv/CsvParseState.hpp index 2390453..ee60c34 100644 --- a/src/openvic-dataloader/csv/CsvParseState.hpp +++ b/src/openvic-dataloader/csv/CsvParseState.hpp @@ 
-1,28 +1,16 @@ #pragma once -#include -#include #include #include #include -template -struct LexyEncodingFrom { -}; +#include "File.hpp" +#include "ParseState.hpp" +#include "detail/InternalConcepts.hpp" -template<> -struct LexyEncodingFrom { - using encoding = lexy::default_encoding; -}; +namespace ovdl::csv { + using CsvParseState = ovdl::FileParseState>>; -template<> -struct LexyEncodingFrom { - using encoding = lexy::utf8_char_encoding; -}; - -template -using CsvFile = ovdl::BasicFile::encoding, std::vector>; - -template -using CsvParseState = ovdl::FileParseState>; \ No newline at end of file + static_assert(detail::IsFileParseState, "CsvParseState failed IsFileParseState concept"); +} \ No newline at end of file diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp index 361f6ad..5dbee32 100644 --- a/src/openvic-dataloader/csv/Parser.cpp +++ b/src/openvic-dataloader/csv/Parser.cpp @@ -1,11 +1,14 @@ +#include +#include +#include #include -#include +#include #include #include -#include +#include #include -#include +#include #include #include @@ -22,15 +25,27 @@ using namespace ovdl::csv; /// ParseHandler /// -template -struct Parser::ParseHandler final : detail::BasicFileParseHandler> { +struct Parser::ParseHandler final : detail::BasicFileParseHandler { template std::optional parse() { - auto result = lexy::parse(this->buffer(), *this->_parse_state, this->_parse_state->logger().error_callback()); + auto result = [&] { + switch (parse_state().encoding()) { + using enum detail::Encoding; + case Ascii: + case Utf8: + return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); + case Unknown: + case Windows1251: + case Windows1252: + return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); + default: + ovdl::detail::unreachable(); + } + }(); if (!result) { - return this->_parse_state->logger().get_errors(); + return this->parse_state().logger().get_errors(); } - _lines = 
std::move(result.value()); + _lines = LEXY_MOV(result).value(); return std::nullopt; } @@ -42,55 +57,45 @@ private: std::vector _lines; }; -/// BufferHandler /// +/// ParserHandler /// -template -Parser::Parser() +Parser::Parser() : _parse_handler(std::make_unique()) { set_error_log_to_null(); } -template -Parser::Parser(std::basic_ostream& error_stream) +Parser::Parser(std::basic_ostream& error_stream) : _parse_handler(std::make_unique()) { set_error_log_to(error_stream); } -template -Parser::Parser(Parser&&) = default; -template -Parser& Parser::operator=(Parser&&) = default; -template -Parser::~Parser() = default; +Parser::Parser(Parser&&) = default; +Parser& Parser::operator=(Parser&&) = default; +Parser::~Parser() = default; -template -Parser Parser::from_buffer(const char* data, std::size_t size) { +Parser Parser::from_buffer(const char* data, std::size_t size, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(data, size)); + return std::move(result.load_from_buffer(data, size, encoding_fallback)); } -template -Parser Parser::from_buffer(const char* start, const char* end) { +Parser Parser::from_buffer(const char* start, const char* end, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(start, end)); + return std::move(result.load_from_buffer(start, end, encoding_fallback)); } -template -Parser Parser::from_string(const std::string_view string) { +Parser Parser::from_string(const std::string_view string, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_string(string)); + return std::move(result.load_from_string(string, encoding_fallback)); } -template -Parser Parser::from_file(const char* path) { +Parser Parser::from_file(const char* path, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } -template -Parser 
Parser::from_file(const std::filesystem::path& path) { +Parser Parser::from_file(const std::filesystem::path& path, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } /// @@ -106,9 +111,8 @@ Parser Parser::from_file(const std::filesystem::path& path) /// @param func /// @param args /// -template template -constexpr void Parser::_run_load_func(detail::LoadCallback auto func, Args... args) { +constexpr void Parser::_run_load_func(detail::LoadCallback auto func, Args... args) { _has_fatal_error = false; auto error = func(_parse_handler.get(), std::forward(args)...); auto error_message = _parse_handler->make_error_from(error); @@ -122,82 +126,66 @@ constexpr void Parser::_run_load_func(detail::LoadCallback -constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size) { +constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional encoding_fallback) { // Type can't be deduced? - _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback); return *this; } -template -constexpr Parser& Parser::load_from_buffer(const char* start, const char* end) { +constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional encoding_fallback) { // Type can't be deduced? 
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback); return *this; } -template -constexpr Parser& Parser::load_from_string(const std::string_view string) { - return load_from_buffer(string.data(), string.size()); +constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional encoding_fallback) { + return load_from_buffer(string.data(), string.size(), encoding_fallback); } -template -Parser& Parser::load_from_file(const char* path) { +Parser& Parser::load_from_file(const char* path, std::optional encoding_fallback) { set_file_path(path); // Type can be deduced?? - _run_load_func(std::mem_fn(&ParseHandler::load_file), path); + _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback); return *this; } -template -Parser& Parser::load_from_file(const std::filesystem::path& path) { - return load_from_file(path.string().c_str()); +Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional encoding_fallback) { + return load_from_file(path.string().c_str(), encoding_fallback); } -template -bool Parser::parse_csv(bool handle_strings) { +bool Parser::parse_csv(bool handle_strings) { if (!_parse_handler->is_valid()) { return false; } - std::optional::error_range> errors; - // auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream }); - if constexpr (Encoding == EncodingType::Windows1252) { + std::optional errors = [&] { if (handle_strings) - errors = _parse_handler->template parse(); + return _parse_handler->template parse(); else - errors = _parse_handler->template parse(); - } else { - if (handle_strings) - errors = _parse_handler->template parse(); - else - errors = _parse_handler->template parse(); - } + return _parse_handler->template parse(); + }(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = 
_parse_handler->parse_state().logger().warned(); if (!errors->empty()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); } return false; } - _lines = std::move(_parse_handler->get_lines()); return true; } -template -const std::vector& Parser::get_lines() const { - return _lines; +const std::vector& Parser::get_lines() const { + return _parse_handler->get_lines(); } -template -typename Parser::error_range Parser::get_errors() const { +typename Parser::error_range Parser::get_errors() const { return _parse_handler->parse_state().logger().get_errors(); } -template -const FilePosition Parser::get_error_position(const error::Error* error) const { +const FilePosition Parser::get_error_position(const error::Error* error) const { if (!error || !error->is_linked_in_tree()) { return {}; } @@ -206,18 +194,27 @@ const FilePosition Parser::get_error_position(const error::Error* erro return {}; } - auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin()); - FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; - if (err_location.begin() < err_location.end()) { - auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor()); - result.end_line = loc_end.line_nr(); - result.end_column = loc_end.column_nr(); - } - return result; +// TODO: Remove reinterpret_cast +// WARNING: This almost certainly breaks on utf16 and utf32 encodings, luckily we don't parse in that format +// This is purely to silence the node_location errors because char8_t is useless +#define REINTERPRET_IT(IT) reinterpret_cast::encoding::char_type*>((IT)) + + return _parse_handler->parse_state().file().visit_buffer( + [&](auto&& buffer) -> FilePosition { + auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin())); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), 
loc_begin.column_nr(), loc_begin.column_nr() }; + if (err_location.begin() < err_location.end()) { + auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor()); + result.end_line = loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; + }); + +#undef REINTERPRET_IT } -template -void Parser::print_errors_to(std::basic_ostream& stream) const { +void Parser::print_errors_to(std::basic_ostream& stream) const { auto errors = get_errors(); if (errors.empty()) return; for (const auto error : errors) { @@ -226,19 +223,9 @@ void Parser::print_errors_to(std::basic_ostream& stream) const { [&](const error::BufferError* buffer_error) { stream << "buffer error: " << buffer_error->message() << '\n'; }, - [&](const error::ParseError* parse_error) { - auto position = get_error_position(parse_error); - std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n'; - }, - [&](dryad::child_visitor visitor, const error::Semantic* semantic) { - auto position = get_error_position(semantic); - std::string pos_str = ": "; - if (!position.is_empty()) { - pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - } - stream << _file_path << pos_str << semantic->message() << '\n'; - auto annotations = semantic->annotations(); + [&](dryad::child_visitor visitor, const error::AnnotatedError* annotated_error) { + stream << annotated_error->message() << '\n'; + auto annotations = annotated_error->annotations(); for (auto annotation : annotations) { visitor(annotation); } @@ -250,7 +237,4 @@ void Parser::print_errors_to(std::basic_ostream& stream) const { stream << secondary->message() << '\n'; }); } -} - -template class ovdl::csv::Parser; -template class ovdl::csv::Parser; \ No newline at end of file +} \ No newline at end of file diff --git 
a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp new file mode 100644 index 0000000..5d9fca0 --- /dev/null +++ b/src/openvic-dataloader/detail/Convert.hpp @@ -0,0 +1,577 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "openvic-dataloader/detail/Encoding.hpp" + +#include "ParseState.hpp" // IWYU pragma: keep +#include "detail/InternalConcepts.hpp" +#include "detail/dsl.hpp" +#include "v2script/ParseState.hpp" + +namespace ovdl::convert { + struct MappedChar { + char value; + std::string_view utf8; + + constexpr bool is_invalid() const { return value == 0; } + constexpr bool is_pass() const { return value == 1; } + }; + constexpr MappedChar invalid_map { 0, "" }; + constexpr MappedChar pass_map { 1, "" }; + + struct map_value { + std::string_view _value; + + constexpr map_value() noexcept : _value("") {} + constexpr map_value(std::nullptr_t) noexcept : _value("\0") {} + constexpr explicit map_value(std::string_view val) noexcept : _value(val) {} + + constexpr bool is_invalid() const { + return !_value.empty() && _value[0] == '\0'; + } + + constexpr bool is_pass() const { + return _value.empty(); + } + + constexpr bool is_valid() const noexcept { + return !_value.empty() && _value[0] != '\0'; + } + + constexpr explicit operator bool() const noexcept { + return is_valid(); + } + }; + + template + concept IsConverter = requires(unsigned char c, lexy::_pr>& reader) { + { T::try_parse(reader) } -> std::same_as; + }; + + struct Utf8 { + static constexpr auto map = lexy::symbol_table; + + template + static constexpr map_value try_parse(Reader& reader) { + return {}; + } + }; + static_assert(IsConverter); + + struct Windows1252 { + static constexpr auto map = lexy::symbol_table // + .map<'\x80'>("€") + .map<'\x82'>("‚") + .map<'\x83'>("ƒ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + 
.map<'\x88'>("ˆ") + .map<'\x89'>("‰") + .map<'\x8A'>("Š") + .map<'\x8B'>("‹") + .map<'\x8C'>("Œ") + .map<'\x8E'>("Ž") + + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x98'>("˜") + .map<'\x99'>("™") + .map<'\x9A'>("š") + .map<'\x9B'>("›") + .map<'\x9C'>("œ") + .map<'\x9E'>("ž") + .map<'\x9F'>("Ÿ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("¡") + .map<'\xA2'>("¢") + .map<'\xA3'>("£") + .map<'\xA4'>("¤") + .map<'\xA5'>("¥") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("¨") + .map<'\xA9'>("©") + .map<'\xAA'>("ª") + .map<'\xAB'>("«") + .map<'\xAC'>("¬") + .map<'\xAD'>("­") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("¯") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("²") + .map<'\xB3'>("³") + .map<'\xB4'>("´") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("¸") + .map<'\xB9'>("¹") + .map<'\xBA'>("º") + .map<'\xBB'>("»") + .map<'\xBC'>("¼") + .map<'\xBD'>("½") + .map<'\xBE'>("¾") + .map<'\xBF'>("¿") + + .map<'\xC0'>("À") + .map<'\xC1'>("Á") + .map<'\xC2'>("Â") + .map<'\xC3'>("Ã") + .map<'\xC4'>("Ä") + .map<'\xC5'>("Å") + .map<'\xC6'>("Æ") + .map<'\xC7'>("Ç") + .map<'\xC8'>("È") + .map<'\xC9'>("É") + .map<'\xCA'>("Ê") + .map<'\xCB'>("Ë") + .map<'\xCC'>("Ì") + .map<'\xCD'>("Í") + .map<'\xCE'>("Î") + .map<'\xCF'>("Ï") + + .map<'\xD0'>("Ð") + .map<'\xD1'>("Ñ") + .map<'\xD2'>("Ò") + .map<'\xD3'>("Ó") + .map<'\xD4'>("Ô") + .map<'\xD5'>("Õ") + .map<'\xD6'>("Ö") + .map<'\xD7'>("×") + .map<'\xD8'>("Ø") + .map<'\xD9'>("Ù") + .map<'\xDA'>("Ú") + .map<'\xDB'>("Û") + .map<'\xDC'>("Ü") + .map<'\xDD'>("Ý") + .map<'\xDE'>("Þ") + .map<'\xDF'>("ß") + + .map<'\xE0'>("à") + .map<'\xE1'>("á") + .map<'\xE2'>("â") + .map<'\xE3'>("ã") + .map<'\xE4'>("ä") + .map<'\xE5'>("å") + .map<'\xE6'>("æ") + .map<'\xE7'>("ç") + .map<'\xE8'>("è") + .map<'\xE9'>("é") + .map<'\xEA'>("ê") + .map<'\xEB'>("ë") + .map<'\xEC'>("ì") + .map<'\xED'>("í") + 
.map<'\xEE'>("î") + .map<'\xEF'>("ï") + + .map<'\xF0'>("ð") + .map<'\xF1'>("ñ") + .map<'\xF2'>("ò") + .map<'\xF3'>("ó") + .map<'\xF4'>("ô") + .map<'\xF5'>("õ") + .map<'\xF6'>("ö") + .map<'\xF7'>("÷") + .map<'\xF8'>("ø") + .map<'\xF9'>("ù") + .map<'\xFA'>("ú") + .map<'\xFB'>("û") + .map<'\xFC'>("ü") + .map<'\xFD'>("ý") + .map<'\xFE'>("þ") + .map<'\xFF'>("ÿ"); + + template + static constexpr map_value try_parse(Reader& reader) { + auto index = map.try_parse(reader); + if (index) { + return map_value(map[index]); + } + return {}; + } + }; + static_assert(IsConverter); + + struct Windows1251 { + static constexpr auto map = lexy::symbol_table // + .map<'\x80'>("Ђ") + .map<'\x81'>("Ѓ") + .map<'\x82'>("‚") + .map<'\x83'>("ѓ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("€") + .map<'\x89'>("‰") + .map<'\x8A'>("Љ") + .map<'\x8B'>("‹") + .map<'\x8C'>("Њ") + .map<'\x8D'>("Ќ") + .map<'\x8E'>("Ћ") + .map<'\x8F'>("Џ") + + .map<'\x90'>("ђ") + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x99'>("™") + .map<'\x9A'>("љ") + .map<'\x9B'>("›") + .map<'\x9C'>("њ") + .map<'\x9D'>("ќ") + .map<'\x9E'>("ћ") + .map<'\x9F'>("џ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("Ў") + .map<'\xA2'>("ў") + .map<'\xA3'>("Ј") + .map<'\xA4'>("¤") + .map<'\xA5'>("Ґ") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("Ё") + .map<'\xA9'>("©") + .map<'\xAA'>("Є") + .map<'\xAB'>("«") + .map<'\xAC'>("¬") + .map<'\xAD'>("­") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("Ї") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("І") + .map<'\xB3'>("і") + .map<'\xB4'>("ґ") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("ё") + .map<'\xB9'>("№") + .map<'\xBA'>("є") + .map<'\xBB'>("»") + .map<'\xBC'>("ј") + .map<'\xBD'>("Ѕ") + .map<'\xBE'>("ѕ") + .map<'\xBF'>("ї") + + .map<'\xC0'>("А") + .map<'\xC1'>("Б") + .map<'\xC2'>("В") + 
.map<'\xC3'>("Г") + .map<'\xC4'>("Д") + .map<'\xC5'>("Е") + .map<'\xC6'>("Ж") + .map<'\xC7'>("З") + .map<'\xC8'>("И") + .map<'\xC9'>("Й") + .map<'\xCA'>("К") + .map<'\xCB'>("Л") + .map<'\xCC'>("М") + .map<'\xCD'>("Н") + .map<'\xCE'>("О") + .map<'\xCF'>("П") + + .map<'\xD0'>("Р") + .map<'\xD1'>("С") + .map<'\xD2'>("Т") + .map<'\xD3'>("У") + .map<'\xD4'>("Ф") + .map<'\xD5'>("Х") + .map<'\xD6'>("Ц") + .map<'\xD7'>("Ч") + .map<'\xD8'>("Ш") + .map<'\xD9'>("Щ") + .map<'\xDA'>("Ъ") + .map<'\xDB'>("Ы") + .map<'\xDC'>("Ь") + .map<'\xDD'>("Э") + .map<'\xDE'>("Ю") + .map<'\xDF'>("Я") + + .map<'\xE0'>("а") + .map<'\xE1'>("б") + .map<'\xE2'>("в") + .map<'\xE3'>("г") + .map<'\xE4'>("д") + .map<'\xE5'>("е") + .map<'\xE6'>("ж") + .map<'\xE7'>("з") + .map<'\xE8'>("и") + .map<'\xE9'>("й") + .map<'\xEA'>("к") + .map<'\xEB'>("л") + .map<'\xEC'>("м") + .map<'\xED'>("н") + .map<'\xEE'>("о") + .map<'\xEF'>("п") + + .map<'\xF0'>("р") + .map<'\xF1'>("с") + .map<'\xF2'>("т") + .map<'\xF3'>("у") + .map<'\xF4'>("ф") + .map<'\xF5'>("х") + .map<'\xF6'>("ц") + .map<'\xF7'>("ч") + .map<'\xF8'>("ш") + .map<'\xF9'>("щ") + .map<'\xFA'>("ъ") + .map<'\xFB'>("ы") + .map<'\xFC'>("ь") + .map<'\xFD'>("э") + .map<'\xFE'>("ю") + .map<'\xFF'>("я"); + + template + static constexpr map_value try_parse(Reader& reader) { + auto index = map.try_parse(reader); + if (index) { + return map_value(map[index]); + } + return {}; + } + }; + static_assert(IsConverter); + + template + constexpr map_value try_parse_map(detail::Encoding&& encoding, Reader& reader) { + switch (encoding) { + case detail::Encoding::Unknown: + case detail::Encoding::Ascii: + case detail::Encoding::Utf8: return Utf8::try_parse(reader); + case detail::Encoding::Windows1251: return Windows1251::try_parse(reader); + case detail::Encoding::Windows1252: return Windows1252::try_parse(reader); + } + ovdl::detail::unreachable(); + } + + template + using _string_char_type = LEXY_DECAY_DECLTYPE(LEXY_DECLVAL(String)[0]); + + template + concept 
IsErrorHandler = + std::is_convertible_v // + && requires(T t, ovdl::v2script::ast::ParseState& state, lexy::_pr> reader) { + { T::on_invalid_character(state, reader) }; + }; + + struct EmptyHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) {} + }; + + template> Error = EmptyHandler> + constexpr auto convert_as_string = + dsl::sink( + lexy::fold_inplace( + std::initializer_list<_string_char_type> {}, // + [](String& result, detail::IsStateType auto& state, CharT c) { + if constexpr (std::is_convertible_v) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + map_value val = {}; + CharT char_array[] { c, CharT() }; + auto input = lexy::range_input(&char_array[0], &char_array[1]); + auto reader = input.reader(); + + // prefer preserving unknown conversion maps, least things will work, they'll just probably display wrong + // map = make_map_from(state.encoding(), c); + val = try_parse_map(state.encoding(), reader); + + // Invalid characters are dropped + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + return; + } + + // non-pass characters are not valid ascii and are mapped to utf8 values + if (!val.is_pass()) { + result.append(val._value); + return; + } + + break; + } + } + } + + result.push_back(c); // + }, // + [](String& result, detail::IsStateType auto& state, String&& str) { + if constexpr (std::is_convertible_v) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + auto input = lexy::string_input(str); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(str.size()); }) { + result.reserve(str.size()); + } + + auto begin = reader.position(); + auto last_it = begin; + while 
(reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + break; + } + } + } + + result.append(LEXY_MOV(str)); // + }, // + [](String& result, detail::IsStateType auto& state, Iterator begin, Iterator end) // + -> decltype(void(LEXY_DECLVAL(Str).append(begin, end))) { + if constexpr (std::is_convertible_v) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + auto input = lexy::range_input(begin, end); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(end - begin); }) { + result.reserve(end - begin); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + break; + } + } + } + + result.append(begin, end); // + }, // + [](String& result, detail::IsStateType auto& state, lexy::lexeme lex) { + using encoding = typename Reader::encoding; + using _char_type = _string_char_type; + static_assert(lexy::char_type_compatible_with_reader, + "cannot convert lexeme to 
this string type"); + + if constexpr ((std::same_as || std::same_as) && + std::convertible_to) { + auto input = lexy::range_input(lex.begin(), lex.end()); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(lex.end() - lex.begin()); }) { + result.reserve(lex.end() - lex.begin()); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + } + + result.append(lex.begin(), lex.end()); // + })); +} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp new file mode 100644 index 0000000..1516fc7 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.cpp @@ -0,0 +1,351 @@ +#include "detail/Detect.hpp" + +using namespace ovdl; +using namespace ovdl::encoding_detect; + +static constexpr int64_t INVALID_CLASS = 255; + +std::optional Utf8Canidate::read(const std::span& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw(buffer.data(), buffer.size()); + if (is_utf8(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional AsciiCanidate::read(const std::span& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw(buffer.data(), buffer.size()); + if (is_ascii(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional NonLatinCasedCanidate::read(const std::span& buffer) { + static constexpr cbyte LATIN_LETTER = 1; + static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY 
= -20; + static constexpr int64_t NON_LATIN_ALL_CAPS_PENALTY = -40; + static constexpr int64_t NON_LATIN_CAPITALIZATION_BONUS = 40; + static constexpr int64_t LATIN_ADJACENCY_PENALTY = -50; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_ascii == 0 && ascii; + const bool non_ascii_alphabetic = score_data.is_non_latin_alphabetic(caseless_class); + + if (caseless_class == LATIN_LETTER) { + case_state = CaseState::Mix; + } else if (!non_ascii_alphabetic) { + switch (case_state) { + default: break; + case CaseState::UpperLower: + score += NON_LATIN_CAPITALIZATION_BONUS; + break; + case CaseState::AllCaps: + // pass + break; + case CaseState::Mix: + score += NON_LATIN_MIXED_CASE_PENALTY * current_word_len; + break; + } + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Lower; + break; + case CaseState::Upper: + case_state = CaseState::UpperLower; + break; + case CaseState::AllCaps: + case_state = CaseState::Mix; + break; + } + } else { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case_state = CaseState::AllCaps; + break; + case CaseState::Lower: + case CaseState::UpperLower: + case_state = CaseState::Mix; + break; + } + } + + if (non_ascii_alphabetic) { + current_word_len += 1; + } else { + if (current_word_len > longest_word) { + longest_word = current_word_len; + } + current_word_len = 0; + } + + const bool is_a0 = b == 0xA0; + + if (!ascii_pair) { + // 0xA0 is no-break space in many other encodings, so avoid + // assigning score to IBM866 when 0xA0 occurs next to itself + // or a space-like byte. 
+ if (!(ibm866 && ((is_a0 && (prev_was_a0 || prev == 0)) || caseless_class == 0 && prev_was_a0))) { + score += score_data.score(caseless_class, prev); + } + + if (prev == LATIN_LETTER && + non_ascii_alphabetic) { + score += LATIN_ADJACENCY_PENALTY; + } else if (caseless_class == LATIN_LETTER && score_data.is_non_latin_alphabetic(prev)) { + score += LATIN_ADJACENCY_PENALTY; + } + } + + prev_ascii = ascii; + prev = caseless_class; + prev_was_a0 = is_a0; + } + return score; +} + +std::optional LatinCanidate::read(const std::span& buffer) { + static constexpr int64_t IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY = -180; + static constexpr int64_t ORDINAL_BONUS = 300; + static constexpr int64_t COPYRIGHT_BONUS = 222; + static constexpr int64_t IMPLAUSIBILITY_PENALTY = -220; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_non_ascii == 0 && ascii; + + int16_t non_ascii_penalty = -200; + switch (prev_non_ascii) { + case 0: + case 1: + case 2: + non_ascii_penalty = 0; + break; + case 3: + non_ascii_penalty = -5; + break; + case 4: + non_ascii_penalty = 20; + break; + } + score += non_ascii_penalty; + + if (!score_data.is_latin_alphabetic(caseless_class)) { + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + if (case_state == CaseState::AllCaps && !ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + case_state = CaseState::Lower; + } else { + switch (case_state) { + case CaseState::Lower: + if (!ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + [[fallthrough]]; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case CaseState::AllCaps: + case_state = CaseState::AllCaps; + break; + } + } + + bool ascii_ish_pair = ascii_pair || (ascii && 
prev == 0) || (caseless_class == 0 && prev_non_ascii == 0); + + if (!ascii_ish_pair) { + score += score_data.score(caseless_class, prev); + } + + if (windows1252) { + switch (ordinal_state) { + case OrdinalState::Other: + if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } + break; + case OrdinalState::Space: + if (caseless_class == 0) { + // pass + } else if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (b == 'M' || b == 'D' || b == 'S') { + ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; + } else if (b == 'N') { + // numero or Nuestra + ordinal_state = OrdinalState::UpperN; + } else if (b == 'n') { + // numero + ordinal_state = OrdinalState::LowerN; + } else if (caseless_class == ASCII_DIGIT) { + ordinal_state = OrdinalState::Digit; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24) + /* X */ + { + ordinal_state = OrdinalState::Roman; + } else if (b == 0xA9) { + ordinal_state = OrdinalState::Copyright; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpace: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + case OrdinalState::OrdinalExpectingSpaceUndoImplausibility: + if (caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigit: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily: + if (caseless_class == 0) { + 
score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::UpperN: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::LowerN: + if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::FeminineAbbreviationStartLetter: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Digit: + if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Roman: + if (b == 0xAA || b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24) + 
/* X */ + { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::PeriodAfterN: + if (b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Copyright: + if (caseless_class == 0) { + score += COPYRIGHT_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + } + } + + if (ascii) { + prev_non_ascii = 0; + } else { + prev_non_ascii += 1; + } + prev = caseless_class; + } + return score; +} + +template struct ovdl::encoding_detect::DetectUtf8; +template struct ovdl::encoding_detect::DetectUtf8; diff --git a/src/openvic-dataloader/detail/Detect.hpp b/src/openvic-dataloader/detail/Detect.hpp new file mode 100644 index 0000000..ad36d04 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.hpp @@ -0,0 +1,627 @@ +/// Based heavily on https://github.com/hsivonen/chardetng/tree/143dadde20e283a46ef33ba960b517a3283a3d22 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "detail/dsl.hpp" + +namespace ovdl::encoding_detect { + using cbyte = char; + using ubyte = unsigned char; + + using Encoding = detail::Encoding; + + struct DetectAscii { + // & 0b10000000 == 0b00000000 + static constexpr auto rule = lexy::dsl::while_(lexy::dsl::ascii::character) + lexy::dsl::eol; + static constexpr auto value = lexy::constant(true); + }; + + template + struct DetectUtf8 { + struct not_utf8 { + static constexpr auto name = "not utf8"; + }; + + static constexpr auto rule = [] { + constexpr auto is_not_ascii_flag = lexy::dsl::context_flag; + + // & 0b10000000 == 0b00000000 + constexpr auto ascii_values = lexy::dsl::ascii::character; + // & 0b11100000 == 0b11000000 + constexpr auto 
two_byte = dsl::lit_b_range<0b11000000, 0b11011111>; + // & 0b11110000 == 0b11100000 + constexpr auto three_byte = dsl::lit_b_range<0b11100000, 0b11101111>; + // & 0b11111000 == 0b11110000 + constexpr auto four_byte = dsl::lit_b_range<0b11110000, 0b11110111>; + // & 0b11000000 == 0b10000000 + constexpr auto check_bytes = dsl::lit_b_range<0b10000000, 0b10111111>; + + constexpr auto utf8_check = + ((four_byte >> lexy::dsl::times<3>(check_bytes)) | + (three_byte >> lexy::dsl::times<2>(check_bytes)) | + (two_byte >> lexy::dsl::times<1>(check_bytes))) >> + is_not_ascii_flag.set(); + + return is_not_ascii_flag.template create() + + lexy::dsl::while_(utf8_check | ascii_values) + + lexy::dsl::must(is_not_ascii_flag.is_set()).template error + lexy::dsl::eof; + }(); + + static constexpr auto value = lexy::constant(true); + }; + + extern template struct DetectUtf8; + extern template struct DetectUtf8; + + template + constexpr bool is_ascii(const Input& input) { + return lexy::match(input); + } + + template + constexpr bool is_utf8_no_ascii(const Input& input) { + return lexy::match>(input); + } + + template + constexpr bool is_utf8(const Input& input) { + return lexy::match>(input); + } + + struct DetectorData { + static constexpr std::array latin_ascii = std::to_array({ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, // + 0, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 0, 0, 0, 0, 0, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, // + }); + + static constexpr std::array non_latin_ascii = std::to_array({ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, // + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, // + 0, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, // + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 0, 0, 0, 0, 0, // + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // + }); + + static constexpr std::array windows_1251 = std::to_array({ + 131, 130, 0, 2, 0, 0, 0, 0, 0, 0, 132, 0, 133, 130, 134, 135, // + 3, 0, 0, 0, 0, 0, 0, 0, 255, 0, 4, 0, 5, 2, 6, 7, // + 0, 136, 8, 140, 47, 130, 46, 47, 138, 49, 139, 49, 50, 46, 48, 141, // + 49, 50, 137, 9, 2, 49, 48, 46, 10, 47, 11, 48, 12, 130, 2, 13, // + 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, // + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, // + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, // + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, // + }); + + static constexpr std::array windows_1252 = std::to_array({ + 0, 255, 0, 60, 0, 0, 0, 0, 0, 0, 156, 0, 157, 255, 185, 255, // + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 29, 255, 57, 186, // + 0, 62, 60, 60, 60, 60, 59, 60, 60, 62, 60, 59, 63, 59, 61, 60, // + 62, 63, 61, 61, 60, 62, 61, 59, 60, 61, 60, 59, 62, 62, 62, 62, // + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, // + 188, 174, 175, 176, 177, 178, 179, 63, 180, 181, 182, 183, 184, 188, 188, 27, // + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, // + 60, 46, 47, 48, 49, 50, 51, 63, 52, 53, 54, 55, 56, 60, 60, 58, // + }); + + // clang-format off + static constexpr std::array cyrillic = std::to_array({ + 0, 0, 0, 0, 1, 0, 16, 38, 0, 2, 5, 10,121, 4, 20, 25, 26, 53, 9, 5, 61, 23, 20, 26, 15, 95, 60, 2, 26, 15, 25, 29, 0, 14, 6, 6, 25, 1, 0, 27, 25, 8, 5, 39, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, // ѓ, + 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ђ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 4, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // љ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255, 5, 0, 0, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // њ, + 0, 0,255, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ћ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // џ, + 7, 0, 0,255,255,255,255,255, 0, 1, 0,255,255,255, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 1, 0, 0, 0, 1, // ў, + 12, 0, 0,255,255, 0,255,255, 0, 2, 0, 0, 0, 0, 2, 3, 15, 5, 5, 0, 0, 4, 0, 0, 21, 15, 10, 17, 0, 6, 14, 4, 6, 0, 3, 1, 8, 1, 0, 0, 0, 2, 0, 0, 0, 0, // і, + 0, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ё, + 6, 0, 0,255,255,255,255,255, 0, 0,255, 5,255, 0, 1, 7, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // є, + 12, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 1, 0, 0, 0, 2, 0, 0, 20,255, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, // ј, + 9, 0, 0,255,255,255,255,255,255, 5,255, 0, 0, 13, 3, 3, 0, 4, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 1, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ї, + 32, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 28, 0, 23, 22, 26, 22, 19, 0, 3, 12, 5, 0, 
44, 38, 18, 58, 1, 21, 44, 17, 54, 1, 2, 28, 5, 8, 3, 1, 9, 0, 12, 0, 0, 0, // а, + 40, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7, 0, 0, 0, 1, 7, 0, 1, 1, 0, 0, 7, 4, 1, 9, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // б, + 31, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 3, 0, 0, 19, 0, 0, 1, 1, 6, 0, 2, 6, 0, 1, 0, 1, 0, 32, 0, 2, 2, 23, 9, 0, 0, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, // в, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 1, 20, 0, 0, 1, 0, 9, 0, 0, 9, 7, 0, 5, 2, 18, 11, 0, 8, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 13, 0, 3, // г, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 2, 0, 2, 19, 0, 1, 5, 0, 13, 2, 2, 3, 2, 0, 6, 1, 12, 30, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, // д, + 12, 0, 0, 1, 4, 5, 0, 0, 0, 0, 0, 0, 24, 1, 5, 7, 11, 3, 12, 1, 6, 6, 11, 0, 3, 15, 14, 14, 4, 8, 25, 14, 29, 0, 1, 1, 4, 8, 8, 2, 0, 3, 1, 0, 0, 0, // е, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 3, 2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ж, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 6, 0, 0, 0, 11, 8, 0, 0, 8, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, // з, + 24, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 1, 0, 1, 10, 16, 21, 22, 0, 6, 5, 6, 1, 15, 15, 8, 38, 2, 4, 27, 9, 15, 0, 3, 8, 12, 7, 6, 1, 0, 0, 0, 0, 0, 0, // и, + 6, 0, 0, 0,255,255,255,255, 0, 7, 0, 0,255, 4, 21, 0, 0, 0, 0, 5, 0, 0, 39, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0, // й, + 54, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 10, 0, 1, 0, 1, 11, 0, 0, 12, 0, 1, 2, 0, 4, 8, 0, 2, 23, 2, 4, 0, 2, 3, 3, 8, 0, 0, 3, 16, 1, 4, 3, // к, + 12, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 4, 29, 12, 4, 5, 2, 18, 0, 0, 17, 4, 5, 11, 0, 0, 21, 2, 3, 4, 1, 15, 1, 0, 0, 0, 0, 0, 4, 3, 2, 12, 0, 2, // л, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 17, 1, 0, 0, 0, 7, 0, 1, 13, 2, 0, 0, 0, 0, 13, 0, 2, 4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 4, 2, 4, 1, 1, // м, + 42, 0, 0, 0, 0, 0, 0, 0, 4, 12, 6, 7, 1, 7, 76, 0, 22, 1, 4, 27, 1, 3, 34, 30, 0, 7, 1, 13, 24, 
1, 3, 5, 3, 4, 0, 1, 0, 4, 1, 0, 2, 18, 7, 16, 0, 4, // н, + 37, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 10, 27, 22, 15, 1, 2, 3, 7, 5, 32, 11, 7, 38, 8, 21, 24, 11, 23, 0, 2, 10, 2, 2, 3, 2, 0, 0, 1, 0, 0, 0, // о, + 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 2, 0, 1, 2, 4, 0, 0, 2, 0, 6, 0, 0, 5, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // п, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 8, 0, 5, 47, 4, 6, 6, 5, 23, 0, 0, 5, 2, 6, 0, 0, 0, 23, 22, 0, 1, 14, 9, 1, 0, 1, 0, 0, 0, 7, 2, 8, 16, 0, 3, // р, + 53, 0, 0, 0, 0, 0, 0, 0, 4, 9, 2, 0, 1, 2, 21, 1, 4, 1, 2, 11, 0, 0, 12, 2, 4, 7, 1, 13, 15, 1, 4, 6, 3, 6, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 0, 1, // с, + 28, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 1, 0, 1, 32, 0, 1, 3, 0, 12, 0, 1, 22, 1, 4, 7, 1, 6, 23, 0, 14, 41, 14, 3, 0, 1, 1, 1, 21, 0, 2, 2, 6, 2, 1, 4, // т, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 2, 4, 2, 4, 6, 3, 0, 2, 0, 0, 6, 5, 6, 3, 0, 3, 7, 4, 7, 18, 1, 6, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, // у, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ф, + 41, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 30, 0, 2, 0, 0, 11, 0, 0, 5, 1, 14, 3, 0, 3, 6, 0, 7, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 4, 3, 5, 0, 0, // х, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 7, 1, 0, 1, 0, 2, 1, 0, 0, 9, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, // ц, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 1, 5, 0, 2, 0, 0, 6, 0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 1, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, // ч, + 12, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 17, 0, 0, 1, 0, 2, 0, 0, 26, 0, 0, 0, 0, 0, 22, 2, 6, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, // ш, + 2, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, // щ, + 0, 0,255,255,255,255, 0,255, 0, 0, 0,255,255,255, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 11, 0, 1, 0, 0, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, // ъ, + 1, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 3, 11, 0, 4, 0, 2, 1, 0, 0, 0, 3, 1, 16, 0, 0, 22, 2, 10, 0, 0, 0, 8, 6, 3, 0, 0, 0, 0, 0, 0, 0, // ы, + 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 7, 3, 0, 1, 13, 7, 7, 0, 35, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, // ь, + 10, 0, 0,255,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 1, 1, 10, 11, 0, 2, 2, 0, 0, 0, 9, 3, 9, 0, 0, 7, 6, 9, 0, 0, 8, 3, 2, 1, 0, 0, 0, 0, 17, 0, 0, // э, + 14, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ю, + 5, 0, 0,255,255,255,255,255, 0, 9, 0, 0,255, 0, 11, 0, 3, 0, 0, 0, 0, 2, 24, 0, 0, 5, 2, 14, 1, 0, 2, 3, 1, 0, 0, 1, 3, 0, 0, 0, 0, 16, 1, 0, 0, 0, // я, + // , a, ѓ, ђ, љ, њ, ћ, џ, ў, і, ё, є, ј, ї, а, б, в, г, д, е, ж, з, и, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ъ, ы, ь, э, ю, я, + }); + // clang-format on + + // clang-format off + static constexpr std::array western = std::to_array({ + 18, 3, 0,254, 74, 0, 5,254,254, 2, 25,254,149, 4,254, 66,148,254, 0,254,122,238, 8, 1, 20, 13,254, 35, 20, 3, 1, 0, // , + 0, 3, 0, 0, 0, 0, 0, 5, 2, 0, 86, 9, 76, 0, 0, 0,241, 0, 0, 49, 0, 0, 0, 0, 11, 2, 0, 34, 0, 1, 2, 0, // a, + 19, 0, 0, 5, 5, 0, 0, 8, 13, 5, 0, 34, 22, 0, 0, 0, 4, 0, 0, 0, 6, 1, 3, 3, 42, 37, 8, 8, 0, 67, 0, 0, // b, + 0, 0, 0, 9, 6, 1, 0, 22, 10, 1, 0, 19, 54, 1, 0, 1, 18, 3, 1, 2, 40, 7, 0, 0, 6, 0, 3, 5, 1, 34, 0, 0, // c, + 0, 0, 0, 5, 5, 0, 0, 12, 45, 16, 1, 6, 42, 0, 13, 3, 10, 0, 2, 0, 66, 11, 5, 8, 33,104, 3, 4, 0, 19, 0, 0, // d, + 63, 5, 0, 0, 0, 0, 2, 33, 15, 1, 3, 0, 87, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 49, 1, 11, 0, 3, 0, 9, 1, 0, // e, + 0, 0, 0, 8, 8, 0, 0, 10, 2, 7, 0,162, 23, 0, 13, 0, 4, 0, 0, 0, 1, 3, 0, 0, 15, 4, 0, 0, 0, 4, 0, 0, // f, + 1, 0, 0, 14, 16, 24, 0, 29, 11, 41, 0, 13, 86, 0, 14, 9, 3, 0, 0, 0, 20, 8, 7, 7, 13, 37, 14, 0, 0, 12, 0, 0, // g, + 1, 0, 0, 0, 0, 0, 0, 47, 2, 0, 0, 0, 1, 0, 7, 
0, 0, 0, 0, 0, 0, 0, 0, 29, 20, 0, 0, 0, 0, 45, 0, 0, // h, + 5, 4, 0,166,120, 0, 0,144, 0, 2, 3, 88,254, 0, 0, 0, 0, 0, 0, 3, 28,107, 0,112, 8, 2, 44, 32, 0, 3, 3, 0, // i, + 0, 0, 0, 0, 0, 0, 0, 39, 9, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 16, 18, 44, 0, 0, 0, 0, 0,255, // j, + 0, 2, 0, 0, 1, 0, 0, 48, 31, 32, 1, 60, 1, 0, 4, 0, 1, 0, 0, 0, 1, 3, 0, 2, 20, 47, 0, 0, 0, 20, 0, 0, // k, + 4, 0, 0, 12, 16, 0, 0, 54, 40, 48, 0, 64, 36, 0, 39, 6, 12, 3, 0, 0, 27, 9, 3, 24, 42, 33, 2, 9, 7, 77, 0, 0, // l, + 0, 0, 0, 14, 5, 4, 0, 60, 11, 4, 3, 48, 30, 7, 28, 1, 10, 1, 0, 0, 24, 41, 3, 3, 19, 24, 1, 8, 2, 36, 0, 0, // m, + 1, 1, 0, 24, 91, 16, 0,132, 62, 73, 1, 56, 71, 33, 78, 7, 35, 2, 3, 0, 94,254, 10, 21, 33, 38, 24, 21, 1, 61, 0, 0, // n, + 0, 1, 0, 0, 0, 0,254, 6, 0, 1, 27, 0, 13, 0, 0, 84,127, 0, 0, 62, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, // o, + 0, 0, 0, 5, 2, 0, 0, 9, 15, 0, 0, 4, 34, 0, 6, 0, 6, 0, 0, 0, 20, 12, 9, 28, 10, 22, 0, 3, 0, 7, 0, 0, // p, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 33, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // q, + 0, 0, 0, 83, 62, 1, 0,198,139,125, 0,229, 94, 54,190, 38, 18, 1, 0, 0,176, 24, 16, 29,193,181, 13, 13, 2,131, 0, 0, // r, + 1, 0, 0, 41, 34, 0, 0, 41, 24, 42, 0, 68,113, 15,159, 6, 43, 19, 4, 58, 14, 18, 1, 4, 48, 42, 4, 12, 9, 20, 0, 0, // s, + 7, 1, 0, 14, 20, 8, 0, 56, 37, 31, 0,104, 67, 14,113, 3, 50, 9, 5, 0, 89, 7, 19, 22, 13, 14, 40, 12, 15, 18, 0, 0, // t, + 0, 1, 5, 1, 2, 0, 0, 30, 0, 0, 1, 15, 2, 0, 1, 0, 1, 0, 0, 2, 4, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, // u, + 0, 2, 0, 1, 6, 0, 0, 29, 33, 13, 0, 19, 46, 0, 15, 0, 7, 0, 1, 31, 2, 2, 3, 1, 32, 27, 0, 0, 1, 1, 0, 0, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,255, // w, + 0, 0, 0, 1, 16, 0, 0, 23, 0, 0, 0, 3, 14, 0, 0, 0, 2, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, // x, + 0, 0, 0, 0, 0, 0, 0, 58, 8, 0, 0, 1, 1, 62, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 82, 0, 0, 0, 0, 0,255, // y, + 0, 0, 
0, 0, 2, 0, 0, 0, 14, 0, 0, 7, 3, 0, 6, 0, 3, 5, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, // z, + 0, 29, 0, 0, 0, 15, 0, 0, 0, 11, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 4, 0, 0,255,255, 0,255, 0,255, 0, 0,255,255,255, 0, 0, 0, 8, 0,255, 0, 0, 2, 0, 0, // ß, + 6, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 1, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // š, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255, // œ, + 107, 0, 22, 16, 18, 14, 6, 24, 46, 15, 2, 0, 42, 18, 17, 0, 36, 0, 34, 4,254, 1, 2, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, // à, + 41, 0, 10, 8, 21, 34, 5, 5, 60, 18, 5, 1, 29, 42, 26, 2, 16, 0, 27, 9, 43, 28, 7, 0, 0, 1, 4, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0,255, // á, + 24, 0, 1, 2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 5, 0, 1, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 2, 0,255, 0,255, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255, 0,255, // â, + 0, 0, 0, 1, 2, 3, 0, 1, 2, 12, 0, 0, 1, 7, 29, 4, 1,255, 11, 66, 11, 0, 1, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255,127,255,255,255,255,255, 0, 0,255, 0, 0,255,255, 0,255,255,255,255,255,255,255,255, // ã, + 134, 1, 11, 0, 25, 6, 15, 11, 61, 24,123, 95,114, 68, 53, 1, 49, 0, 60, 98,198, 0, 88, 29, 0, 6, 12, 0, 0,255, 0,255, 0, 0,118, 0,255, 0,255, 0,255, 0,255, 0,255,255, 0,255,255, 0,255, 2,255,255,255, 0, 0, 0,255, // ä, + 156, 0, 12, 14, 19, 3, 12, 47, 17, 3, 12, 5, 30, 47, 22, 0,205, 0,184, 70, 19, 0, 22, 8, 0, 6, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // å, + 26, 0, 7, 0, 4, 0, 23, 8, 
15, 0, 18, 19, 56, 23, 24, 0, 9, 0, 82, 37, 24, 0, 71, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255, 0,255, // æ, + 17,112, 0, 2, 0, 15, 0, 0, 0, 35, 0, 0, 2, 0, 59, 9, 1, 0, 36, 0, 0, 8, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // ç, + 254, 0, 9, 14, 20, 0, 15, 6, 70,144, 14, 45, 47, 92, 16, 3,123, 0, 38, 23,115, 52, 22, 42, 2, 80, 19,255, 0,255, 0, 0,255,255, 0,255,255, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 1,255,255, // è, + 152, 2, 19, 24, 85, 0, 29, 23, 26, 25, 2, 9, 43, 60, 62, 1, 32, 0,122, 45,169, 15, 13, 30, 7, 4, 8, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, // é, + 5, 0, 0, 3, 7, 0, 0, 10, 2, 3, 0, 26, 6, 6, 20, 1, 2, 0, 20, 1, 11, 5, 5, 2, 0, 0, 1,255, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0,255, // ê, + 36, 2, 23, 15, 36,143, 5, 23, 52, 52, 66, 48, 92, 57,216, 10,125, 35, 89, 58,254, 9, 24, 14, 0, 0, 8,255, 0,255, 0,255,255,255, 0, 0,255, 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0, 0,255, // ë, + 12, 0, 1, 4, 6, 0, 3, 21, 10, 0, 0, 0, 18, 8, 4, 0, 1, 0, 65, 35, 8, 3, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0,255, // ì, + 40, 72, 7, 10, 16, 2, 23, 10, 34, 0, 0, 1, 34, 15, 21, 1, 3, 0,203, 28, 58, 23, 11, 0, 10, 0, 2, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0,255,255, 0,255, 2, 0,255, // í, + 6, 5, 1, 9, 5, 0, 0, 0, 22, 0, 9, 8, 8, 6, 9, 1, 10, 0, 20, 6,182, 0, 13, 0, 0, 24, 1,255, 0,255,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255,255,255, // î, + 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 
0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, // ï, + 0,254, 0, 0, 0, 26, 0, 0, 0, 61, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ñ, + 20, 0, 56, 43, 8,162, 14, 3, 23, 19, 2,118, 31, 26, 46, 0, 20, 0, 23, 6, 24, 19, 6, 21, 5, 27, 63,255, 0,255, 0, 0,255,255,255,255,255, 3, 0,255,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, // ò, + 67, 0, 12, 15, 9, 7, 8, 66, 13,254, 3, 23, 14, 16, 16, 0, 8, 0, 29, 11, 26, 0, 5, 5, 1, 10, 13,255, 0,255,255, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0, 0,255, 0, 1, 0, 0, 0, 0,255,255,255, 0,255,255, 0,255, // ó, + 18, 3, 3, 12, 1, 0, 2, 0, 7, 0, 1, 0, 2, 2, 8, 0, 6, 0, 6, 7, 4, 0, 2, 0, 0, 0, 1,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255,255,255, // ô, + 29, 2, 0, 0, 0, 0, 0, 0, 5, 2, 22, 30, 25, 38, 19, 0, 33,255, 4, 39, 24, 0, 88, 0, 0, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 36,255,255,255,255,255, 0,255,255, 0,255, 0, 0, 6, 0,255,255,255, 0, 0, 0,255, // õ, + 44, 0, 33, 0, 25, 0,142, 5, 46, 10, 25, 32, 26, 13, 6, 0, 3, 0, 30, 8, 35, 0, 25, 5, 0, 44, 7, 0, 0,255,255, 0,255,255, 73, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, 39, 0,255,255,255, 0, 0, 0, // ö, + 52, 0, 21, 0, 57, 0,119, 12, 47, 3, 59, 33, 45, 15, 12, 0, 3, 0, 52, 82, 49, 1, 11, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ø, + 25, 0, 4, 3, 53, 0, 0, 2, 12, 72, 0, 0, 30, 0, 0,254, 0, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, // ù, + 19, 2, 1, 7, 9, 1, 12, 5, 9, 41, 1, 0, 10, 7, 9, 0, 8, 0, 12, 28, 8, 0, 0, 0, 0, 1, 0,255, 0,255,255, 0,255,255,255,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255, // ú, + 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 
0, 0, 0, 0, 45, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0,255, 0, 0,255, 0, // û, + 95, 2, 19, 0, 6, 2,121, 9, 15, 1, 5, 44, 18, 26, 7, 0, 11, 2, 68, 49, 20, 0, 2, 17, 0, 0, 6, 0, 0,255, 0,255,255,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255, 0, 0, 0, 31, 0, 0, // ü, + 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // ž, + 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, 0,255, 0,255,255,255, 0, 0, 0, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, œ, à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ž, ÿ, + + }); + // clang-format on + }; + + namespace class_size { + constexpr std::size_t cyrillic_ascii = 2; + constexpr std::size_t cyrillic_non_ascii = 44; + constexpr std::size_t western_ascii = 27; + constexpr std::size_t western_non_ascii = 32; + } + + constexpr std::size_t ASCII_DIGIT = 100; + + struct ByteScore { + const Encoding encoding; + const std::array& lower; + const std::array& upper; + const std::span probabilities; + const std::size_t ascii; + const std::size_t non_ascii; + + static inline constexpr std::optional compute_index(std::size_t x, std::size_t y, std::size_t ascii_classes, std::size_t non_ascii_classes) { + if (x == 0 && y == 0) { + return std::nullopt; + } + + if (x < ascii_classes && y < ascii_classes) { + return std::nullopt; + } + + if (y >= ascii_classes) { + return (ascii_classes * non_ascii_classes) + (ascii_classes + non_ascii_classes) * (y - ascii_classes) + x; + } + + return y * non_ascii_classes + x - ascii_classes; + } + + inline constexpr cbyte classify(cbyte byte) const { 
+ cbyte high = byte >> 7; + cbyte low = byte & 0x7F; + if (high == 0) { + return lower[low]; + } + + return upper[low]; + } + + inline constexpr bool is_latin_alphabetic(cbyte caseless_class) const { + return caseless_class > 0 && caseless_class < (ascii + non_ascii); + } + + inline constexpr bool is_non_latin_alphabetic(cbyte caseless_class) const { + return caseless_class > 1 && caseless_class < (ascii + non_ascii); + } + + inline constexpr int64_t score(cbyte current_class, cbyte previous_class) const { + constexpr std::size_t IMPLAUSABILITY_PENALTY = -220; + + constexpr std::size_t PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 0; + constexpr std::size_t IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 1; + constexpr std::size_t IMPLAUSIBLE_BEFORE_ALPHABETIC = 2; + constexpr std::size_t IMPLAUSIBLE_AFTER_ALPHABETIC = 3; + constexpr std::size_t PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE = 4; + constexpr std::size_t PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE = 5; + + std::size_t stored_boundary = ascii + non_ascii; + if (current_class < stored_boundary) { + if (previous_class < stored_boundary) { + if (auto index = compute_index(previous_class, current_class, ascii, non_ascii); index) { + ubyte b = probabilities[index.value()]; + if (b == 255) { + return IMPLAUSABILITY_PENALTY; + } + return b; + } + return 0; + } + + if (current_class == 0 || current_class == ASCII_DIGIT) { + return 0; + } + + std::size_t previous_unstored = previous_class - stored_boundary; + switch (previous_unstored) { + case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_AFTER_ALPHABETIC: + return 0; + case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_BEFORE_ALPHABETIC: + return IMPLAUSABILITY_PENALTY; + case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (current_class < ascii) { + return IMPLAUSABILITY_PENALTY; + } + return 0; + case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (current_class < ascii) { + return 
0; + } + return IMPLAUSABILITY_PENALTY; + default: + assert(previous_class == ASCII_DIGIT); + return 0; + } + } + + if (previous_class < stored_boundary) { + if (previous_class == 0 || previous_class == ASCII_DIGIT) { + return 0; + } + + std::size_t current_unstored = current_class - stored_boundary; + switch (current_unstored) { + case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_BEFORE_ALPHABETIC: + return 0; + case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_AFTER_ALPHABETIC: + return IMPLAUSABILITY_PENALTY; + case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (previous_class < ascii) { + return IMPLAUSABILITY_PENALTY; + } + return 0; + case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (previous_class < ascii) { + return 0; + } + return IMPLAUSABILITY_PENALTY; + default: + assert(current_class == ASCII_DIGIT); + return 0; + } + } + + if (current_class == ASCII_DIGIT || previous_class == ASCII_DIGIT) { + return 0; + } + + return IMPLAUSABILITY_PENALTY; + } + }; + + enum class ScoreIndex { + Windows1251, + Windows1252 + }; + + static constexpr std::array byte_scores { + ByteScore { + .encoding = Encoding::Windows1251, + .lower = DetectorData::non_latin_ascii, + .upper = DetectorData::windows_1251, + .probabilities = DetectorData::cyrillic, + .ascii = class_size::cyrillic_ascii, + .non_ascii = class_size::cyrillic_non_ascii }, + ByteScore { + .encoding = Encoding::Windows1252, + .lower = DetectorData::latin_ascii, + .upper = DetectorData::windows_1252, + .probabilities = DetectorData::western, + .ascii = class_size::western_ascii, + .non_ascii = class_size::western_non_ascii } + }; + + constexpr const ByteScore& get_byte_score(ScoreIndex index) { + return byte_scores[static_cast>(index)]; + } + + struct Utf8Canidate { + std::optional read(const std::span& buffer); + }; + + struct AsciiCanidate { + std::optional read(const std::span& buffer); + }; + + struct NonLatinCasedCanidate { + enum class 
CaseState { + Space, + Upper, + Lower, + UpperLower, + AllCaps, + Mix, + }; + + const ByteScore& score_data; + cbyte prev {}; + CaseState case_state = CaseState::Space; + bool prev_ascii = true; + uint64_t current_word_len {}; + uint64_t longest_word {}; + bool ibm866 = false; + bool prev_was_a0 = false; + + std::optional read(const std::span& buffer); + }; + + struct LatinCanidate { + enum class CaseState { + Space, + Upper, + Lower, + AllCaps, + }; + + enum class OrdinalState { + Other, + Space, + PeriodAfterN, + OrdinalExpectingSpace, + OrdinalExpectingSpaceUndoImplausibility, + OrdinalExpectingSpaceOrDigit, + OrdinalExpectingSpaceOrDigitUndoImplausibily, + UpperN, + LowerN, + FeminineAbbreviationStartLetter, + Digit, + Roman, + Copyright, + }; + + const ByteScore& score_data; + cbyte prev {}; + CaseState case_state = CaseState::Space; + uint32_t prev_non_ascii {}; + OrdinalState ordinal_state = OrdinalState::Space; // Used only when `windows1252 == true` + bool windows1252; + + constexpr LatinCanidate(const ByteScore& data) : score_data(data) { + windows1252 = data.encoding == Encoding::Windows1252; + } + + std::optional read(const std::span& buffer); + }; + + using InnerCanidate = std::variant; + + template + struct overloaded : Ts... { + using Ts::operator()...; + }; + + template + overloaded(Ts...) 
-> overloaded; + + struct Canidate { + InnerCanidate inner; + std::optional score_value; + + template + static constexpr Canidate create_canidate() { + return { + .inner = CanidateT(), + .score_value = 0 + }; + } + + template + static constexpr Canidate create_canidate(const ByteScore& score) { + return { + .inner = CanidateT { score }, + .score_value = 0 + }; + } + + static constexpr Canidate new_utf8() { + return create_canidate(); + } + + static constexpr Canidate new_ascii() { + return create_canidate(); + } + + static constexpr Canidate new_latin(ScoreIndex index) { + return create_canidate(get_byte_score(index)); + } + + static constexpr Canidate new_non_latin_cased(ScoreIndex index) { + return create_canidate(get_byte_score(index)); + } + + constexpr std::optional score(const std::span& buffer, std::size_t encoding, bool expectation_is_valid) { + if (auto old_score = score_value) { + auto new_score = std::visit([&](auto& inner) { + return inner.read(buffer); + }, + inner); + if (new_score) { + score_value = old_score.value() + new_score.value(); + } else { + score_value = std::nullopt; + } + } + + if (auto nlcc = std::get_if(&inner)) { + if (nlcc->longest_word < 2) { + return std::nullopt; + } + } + return score_value; + } + + constexpr Encoding encoding() const { + return std::visit( + overloaded { + [](const Utf8Canidate& canidate) { + return Encoding::Utf8; + }, + [](const AsciiCanidate& canidate) { + return Encoding::Ascii; + }, + [](const LatinCanidate& canidate) { + return canidate.score_data.encoding; + }, + [](const NonLatinCasedCanidate& canidate) { + return canidate.score_data.encoding; + } }, + inner); + } + }; + + struct Detector { + std::vector canidates { + Canidate::new_ascii(), + Canidate::new_utf8(), + Canidate::new_latin(ScoreIndex::Windows1252), + Canidate::new_non_latin_cased(ScoreIndex::Windows1251), + }; + + Encoding default_fallback = Encoding::Unknown; + + constexpr std::pair detect_assess(std::span buffer, bool allow_utf8 = true) { + 
int64_t max = 0; + Encoding encoding = default_fallback; // Presumes fallback, defaults to Unknown encoding if unknown (which skips conversion) + std::size_t i = 0; + for (Canidate& canidate : canidates) { + if (!allow_utf8 && canidate.encoding() == Encoding::Utf8) { + continue; + } + + if (auto score = canidate.score(buffer, i, false)) { + switch (canidate.encoding()) { + using enum Encoding; + case Ascii: + case Utf8: + return { canidate.encoding(), true }; + default: break; + } + + auto value = score.value(); + if (value > max) { + max = value; + encoding = canidate.encoding(); + } + } + i++; + } + return { encoding, max >= 0 }; + } + + constexpr Encoding detect(std::span buffer, bool allow_utf8 = true) { + return detect_assess(buffer, allow_utf8).first; + } + + template + std::pair detect_assess(const lexy::buffer& buffer, bool allow_utf8 = true) { + auto span = std::span(buffer.data(), buffer.size()); + return detect_assess(span); + } + + template + constexpr Encoding detect(const lexy::buffer& buffer, bool allow_utf8 = true) { + return detect_assess(buffer, allow_utf8).first; + } + }; +} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp deleted file mode 100644 index e9d0350..0000000 --- a/src/openvic-dataloader/detail/DetectUtf8.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include -#include - -#include "detail/dsl.hpp" - -namespace ovdl::detail { - namespace detect_utf8 { - - template - struct DetectUtf8 { - struct not_utf8 { - static constexpr auto name = "not utf8"; - }; - - static constexpr auto rule = [] { - constexpr auto is_not_ascii_flag = lexy::dsl::context_flag; - - // & 0b10000000 == 0b00000000 - constexpr auto ascii_values = dsl::make_range<0b00000000, 0b01111111>(); - // & 0b11100000 == 0b11000000 - constexpr auto two_byte = dsl::make_range<0b11000000, 0b11011111>(); - // & 0b11110000 == 0b11100000 - constexpr auto three_byte = dsl::make_range<0b11100000, 
0b11101111>(); - // & 0b11111000 == 0b11110000 - constexpr auto four_byte = dsl::make_range<0b11110000, 0b11110111>(); - // & 0b11000000 == 0b10000000 - constexpr auto check_bytes = dsl::make_range<0b10000000, 0b10111111>(); - - constexpr auto utf8_check = - ((four_byte >> lexy::dsl::times<3>(check_bytes)) | - (three_byte >> lexy::dsl::times<2>(check_bytes)) | - (two_byte >> lexy::dsl::times<1>(check_bytes))) >> - is_not_ascii_flag.set(); - - return is_not_ascii_flag.template create() + - lexy::dsl::while_(utf8_check | ascii_values) + - lexy::dsl::must(is_not_ascii_flag.is_set()).template error; - }(); - }; - } - - template - constexpr bool is_utf8_no_ascii(const Input& input) { - return lexy::match>(input); - } - - template - constexpr bool is_utf8(const Input& input) { - return lexy::match>(input); - } -} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/Errors.hpp b/src/openvic-dataloader/detail/Errors.hpp deleted file mode 100644 index fbebcc5..0000000 --- a/src/openvic-dataloader/detail/Errors.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include - -#include - -namespace ovdl::errors { - inline const ParseError make_no_file_error(std::string_view file_path) { - std::string message; - if (file_path.empty()) { - message = "File path not specified."; - } else { - message = "File '" + std::string(file_path) + "' was not found."; - } - - return ParseError { ParseError::Type::Fatal, message, 1 }; - } -} - -namespace ovdl::v2script::errors { - -} - -namespace ovdl::ovscript::errors { -} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/InternalConcepts.hpp b/src/openvic-dataloader/detail/InternalConcepts.hpp new file mode 100644 index 0000000..0c7913d --- /dev/null +++ b/src/openvic-dataloader/detail/InternalConcepts.hpp @@ -0,0 +1,127 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include + +namespace ovdl::detail { + template + concept IsFile = + 
requires(T t, const typename T::node_type* node, NodeLocation location) { + typename T::node_type; + { t.set_location(node, location) } -> std::same_as; + { t.location_of(node) } -> std::same_as; + }; + + template + concept IsAst = + requires( + T t, + const T ct, + const typename T::node_type* node, + NodeLocation loc // + ) { + requires IsFile; + typename T::root_node_type; + typename T::node_type; + requires std::derived_from; + { t.set_location(node, loc) } -> std::same_as; + { t.location_of(node) } -> std::same_as; + { t.root() } -> std::same_as; + { ct.root() } -> std::same_as; + { t.file() } -> std::same_as; + { ct.file() } -> std::same_as; + }; + + template + concept IsDiagnosticLogger = requires( + T t, + const T ct, + const char* str, + std::size_t length, + std::string_view sv, + lexy_ext::diagnostic_kind diag_kind // + ) { + typename T::error_range; + typename T::Writer; + { static_cast(ct) } -> std::same_as; + { ct.errored() } -> std::same_as; + { ct.warned() } -> std::same_as; + { ct.get_errors() } -> std::same_as; + { t.intern(str, length) } -> std::same_as; + { t.intern(sv) } -> std::same_as; + { t.intern_cstr(str, length) } -> std::same_as; + { t.intern_cstr(sv) } -> std::same_as; + { t.symbol_interner() } -> std::same_as; + { ct.symbol_interner() } -> std::same_as; + { t.error(std::declval>()) } -> std::same_as; + { t.warning(std::declval>()) } -> std::same_as; + { t.note(std::declval>()) } -> std::same_as; + { t.info(std::declval>()) } -> std::same_as; + { t.debug(std::declval>()) } -> std::same_as; + { t.fixit(std::declval>()) } -> std::same_as; + { t.help(std::declval>()) } -> std::same_as; + { t.error(sv) } -> std::same_as; + { t.warning(sv) } -> std::same_as; + { t.note(sv) } -> std::same_as; + { t.info(sv) } -> std::same_as; + { t.debug(sv) } -> std::same_as; + { t.fixit(sv) } -> std::same_as; + { t.help(sv) } -> std::same_as; + { std::move(t.error_callback().sink()).finish() } -> std::same_as; + { t.log(diag_kind, std::declval>()) } -> 
std::same_as; + }; + + template + concept IsParseState = requires( + T t, + const T ct, + typename T::ast_type::file_type&& file, + lexy::buffer&& buffer, + ovdl::detail::Encoding encoding, + const char* path // + ) { + requires IsAst; + requires IsDiagnosticLogger; + { T { std::move(file), encoding } } -> std::same_as; + { T { std::move(buffer), encoding } } -> std::same_as; + { T { path, std::move(buffer), encoding } } -> std::same_as; + { t.ast() } -> std::same_as; + { ct.ast() } -> std::same_as; + { t.logger() } -> std::same_as; + { ct.logger() } -> std::same_as; + }; + + template + concept IsFileParseState = requires( + T t, + const T ct, + typename T::file_type&& file, + lexy::buffer&& buffer, + ovdl::detail::Encoding encoding, + const char* path // + ) { + requires IsFile; + requires IsDiagnosticLogger; + { T { std::move(file), encoding } } -> std::same_as; + { T { std::move(buffer), encoding } } -> std::same_as; + { T { path, std::move(buffer), encoding } } -> std::same_as; + { t.file() } -> std::same_as; + { ct.file() } -> std::same_as; + { t.logger() } -> std::same_as; + { ct.logger() } -> std::same_as; + }; + + template + concept IsStateType = IsParseState || IsFileParseState; +} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/ParseHandler.cpp b/src/openvic-dataloader/detail/ParseHandler.cpp new file mode 100644 index 0000000..3818433 --- /dev/null +++ b/src/openvic-dataloader/detail/ParseHandler.cpp @@ -0,0 +1,347 @@ +#include "ParseHandler.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace ovdl::detail; + +#ifdef _WIN32 +#include +#include +#include + +#define WIN32_LEAN_AND_MEAN +#include +#undef WIN32_LEAN_AND_MEAN +#endif + +template +struct LangCodeLiteral { + char value[N]; + + constexpr LangCodeLiteral(const char (&str)[N]) { + std::copy_n(str, N, value); + } + + static constexpr std::integral_constant size = {}; + + constexpr const char& operator[](std::size_t index) const noexcept { + 
return value[index]; + } + + constexpr operator std::string_view() const noexcept { + return std::string_view(value, size()); + } + + constexpr bool operator==(const std::string_view view) const noexcept { + return view.size() >= size() + 1 && view.starts_with(*this) && view[size()] == '_'; + } +}; + +struct LangCodeView { + std::string_view view; + bool is_valid; + + constexpr LangCodeView() = default; + + template + constexpr LangCodeView(const char (&str)[N]) : view(str), is_valid(true) {} + + constexpr LangCodeView(char* str) : view(str) { + is_valid = view.find('_') != std::string_view::npos; + } + + constexpr std::size_t size() const noexcept { + return view.size(); + } + + constexpr const char& operator[](std::size_t index) const noexcept { + return view[index]; + } + + constexpr operator std::string_view() const noexcept { + return view; + } + + template + constexpr bool operator==(const LangCodeLiteral& literal) { + return is_valid && size() >= LangCodeLiteral::size() && view.starts_with(literal); + } +}; + +struct FallbackSetter { + std::optional& fallback; + + template + constexpr bool encoded(auto&& view) const { + if (view == LangCode) { + fallback = _Encoding; + return true; + } + return false; + }; +}; + +void ParseHandler::_detect_system_fallback_encoding() { + _system_fallback_encoding = Encoding::Unknown; + LangCodeView lang_code; + +#ifdef _WIN32 + using namespace std::string_view_literals; + + // Every Windows language id mapped to a language code according to https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/63d3d639-7fd2-4afb-abbe-0d5b5551eef8 + constexpr std::array lang_id_to_lang_code = std::to_array>({ + { 0x0001, "ar" }, + { 0x0002, "bg" }, + { 0x0003, "ca" }, + { 0x0004, "zh" }, + { 0x0005, "cs" }, + { 0x0006, "da" }, + { 0x0007, "de" }, + { 0x0008, "el" }, + { 0x0009, "en" }, + { 0x000A, "es" }, + { 0x000B, "fi" }, + { 0x000C, "fr" }, + { 0x000D, "he" }, + { 0x000E, "hu" }, + { 0x000F, "is" }, + { 0x0010, "it" }, + { 
0x0011, "ja" }, + { 0x0012, "ko" }, + { 0x0013, "nl" }, + { 0x0014, "no" }, + { 0x0015, "pl" }, + { 0x0016, "pt" }, + { 0x0017, "rm" }, + { 0x0018, "ro" }, + { 0x0019, "ru" }, + { 0x001A, "hr" }, + { 0x001B, "sk" }, + { 0x001C, "sq" }, + { 0x001D, "sv" }, + { 0x001E, "th" }, + { 0x001F, "tr" }, + { 0x0020, "ur" }, + { 0x0021, "id" }, + { 0x0022, "uk" }, + { 0x0023, "be" }, + { 0x0024, "sl" }, + { 0x0025, "et" }, + { 0x0026, "lv" }, + { 0x0027, "lt" }, + { 0x0028, "tg" }, + { 0x0029, "fa" }, + { 0x002A, "vi" }, + { 0x002B, "hy" }, + { 0x002C, "az" }, + { 0x002D, "eu" }, + { 0x002E, "hsb" }, + { 0x002F, "mk" }, + { 0x0030, "st" }, + { 0x0031, "ts" }, + { 0x0032, "tn" }, + { 0x0033, "ve" }, + { 0x0034, "xh" }, + { 0x0035, "zu" }, + { 0x0036, "af" }, + { 0x0037, "ka" }, + { 0x0038, "fo" }, + { 0x0039, "hi" }, + { 0x003A, "mt" }, + { 0x003B, "se" }, + { 0x003C, "ga" }, + { 0x003D, "yi" }, + { 0x003E, "ms" }, + { 0x003F, "kk" }, + { 0x0040, "ky" }, + { 0x0041, "sw" }, + { 0x0042, "tk" }, + { 0x0043, "uz" }, + { 0x0044, "tt" }, + { 0x0045, "bn" }, + { 0x0046, "pa" }, + { 0x0047, "gu" }, + { 0x0048, "or" }, + { 0x0049, "ta" }, + { 0x004A, "te" }, + { 0x004B, "kn" }, + { 0x004C, "ml" }, + { 0x004D, "as" }, + { 0x004E, "mr" }, + { 0x004F, "sa" }, + { 0x0050, "mn" }, + { 0x0051, "bo" }, + { 0x0052, "cy" }, + { 0x0053, "km" }, + { 0x0054, "lo" }, + { 0x0055, "my" }, + { 0x0056, "gl" }, + { 0x0057, "kok" }, + { 0x0058, "mni" }, + { 0x0059, "sd" }, + { 0x005A, "syr" }, + { 0x005B, "si" }, + { 0x005C, "chr" }, + { 0x005D, "iu" }, + { 0x005E, "am" }, + { 0x005F, "tzm" }, + { 0x0060, "ks" }, + { 0x0061, "ne" }, + { 0x0062, "fy" }, + { 0x0063, "ps" }, + { 0x0064, "fil" }, + { 0x0065, "dv" }, + { 0x0066, "bin" }, + { 0x0067, "ff" }, + { 0x0068, "ha" }, + { 0x0069, "ibb" }, + { 0x006A, "yo" }, + { 0x006B, "quz" }, + { 0x006C, "nso" }, + { 0x006D, "ba" }, + { 0x006E, "lb" }, + { 0x006F, "kl" }, + { 0x0070, "ig" }, + { 0x0071, "kr" }, + { 0x0072, "om" }, + { 0x0073, "ti" }, + { 0x0074, 
"gn" }, + { 0x0075, "haw" }, + { 0x0076, "la" }, + { 0x0077, "so" }, + { 0x0078, "ii" }, + { 0x0079, "pap" }, + { 0x007A, "arn" }, + { 0x007C, "moh" }, + { 0x007E, "br" }, + { 0x0080, "ug" }, + { 0x0081, "mi" }, + { 0x0082, "oc" }, + { 0x0083, "co" }, + { 0x0084, "gsw" }, + { 0x0085, "sah" }, + { 0x0086, "qut" }, + { 0x0087, "rw" }, + { 0x0088, "wo" }, + { 0x008C, "prs" }, + { 0x0091, "gd" }, + { 0x0092, "ku" }, + { 0x0093, "quc" } // + }); + +#pragma pack(push, 1) + struct LocaleStruct { + struct { + uint8_t language_id; + uint8_t country_id; + } language_country; + uint8_t sort_id : 4; + uint16_t reserved : 12; + }; +#pragma pack(pop) + + std::uint32_t locale_int = GetSystemDefaultLCID(); + LocaleStruct locale_id; + std::memcpy(&locale_id, &locale_int, sizeof(locale_id)); + // first 16 bits are the language-country id, next 4 bits are the sort id, last 12 bits are reserved + // within the language-country id, the first 8 bits are the language id, the last 8 bits are a country id + const std::uint8_t& lang_id = locale_id.language_country.language_id; + + for (const auto& map : lang_id_to_lang_code) { + if (map.first != lang_id) continue; + lang_code = map.second; + break; + } +#else + lang_code = std::getenv("LANG"); +#endif + + constexpr FallbackSetter setter { _system_fallback_encoding }; + + if (lang_code.size() < 2) { + _system_fallback_encoding = Encoding::Unknown; + return; + } + +#define WIN1251(LANG_CODE) \ + if (setter.encoded(lang_code)) return; + +#define WIN1252(LANG_CODE) \ + if (setter.encoded(lang_code)) return; + + // More common languages, prefer checking these first + WIN1252(en); + WIN1252(es); + WIN1252(fr); + WIN1252(de); + + WIN1251(ru); + + WIN1252(af); + WIN1252(sq); + WIN1252(eu); + WIN1252(br); + WIN1252(co); + WIN1252(fo); + WIN1252(gl); + WIN1252(is); + WIN1252(io); + WIN1252(ga); + WIN1252(id); + WIN1252(in); + WIN1252(it); + WIN1252(lb); + WIN1252(ms); + WIN1252(gv); + WIN1252(no); + WIN1252(oc); + WIN1252(pt); + WIN1252(gd); + WIN1252(sw); + WIN1252(fi); + WIN1252(da); + WIN1252(et); + WIN1252(tn); + WIN1252(ca); + 
WIN1252(rm); + WIN1252(nl); + WIN1252(sl); + WIN1252(cy); + WIN1252(hu); + + WIN1251(be); + WIN1251(uk); + WIN1251(bg); + WIN1251(kk); + WIN1251(tg); + WIN1251(sr); + WIN1251(ky); + WIN1251(mn); + WIN1251(mk); + WIN1251(mo); + + if (lang_code.size() < 3) { + return; + } + + WIN1251(mol); + + WIN1252(ast); + WIN1252(jbo); + WIN1252(gla); + WIN1252(sco); + WIN1252(sma); + WIN1252(roo); + WIN1252(swa); + WIN1252(tsn); + WIN1252(tok); + +#undef WIN1251 +#undef WIN1252 +} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/ParseHandler.hpp b/src/openvic-dataloader/detail/ParseHandler.hpp index fbec0d7..9666a5b 100644 --- a/src/openvic-dataloader/detail/ParseHandler.hpp +++ b/src/openvic-dataloader/detail/ParseHandler.hpp @@ -1,20 +1,26 @@ #pragma once +#include +#include +#include #include -#include -#include +#include #include #include #include +#include "openvic-dataloader/detail/Encoding.hpp" +#include "openvic-dataloader/detail/Utility.hpp" + #include "detail/BufferError.hpp" +#include "detail/Detect.hpp" +#include "detail/InternalConcepts.hpp" namespace ovdl::detail { - template struct ParseHandler { - std::string make_error_from(buffer_error error) { + std::string make_error_from(buffer_error error) const { switch (error) { using enum ovdl::detail::buffer_error; case buffer_is_null: @@ -30,116 +36,179 @@ namespace ovdl::detail { } } - template - constexpr void _run_load_func(detail::LoadCallback auto func, Args... 
args); - }; - - template - struct BasicFileParseHandler : ParseHandler> { - using parse_state_type = ParseState; - using encoding_type = typename parse_state_type::file_type::encoding_type; - constexpr bool is_valid() const { - if (!_parse_state) return false; - return buffer().data() != nullptr; + return is_valid_impl(); } - constexpr buffer_error load_buffer_size(const char* data, std::size_t size) { - lexy::buffer buffer(data, size); + buffer_error load_buffer_size(const char* data, std::size_t size, std::optional fallback) { + lexy::buffer buffer(data, size); if (buffer.data() == nullptr) return buffer_error::buffer_is_null; - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + return load_buffer_impl(std::move(buffer), "", fallback); } - constexpr buffer_error load_buffer(const char* start, const char* end) { - lexy::buffer buffer(start, end); + buffer_error load_buffer(const char* start, const char* end, std::optional fallback) { + lexy::buffer buffer(start, end); if (buffer.data() == nullptr) return buffer_error::buffer_is_null; - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + return load_buffer_impl(std::move(buffer), "", fallback); } - buffer_error load_file(const char* path) { - lexy::read_file_result file = lexy::read_file(path); + buffer_error load_file(const char* path, std::optional fallback) { + lexy::read_file_result file = lexy::read_file(path); + if (!file) { - _parse_state.reset(new parse_state_type { path, lexy::buffer() }); return ovdl::detail::from_underlying(ovdl::detail::to_underlying(file.error())); } - _parse_state.reset(new parse_state_type { path, std::move(file).buffer() }); - return is_valid() ? 
buffer_error::success : buffer_error::buffer_is_null; + + return load_buffer_impl(std::move(file).buffer(), path, fallback); } const char* path() const { + return path_impl(); + } + + static Encoding get_system_fallback() { + return _system_fallback_encoding.value_or(Encoding::Unknown); + } + + virtual ~ParseHandler() = default; + + protected: + constexpr virtual bool is_valid_impl() const = 0; + constexpr virtual buffer_error load_buffer_impl(lexy::buffer&& buffer, const char* path = "", std::optional fallback = std::nullopt) = 0; + virtual const char* path_impl() const = 0; + + template + static constexpr auto generate_state = [](std::optional* state, const char* path, auto&& buffer, Encoding encoding) { + if (path[0] != '\0') { + state->emplace( + path, + lexy::buffer(std::move(buffer)), + encoding); + return; + } + state->emplace(lexy::buffer(std::move(buffer)), encoding); + }; + + template + static void create_state(std::optional* state, const char* path, lexy::buffer&& buffer, std::optional fallback) { + if (!_system_fallback_encoding.has_value()) { + _detect_system_fallback_encoding(); + } + bool is_bad_fallback = false; + if (fallback.has_value()) { + is_bad_fallback = fallback.value() == Encoding::Ascii || fallback.value() == Encoding::Utf8; + if (is_bad_fallback) + fallback = _system_fallback_encoding.value(); + } else { + fallback = _system_fallback_encoding.value(); + } + auto [encoding, is_alone] = encoding_detect::Detector { .default_fallback = fallback.value() }.detect_assess(buffer); + switch (encoding) { + using enum Encoding; + case Ascii: + case Utf8: { + generate_state(state, path, std::move(buffer), encoding); + break; + } + case Unknown: + case Windows1251: + case Windows1252: { + generate_state(state, path, std::move(buffer), encoding); + break; + } + default: + ovdl::detail::unreachable(); + } + + if (!is_alone) { + (*state)->logger().info("encoding type could not be distinguished"); + } + + if (is_bad_fallback) { + 
(*state)->logger().warning("fallback encoding cannot be ascii or utf8"); + } + + if (encoding == ovdl::detail::Encoding::Unknown) { + (*state)->logger().warning("could not detect encoding"); + } + } + + private: + inline static std::optional _system_fallback_encoding = std::nullopt; + static void _detect_system_fallback_encoding(); + }; + + template + struct BasicFileParseHandler : ParseHandler { + using parse_state_type = ParseState; + + virtual constexpr bool is_valid_impl() const { + if (!_parse_state) return false; + return _parse_state.value().file().is_valid(); + } + + constexpr virtual buffer_error load_buffer_impl(lexy::buffer&& buffer, const char* path, std::optional fallback) { + if (buffer.data() == nullptr) return buffer_error::buffer_is_null; + create_state(&_parse_state, path, std::move(buffer), fallback); + return is_valid_impl() ? buffer_error::success : buffer_error::buffer_is_null; + } + + virtual const char* path_impl() const { if (!_parse_state) return ""; - return _parse_state->file().path(); + return _parse_state.value().file().path(); } parse_state_type& parse_state() { - return *_parse_state; + return _parse_state.value(); } const parse_state_type& parse_state() const { - return *_parse_state; + return _parse_state.value(); } + template constexpr const auto& buffer() const { - return _parse_state->file().buffer(); + return _parse_state.value().file().template get_buffer_as(); } protected: - std::unique_ptr _parse_state; + std::optional _parse_state; }; - template - struct BasicStateParseHandler : ParseHandler> { + template + struct BasicStateParseHandler : ParseHandler { using parse_state_type = ParseState; - using encoding_type = typename parse_state_type::ast_type::file_type::encoding_type; - constexpr bool is_valid() const { + virtual constexpr bool is_valid_impl() const { if (!_parse_state) return false; - return buffer().data() != nullptr; - } - - constexpr buffer_error load_buffer_size(const char* data, std::size_t size) { - 
lexy::buffer buffer(data, size); - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; - } - - constexpr buffer_error load_buffer(const char* start, const char* end) { - lexy::buffer buffer(start, end); - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + return _parse_state.value().ast().file().is_valid(); } - buffer_error load_file(const char* path) { - lexy::read_file_result file = lexy::read_file(path); - if (!file) { - _parse_state.reset(new parse_state_type { path, lexy::buffer() }); - return ovdl::detail::from_underlying(ovdl::detail::to_underlying(file.error())); - } - - _parse_state.reset(new parse_state_type { path, std::move(file).buffer() }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + constexpr virtual buffer_error load_buffer_impl(lexy::buffer&& buffer, const char* path, std::optional fallback) { + if (buffer.data() == nullptr) return buffer_error::buffer_is_null; + create_state(&_parse_state, path, std::move(buffer), fallback); + return is_valid_impl() ? 
buffer_error::success : buffer_error::buffer_is_null; } - const char* path() const { + virtual const char* path_impl() const { if (!_parse_state) return ""; - return _parse_state->ast().file().path(); + return _parse_state.value().ast().file().path(); } parse_state_type& parse_state() { - return *_parse_state; + return _parse_state.value(); } const parse_state_type& parse_state() const { - return *_parse_state; + return _parse_state.value(); } + template constexpr const auto& buffer() const { - return _parse_state->ast().file().buffer(); + return _parse_state.value().ast().file().template get_buffer_as(); } protected: - std::unique_ptr _parse_state; + std::optional _parse_state; }; } \ No newline at end of file diff --git a/src/openvic-dataloader/detail/Warnings.hpp b/src/openvic-dataloader/detail/Warnings.hpp index ab718bc..3a0a239 100644 --- a/src/openvic-dataloader/detail/Warnings.hpp +++ b/src/openvic-dataloader/detail/Warnings.hpp @@ -1,18 +1,17 @@ #pragma once +#include #include -#include - namespace ovdl::v2script::warnings { inline const std::string make_utf8_warning(std::string_view file_path) { - constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding."; + constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding:"; std::string message; if (file_path.empty()) { - message = "Buffer is a UTF-8 encoded string. " + std::string(message_suffix); + message = "Buffer is UTF-8 encoded. " + std::string(message_suffix); } else { - message = "File '" + std::string(file_path) + "' is a UTF-8 encoded file. " + std::string(message_suffix); + message = "File is UTF-8 encoded. 
" + std::string(message_suffix); } return message; diff --git a/src/openvic-dataloader/detail/dsl.hpp b/src/openvic-dataloader/detail/dsl.hpp index ccc1af6..fd8981a 100644 --- a/src/openvic-dataloader/detail/dsl.hpp +++ b/src/openvic-dataloader/detail/dsl.hpp @@ -1,16 +1,20 @@ #pragma once +#include // IWYU pragma: keep #include #include -#include +#include #include #include #include #include #include +#include +#include +#include "detail/InternalConcepts.hpp" #include "detail/StringLiteral.hpp" namespace ovdl::dsl { @@ -20,10 +24,46 @@ namespace ovdl::dsl { } template - constexpr auto sink(Sink sink) { + constexpr auto bind_sink(Sink sink) { return lexy::bind_sink(sink, lexy::parse_state); } + template + struct _sink_with_state { + using return_type = ReturnT; + + LEXY_EMPTY_MEMBER Sink _sink_cb; + + template + struct _sink_callback { + StateType& _state; + SinkCallback _sink_cb; + + using return_type = decltype(LEXY_MOV(_sink_cb).finish()); + + template + constexpr void operator()(Args&&... args) { + lexy::_detail::invoke(_sink_cb, _state, LEXY_FWD(args)...); + } + + constexpr return_type finish() && { return LEXY_MOV(_sink_cb).finish(); } + }; + + template + constexpr auto operator()(detail::IsStateType auto& state, Args... 
args) const -> decltype(_sink_cb(state, LEXY_FWD(args)...)) { + return _sink_cb(state, LEXY_FWD(args)...); + } + + constexpr auto sink(detail::IsStateType auto& state) const { + return _sink_callback, decltype(_sink_cb.sink())> { state, _sink_cb.sink() }; + } + }; + + template + constexpr auto sink(Sink&& sink) { + return bind_sink(_sink_with_state { LEXY_FWD(sink) }); + } + template constexpr auto collect(Callback callback) { return sink(lexy::collect(callback)); @@ -34,49 +74,76 @@ namespace ovdl::dsl { return sink(lexy::collect(callback)); } - template + template constexpr auto construct = callback( - [](StateType& state, ovdl::NodeLocation loc, auto&& arg) { - if constexpr (std::is_same_v, lexy::nullopt>) + [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&& arg) { + if constexpr (std::same_as, lexy::nullopt>) return state.ast().template create(loc); else return state.ast().template create(loc, DRYAD_FWD(arg)); }, - [](StateType& state, ovdl::NodeLocation loc, auto&&... args) { + [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&&... 
args) { return state.ast().template create(loc, DRYAD_FWD(args)...); }); - template + template constexpr auto construct_list = callback( - [](StateType& state, const char* begin, ListType&& arg, const char* end) { + [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) { return state.ast().template create(NodeLocation::make_from(begin, end), DRYAD_FWD(arg)); }, - [](StateType& state, const char* begin, lexy::nullopt, const char* end) { + [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) { return state.ast().template create(NodeLocation::make_from(begin, end)); }, - [](StateType& state, const char* begin, const char* end) { + [](detail::IsParseState auto& state, const char* begin, const char* end) { return state.ast().template create(NodeLocation::make_from(begin, end)); + }, + [](detail::IsParseState auto& state) { + return nullptr; }); - template - constexpr auto construct_list = callback( - [](StateType& state, const char* begin, ListType&& arg, const char* end) { + template + constexpr auto construct_list = callback( + [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) { return state.ast().template create(NodeLocation::make_from(begin, end), DRYAD_FWD(arg)); }, - [](StateType& state, const char* begin, lexy::nullopt, const char* end) { + [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) { return state.ast().template create(NodeLocation::make_from(begin, end)); }); - template - consteval auto make_range() { - if constexpr (LOW == HIGH) { - return ::lexy::dsl::lit_c; - } else if constexpr (LOW == (HIGH - 1)) { - return ::lexy::dsl::lit_c / ::lexy::dsl::lit_c; - } else { - return ::lexy::dsl::lit_c / make_range(); + template + struct _crange : lexyd::char_class_base<_crange> { + static_assert(LowC >= 0, "LowC cannot be less than 0"); + static_assert(HighC - LowC > 0, "LowC must be less than HighC"); + + static 
constexpr auto char_class_unicode() { + return LowC <= 0x7F && HighC <= 0x7F; } - } + + static LEXY_CONSTEVAL auto char_class_name() { + return "range"; + } + + static LEXY_CONSTEVAL auto char_class_ascii() { + lexy::_detail::ascii_set result; + if constexpr (LowC <= 0x7F && HighC <= 0x7F) + for (auto c = LowC; c <= HighC; c++) + result.insert(c); + return result; + } + + static constexpr auto char_class_match_cp([[maybe_unused]] char32_t cp) { + if constexpr (LowC <= 0x7F && HighC <= 0x7F) + return std::false_type {}; + else + return LowC <= cp && cp <= HighC; + } + }; + + template + constexpr auto lit_c_range = _crange {}; + + template + constexpr auto lit_b_range = _crange {}; template constexpr auto position_brackets = lexy::dsl::brackets(lexy::dsl::position(lexy::dsl::lit_c), lexy::dsl::position(lexy::dsl::lit_c)); @@ -89,14 +156,13 @@ namespace ovdl::dsl { template constexpr auto p = lexy::dsl::position(lexy::dsl::p); - template + template static constexpr auto default_kw_value = dsl::callback( - [](ParseType& state, NodeLocation loc) { + [](detail::IsParseState auto& state, NodeLocation loc) { return state.ast().template create(loc, state.ast().intern(Keyword.data(), Keyword.size())); }); template< - IsParseState ParseType, auto Identifier, typename RuleValue, ovdl::detail::string_literal Keyword, @@ -109,18 +175,17 @@ namespace ovdl::dsl { static constexpr auto value = Value; }; static constexpr auto rule = dsl::p >> Production; - static constexpr auto value = construct; + static constexpr auto value = construct; }; template< - IsParseState ParseType, auto Identifier, typename RuleValue, ovdl::detail::string_literal Keyword, auto Production, auto Value> - struct fkeyword_rule : keyword_rule { - using base_type = keyword_rule; + struct fkeyword_rule : keyword_rule { + using base_type = keyword_rule; struct context_t; struct rule_t : base_type::rule_t { static constexpr auto flag = lexy::dsl::context_flag; @@ -139,7 +204,7 @@ namespace ovdl::dsl { static 
constexpr auto make_flag = rule_t::flag.create(); static constexpr auto rule = dsl::p >> (rule_t::must >> rule_t::flag.set()) >> Production; - static constexpr auto value = construct; + static constexpr auto value = construct; }; template @@ -147,4 +212,71 @@ namespace ovdl::dsl { static constexpr auto flags = (Args::make_flag + ...); static constexpr auto p = (lexy::dsl::p | ...); }; + + template + struct _peek : lexyd::branch_base { + template + struct bp { + typename Reader::iterator begin; + typename Reader::marker end; + + constexpr bool try_parse(const void*, Reader reader) { + using encoding = typename Reader::encoding; + + auto parser = [&] { + if constexpr (std::same_as || std::same_as) { + // We need to match the entire rule. + return lexy::token_parser_for { reader }; + } else { + // We need to match the entire rule. + return lexy::token_parser_for { reader }; + } + }(); + + begin = reader.position(); + auto result = parser.try_parse(reader); + end = parser.end; + + return result; + } + + template + constexpr void cancel(Context& context) { + context.on(lexyd::_ev::backtracked {}, begin, end.position()); + } + + template + LEXY_PARSER_FUNC bool finish(Context& context, Reader& reader, Args&&... args) { + context.on(lexyd::_ev::backtracked {}, begin, end.position()); + return NextParser::parse(context, reader, LEXY_FWD(args)...); + } + }; + + template + struct p { + template + LEXY_PARSER_FUNC static bool parse(Context& context, Reader& reader, Args&&... args) { + bp impl {}; + if (!impl.try_parse(context.control_block, reader)) { + // Report that we've failed. + using tag = lexy::_detail::type_or; + auto err = lexy::error(impl.begin, impl.end.position()); + context.on(lexyd::_ev::error {}, err); + + // But recover immediately, as we wouldn't have consumed anything either way. 
+ } + + context.on(lexyd::_ev::backtracked {}, impl.begin, impl.end); + return NextParser::parse(context, reader, LEXY_FWD(args)...); + } + }; + + template + static constexpr _peek error = {}; + }; + + template + constexpr auto peek(Rule, RuleUtf) { + return _peek {}; + } } \ No newline at end of file diff --git a/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp b/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp index abade40..5a98b40 100644 --- a/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp +++ b/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp @@ -1,8 +1,7 @@ -#include - -#include +#include "openvic-dataloader/v2script/AbstractSyntaxTree.hpp" #include +#include #include #include @@ -23,6 +22,15 @@ ListValue::ListValue(dryad::node_ctor ctor, StatementList statements) } } +ListValue::ListValue(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) { + insert_child_list_after(nullptr, statements); + if (statements.empty()) { + _last_statement = nullptr; + } else { + _last_statement = statements.back(); + } +} + FileTree::FileTree(dryad::node_ctor ctor, StatementList statements) : node_base(ctor) { insert_child_list_after(nullptr, statements); if (statements.empty()) { @@ -32,29 +40,22 @@ FileTree::FileTree(dryad::node_ctor ctor, StatementList statements) : node_base( } } -// static void _handle_string_characters(std::string& string, bool allow_newline) { -// size_t position = 0; -// for (auto& c : string) { -// switch (c) { -// case '\r': -// case '\n': -// if (allow_newline) goto END_LOOP; -// c = ' '; -// break; -// default: break; -// } -// END_LOOP: -// position++; -// } -// } - -std::string AbstractSyntaxTree::make_list_visualizer() const { +FileTree::FileTree(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) { + insert_child_list_after(nullptr, statements); + if (statements.empty()) { + _last_node = nullptr; + } else { + _last_node = statements.back(); + } +} + +std::string 
FileAbstractSyntaxTree::make_list_visualizer() const { const int INDENT_SIZE = 2; std::string result; unsigned int level = 0; - for (auto [event, node] : dryad::traverse(_tree)) { + for (auto [event, node] : dryad::traverse(this->_tree)) { if (event == dryad::traverse_event::exit) { --level; continue; @@ -66,7 +67,7 @@ std::string AbstractSyntaxTree::make_list_visualizer() const { dryad::visit_node( node, [&](const FlatValue* value) { - result.append(value->value(_symbol_interner)); + result.append(value->value(this->_symbol_interner)); }, [&](const ListValue* value) { }, @@ -89,19 +90,19 @@ std::string AbstractSyntaxTree::make_list_visualizer() const { return result; } -std::string AbstractSyntaxTree::make_native_visualizer() const { +std::string FileAbstractSyntaxTree::make_native_visualizer() const { constexpr int INDENT_SIZE = 2; std::string result; unsigned int level = 0; dryad::visit_tree( - _tree, + this->_tree, [&](const IdentifierValue* value) { - result.append(value->value(_symbol_interner)); + result.append(value->value(this->_symbol_interner)); }, [&](const StringValue* value) { - result.append(1, '"').append(value->value(_symbol_interner)).append(1, '"'); + result.append(1, '"').append(value->value(this->_symbol_interner)).append(1, '"'); }, [&](dryad::child_visitor visitor, const ValueStatement* statement) { visitor(statement->value()); diff --git a/src/openvic-dataloader/v2script/EventGrammar.hpp b/src/openvic-dataloader/v2script/EventGrammar.hpp index 27f6459..130a233 100644 --- a/src/openvic-dataloader/v2script/EventGrammar.hpp +++ b/src/openvic-dataloader/v2script/EventGrammar.hpp @@ -11,8 +11,8 @@ #include "openvic-dataloader/NodeLocation.hpp" -#include "ParseState.hpp" #include "SimpleGrammar.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" #include "v2script/AiBehaviorGrammar.hpp" #include "v2script/EffectGrammar.hpp" @@ -28,7 +28,7 @@ namespace ovdl::v2script::grammar { struct MonthValue { static constexpr auto rule = 
lexy::dsl::p>; static constexpr auto value = dsl::callback( - [](ast::ParseState& state, ast::IdentifierValue* value) { + [](detail::IsParseState auto& state, ast::IdentifierValue* value) { bool is_number = true; for (auto* current = value->value(state.ast().symbol_interner()); *current; current++) { is_number = is_number && std::isdigit(*current); @@ -94,7 +94,7 @@ namespace ovdl::v2script::grammar { static constexpr auto value = dsl::callback( - [](ast::ParseState& state, NodeLocation loc, ast::IdentifierValue* name, ast::ListValue* list) { + [](detail::IsParseState auto& state, NodeLocation loc, ast::IdentifierValue* name, ast::ListValue* list) { static auto country_decl = state.ast().intern_cstr("country_event"); static auto province_decl = state.ast().intern_cstr("province_event"); @@ -104,7 +104,7 @@ namespace ovdl::v2script::grammar { .finish(); } - return state.ast().create(loc, name->value(state.ast().symbol_interner()) == province_decl, list); + return state.ast().template create(loc, name->value(state.ast().symbol_interner()) == province_decl, list); }); }; diff --git a/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp b/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp index 96cce99..885413c 100644 --- a/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp +++ b/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp @@ -4,9 +4,12 @@ #include #include +#include +#include +#include -#include "ParseState.hpp" #include "SimpleGrammar.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" namespace ovdl::v2script::lua::grammar { @@ -21,90 +24,118 @@ namespace ovdl::v2script::lua::grammar { template constexpr auto construct_list = v2script::grammar::construct_list; - struct ParseOptions { - }; - - template struct StatementListBlock; static constexpr auto comment_specifier = LEXY_LIT("--") >> lexy::dsl::until(lexy::dsl::newline).or_eof(); - template struct Identifier { static constexpr auto rule = 
lexy::dsl::identifier(lexy::dsl::ascii::alpha_underscore, lexy::dsl::ascii::alpha_digit_underscore); - static constexpr auto value = callback( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create(lexeme.begin(), lexeme.end(), value); - }); + static constexpr auto value = + callback( + [](detail::IsParseState auto& state, auto lexeme) { + auto value = state.ast().intern(lexeme.data(), lexeme.size()); + return state.ast().template create(lexeme.begin(), lexeme.end(), value); + }); }; - template struct Value { static constexpr auto rule = lexy::dsl::identifier(lexy::dsl::ascii::digit / lexy::dsl::lit_c<'.'> / lexy::dsl::lit_c<'-'>); - static constexpr auto value = callback( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create(lexeme.begin(), lexeme.end(), value); - }); - }; - - template - struct String { - static constexpr auto rule = [] { - // Arbitrary code points that aren't control characters. 
- auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control; - - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c) | lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'\''>))(c); - }(); - static constexpr auto value = - lexy::as_string >> - callback( - [](ast::ParseState& state, const char* begin, const std::string& str, const char* end) { - auto value = state.ast().intern(str.data(), str.length()); - return state.ast().create(begin, end, value); + callback( + [](detail::IsParseState auto& state, auto lexeme) { + auto value = state.ast().intern(lexeme.data(), lexeme.size()); + return state.ast().template create(lexeme.begin(), lexeme.end(), value); }); }; - template + struct String : lexy::scan_production, + lexy::token_production { + template + static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto c = [] { + if constexpr (std::same_as || std::same_as) { + // Arbitrary code points that aren't control characters. 
+ return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control; + } else { + return -lexy::dsl::unicode::control; + } + }(); + auto rule = lexy::dsl::quoted(c) | lexy::dsl::single_quoted(c); + auto begin = scanner.position(); + lexy::scan_result str_result; + scanner.parse(str_result, rule); + if (!scanner || !str_result) + return lexy::scan_failed; + auto end = scanner.position(); + auto str = str_result.value(); + auto value = state.ast().intern(str.data(), str.size()); + return state.ast().template create(begin, end, value); + } + + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open() | lexy::dsl::single_quoted.open()) >> lexy::dsl::scan; + static constexpr auto value = ovdl::v2script::grammar::convert_as_string >> lexy::forward; + }; + struct Expression { - static constexpr auto rule = lexy::dsl::p> | lexy::dsl::p>; + static constexpr auto rule = lexy::dsl::p | lexy::dsl::p; static constexpr auto value = lexy::forward; }; - template struct AssignmentStatement { - static constexpr auto rule = - dsl::p> >> - lexy::dsl::equal_sign >> - (lexy::dsl::p> | lexy::dsl::recurse_branch>); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto expression = lexy::dsl::p; + auto statement_list = lexy::dsl::recurse_branch; + + auto rhs_recover = lexy::dsl::recover(expression, statement_list).limit(right_brace); + auto rhs_try = lexy::dsl::try_(expression | statement_list, rhs_recover); + + auto identifier = dsl::p >> lexy::dsl::equal_sign + rhs_try; + + auto recover = lexy::dsl::recover(identifier).limit(right_brace); + return lexy::dsl::try_(identifier, recover); + }(); static constexpr auto value = callback( - [](ast::ParseState& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { - return state.ast().create(pos, name, initializer); + [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) -> ast::AssignStatement* { + if (initializer == 
nullptr) return nullptr; + return state.ast().template create(pos, name, initializer); + }, + [](detail::IsParseState auto& state, ast::Value*) { + return nullptr; + }, + [](detail::IsParseState auto& state) { + return nullptr; }); }; - template struct StatementListBlock { - static constexpr auto rule = - dsl::curly_bracketed( - lexy::dsl::opt( - lexy::dsl::list( - lexy::dsl::recurse_branch>, - lexy::dsl::trailing_sep(lexy::dsl::lit_c<','>)))); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + auto comma = lexy::dsl::lit_c<','>; + + auto assign_statement = lexy::dsl::recurse_branch; + auto assign_try = lexy::dsl::try_(assign_statement); + + auto curly_bracket = dsl::curly_bracketed.opt_list( + assign_try, + lexy::dsl::trailing_sep(comma)); + + return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace)); + }(); static constexpr auto value = lexy::as_list >> construct_list; }; - template struct File { // Allow arbitrary spaces between individual tokens. 
static constexpr auto whitespace = ovdl::v2script::grammar::whitespace_specifier | comment_specifier; - static constexpr auto rule = lexy::dsl::position + lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p>); + static constexpr auto rule = lexy::dsl::position + lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p); static constexpr auto value = lexy::as_list >> construct; }; diff --git a/src/openvic-dataloader/v2script/ModifierGrammar.hpp b/src/openvic-dataloader/v2script/ModifierGrammar.hpp index 22592d4..122a8c7 100644 --- a/src/openvic-dataloader/v2script/ModifierGrammar.hpp +++ b/src/openvic-dataloader/v2script/ModifierGrammar.hpp @@ -10,9 +10,9 @@ #include "openvic-dataloader/NodeLocation.hpp" -#include "ParseState.hpp" #include "SimpleGrammar.hpp" #include "TriggerGrammar.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" namespace ovdl::v2script::grammar { @@ -22,9 +22,9 @@ namespace ovdl::v2script::grammar { struct FactorStatement { static constexpr auto rule = lexy::dsl::position(factor_keyword) >> (lexy::dsl::equal_sign + lexy::dsl::p>); static constexpr auto value = dsl::callback( - [](ast::ParseState& state, NodeLocation loc, ast::IdentifierValue* value) { - auto* factor = state.ast().create(loc, state.ast().intern("factor")); - return state.ast().create(loc, factor, value); + [](detail::IsParseState auto& state, NodeLocation loc, ast::IdentifierValue* value) { + auto* factor = state.ast().template create(loc, state.ast().intern("factor")); + return state.ast().template create(loc, factor, value); }); }; @@ -49,9 +49,9 @@ namespace ovdl::v2script::grammar { lexy::dsl::position(modifier_keyword) >> lexy::dsl::equal_sign >> lexy::dsl::p; static constexpr auto value = dsl::callback( - [](ast::ParseState& state, NodeLocation loc, ast::ListValue* list) { - auto* factor = state.ast().create(loc, state.ast().intern("modifier")); - return state.ast().create(loc, factor, list); + [](detail::IsParseState auto& state, 
NodeLocation loc, ast::ListValue* list) { + auto* factor = state.ast().template create(loc, state.ast().intern("modifier")); + return state.ast().template create(loc, factor, list); }); }; } \ No newline at end of file diff --git a/src/openvic-dataloader/v2script/ParseState.hpp b/src/openvic-dataloader/v2script/ParseState.hpp index 8e29bf5..954e39d 100644 --- a/src/openvic-dataloader/v2script/ParseState.hpp +++ b/src/openvic-dataloader/v2script/ParseState.hpp @@ -1,23 +1,24 @@ #pragma once -#include -#include #include #include +#include "../openvic-dataloader/ParseState.hpp" +#include "AbstractSyntaxTree.hpp" +#include "File.hpp" +#include "detail/InternalConcepts.hpp" + namespace ovdl::v2script::ast { - using File = ovdl::BasicFile; - struct AbstractSyntaxTree : ovdl::BasicAbstractSyntaxTree { - using BasicAbstractSyntaxTree::BasicAbstractSyntaxTree; + + struct FileAbstractSyntaxTree : ovdl::BasicAbstractSyntaxTree, FileTree> { + using ovdl::BasicAbstractSyntaxTree, FileTree>::BasicAbstractSyntaxTree; std::string make_list_visualizer() const; std::string make_native_visualizer() const; }; - using ParseState = ovdl::ParseState; + using ParseState = ovdl::ParseState; - static_assert(IsFile, "File failed IsFile concept"); - static_assert(IsAst, "AbstractSyntaxTree failed IsAst concept"); - static_assert(IsParseState, "ParseState failed IsParseState concept"); + static_assert(detail::IsParseState, "ParseState failed IsParseState concept"); } \ No newline at end of file diff --git a/src/openvic-dataloader/v2script/Parser.cpp b/src/openvic-dataloader/v2script/Parser.cpp index eb491d5..23dada7 100644 --- a/src/openvic-dataloader/v2script/Parser.cpp +++ b/src/openvic-dataloader/v2script/Parser.cpp @@ -4,16 +4,15 @@ #include #include #include +#include #include -#include +#include #include -#include -#include -#include +#include +#include #include -#include -#include +#include #include #include @@ -29,10 +28,8 @@ #include -#include "openvic-dataloader/Error.hpp" - 
+#include "DiagnosticLogger.hpp" #include "ParseState.hpp" -#include "detail/DetectUtf8.hpp" #include "detail/NullBuff.hpp" #include "detail/ParseHandler.hpp" #include "detail/Warnings.hpp" @@ -44,29 +41,46 @@ using namespace ovdl; using namespace ovdl::v2script; -/// BufferHandler /// +/// ParseHandler /// struct Parser::ParseHandler final : detail::BasicStateParseHandler { - constexpr bool is_exclusive_utf8() const { - return detail::is_utf8_no_ascii(buffer()); - } - template std::optional parse() { - auto result = lexy::parse(buffer(), *_parse_state, _parse_state->logger().error_callback()); + if (parse_state().encoding() == ovdl::detail::Encoding::Utf8) { + parse_state().logger().warning(warnings::make_utf8_warning(path())); + } + + auto result = [&] { + switch (parse_state().encoding()) { + using enum detail::Encoding; + case Ascii: + case Utf8: + return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); + case Unknown: + case Windows1251: + case Windows1252: + return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); + default: + ovdl::detail::unreachable(); + } + }(); if (!result) { - return _parse_state->logger().get_errors(); + return parse_state().logger().get_errors(); } - _parse_state->ast().set_root(result.value()); + parse_state().ast().set_root(result.value()); return std::nullopt; } ast::FileTree* root() { - return _parse_state->ast().root(); + return parse_state().ast().root(); + } + + Parser::error_range get_errors() { + return parse_state().logger().get_errors(); } }; -/// BufferHandler /// +/// ParseHandler /// Parser::Parser() : _parse_handler(std::make_unique()) { @@ -82,29 +96,29 @@ Parser::Parser(Parser&&) = default; Parser& Parser::operator=(Parser&&) = default; Parser::~Parser() = default; -Parser Parser::from_buffer(const char* data, std::size_t size) { +Parser Parser::from_buffer(const char* data, std::size_t size, std::optional encoding_fallback) { Parser result; - return 
std::move(result.load_from_buffer(data, size)); + return std::move(result.load_from_buffer(data, size, encoding_fallback)); } -Parser Parser::from_buffer(const char* start, const char* end) { +Parser Parser::from_buffer(const char* start, const char* end, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(start, end)); + return std::move(result.load_from_buffer(start, end, encoding_fallback)); } -Parser Parser::from_string(const std::string_view string) { +Parser Parser::from_string(const std::string_view string, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_string(string)); + return std::move(result.load_from_string(string, encoding_fallback)); } -Parser Parser::from_file(const char* path) { +Parser Parser::from_file(const char* path, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } -Parser Parser::from_file(const std::filesystem::path& path) { +Parser Parser::from_file(const std::filesystem::path& path, std::optional encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } /// @@ -128,38 +142,38 @@ constexpr void Parser::_run_load_func(detail::LoadCallbackparse_state().logger().create_log(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message)); + _parse_handler->parse_state().logger().template create_log(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message)); } if (has_error() && &_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream.get()); } } -constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size) { +constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional encoding_fallback) { // Type can't be deduced? 
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback); return *this; } -constexpr Parser& Parser::load_from_buffer(const char* start, const char* end) { +constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional encoding_fallback) { // Type can't be deduced? - _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback); return *this; } -constexpr Parser& Parser::load_from_string(const std::string_view string) { - return load_from_buffer(string.data(), string.size()); +constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional encoding_fallback) { + return load_from_buffer(string.data(), string.size(), encoding_fallback); } -Parser& Parser::load_from_file(const char* path) { +Parser& Parser::load_from_file(const char* path, std::optional encoding_fallback) { set_file_path(path); // Type can be deduced?? 
- _run_load_func(std::mem_fn(&ParseHandler::load_file), path); + _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback); return *this; } -Parser& Parser::load_from_file(const std::filesystem::path& path) { - return load_from_file(path.string().c_str()); +Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional encoding_fallback) { + return load_from_file(path.string().c_str(), encoding_fallback); } /* REQUIREMENTS: @@ -173,11 +187,7 @@ bool Parser::simple_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse>(); + std::optional errors = _parse_handler->parse(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { @@ -196,14 +206,11 @@ bool Parser::event_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse(); + std::optional errors = _parse_handler->parse(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); @@ -218,14 +225,11 @@ bool Parser::decision_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse(); + std::optional errors = _parse_handler->parse(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { + _has_error 
= true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); @@ -240,14 +244,11 @@ bool Parser::lua_defines_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse>(); + std::optional errors = _parse_handler->parse(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); @@ -273,48 +274,66 @@ std::string Parser::make_list_string() const { return _parse_handler->parse_state().ast().make_list_visualizer(); } +// TODO: Remove reinterpret_cast +// WARNING: This almost certainly breaks on utf16 and utf32 encodings, luckily we don't parse in that format +// This is purely to silence the node_location errors because char8_t is useless +#define REINTERPRET_IT(IT) reinterpret_cast::encoding::char_type*>((IT)) + const FilePosition Parser::get_position(const ast::Node* node) const { if (!node || !node->is_linked_in_tree()) { return {}; } - auto node_location = _parse_handler->parse_state().ast().location_of(node); + + NodeLocation node_location; + + node_location = _parse_handler->parse_state().ast().location_of(node); + if (node_location.is_synthesized()) { - return {}; + return FilePosition {}; } - auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), node_location.begin()); - FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; - if (node_location.begin() < node_location.end()) { - auto loc_end = lexy::get_input_location(_parse_handler->buffer(), node_location.end(), loc_begin.anchor()); - result.end_line = loc_end.line_nr(); - result.end_column = loc_end.column_nr(); - } - return 
result; + return _parse_handler->parse_state().ast().file().visit_buffer( + [&](auto&& buffer) -> FilePosition { + auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.begin())); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; + if (node_location.begin() < node_location.end()) { + auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.end()), loc_begin.anchor()); + result.end_line = loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; + }); } Parser::error_range Parser::get_errors() const { - return _parse_handler->parse_state().logger().get_errors(); + return _parse_handler->get_errors(); } const FilePosition Parser::get_error_position(const error::Error* error) const { if (!error || !error->is_linked_in_tree()) { return {}; } + auto err_location = _parse_handler->parse_state().logger().location_of(error); if (err_location.is_synthesized()) { - return {}; + return FilePosition {}; } - auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin()); - FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; - if (err_location.begin() < err_location.end()) { - auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor()); - result.end_line = loc_end.line_nr(); - result.end_column = loc_end.column_nr(); - } - return result; + return _parse_handler->parse_state().ast().file().visit_buffer( + [&](auto&& buffer) -> FilePosition { + auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin())); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; + if (err_location.begin() < err_location.end()) { + auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor()); + result.end_line = 
loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; + }); } +#undef REINTERPRET_IT + void Parser::print_errors_to(std::basic_ostream& stream) const { auto errors = get_errors(); if (errors.empty()) return; @@ -324,19 +343,9 @@ void Parser::print_errors_to(std::basic_ostream& stream) const { [&](const error::BufferError* buffer_error) { stream << "buffer error: " << buffer_error->message() << '\n'; }, - [&](const error::ParseError* parse_error) { - auto position = get_error_position(parse_error); - std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n'; - }, - [&](dryad::child_visitor visitor, const error::Semantic* semantic) { - auto position = get_error_position(semantic); - std::string pos_str = ": "; - if (!position.is_empty()) { - pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - } - stream << _file_path << pos_str << semantic->message() << '\n'; - auto annotations = semantic->annotations(); + [&](dryad::child_visitor visitor, const error::AnnotatedError* annotated_error) { + stream << annotated_error->message() << '\n'; + auto annotations = annotated_error->annotations(); for (auto annotation : annotations) { visitor(annotation); } diff --git a/src/openvic-dataloader/v2script/SimpleGrammar.hpp b/src/openvic-dataloader/v2script/SimpleGrammar.hpp index 37e295f..d42ce07 100644 --- a/src/openvic-dataloader/v2script/SimpleGrammar.hpp +++ b/src/openvic-dataloader/v2script/SimpleGrammar.hpp @@ -5,10 +5,22 @@ #include #include +#include #include +#include +#include +#include +#include +#include #include - -#include "ParseState.hpp" +#include +#include +#include +#include +#include + +#include "detail/Convert.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" // Grammar Definitions // @@ -23,17 +35,28 @@ */ namespace 
ovdl::v2script::grammar { template - constexpr auto construct = dsl::construct; + constexpr auto construct = dsl::construct; template - constexpr auto construct_list = dsl::construct_list; + constexpr auto construct_list = dsl::construct_list; + + struct ConvertErrorHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) { + state.logger().warning("invalid character value '{}' found.", static_cast(reader.peek())) // + .primary(BasicNodeLocation { reader.position() }, "here") + .finish(); + } + }; + + template + constexpr auto convert_as_string = convert::convert_as_string; struct ParseOptions { /// @brief Makes string parsing avoid string escapes bool NoStringEscape; }; - static constexpr ParseOptions NoStringEscapeOption = ParseOptions { true }; - static constexpr ParseOptions StringEscapeOption = ParseOptions { false }; + static constexpr auto NoStringEscapeOption = ParseOptions { true }; + static constexpr auto StringEscapeOption = ParseOptions { false }; /* REQUIREMENTS: DAT-630 */ static constexpr auto whitespace_specifier = lexy::dsl::ascii::blank / lexy::dsl::ascii::newline; @@ -50,24 +73,28 @@ namespace ovdl::v2script::grammar { ascii / lexy::dsl::lit_b<0x8A> / lexy::dsl::lit_b<0x8C> / lexy::dsl::lit_b<0x8E> / lexy::dsl::lit_b<0x92> / lexy::dsl::lit_b<0x97> / lexy::dsl::lit_b<0x9A> / lexy::dsl::lit_b<0x9C> / - dsl::make_range<0x9E, 0x9F>() / - dsl::make_range<0xC0, 0xD6>() / - dsl::make_range<0xD8, 0xF6>() / - dsl::make_range<0xF8, 0xFF>(); + dsl::lit_b_range<0x9E, 0x9F> / + dsl::lit_b_range<0xC0, 0xD6> / + dsl::lit_b_range<0xD8, 0xF6> / + dsl::lit_b_range<0xF8, 0xFF>; static constexpr auto windows_1251_data_specifier_additions = - dsl::make_range<0x80, 0x81>() / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / + dsl::lit_b_range<0x80, 0x81> / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D> / 
lexy::dsl::lit_b<0x9F> / - dsl::make_range<0xA1, 0xA3>() / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> / + dsl::lit_b_range<0xA1, 0xA3> / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> / lexy::dsl::lit_b<0xAF> / - dsl::make_range<0xB2, 0xB4>() / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> / - dsl::make_range<0xBC, 0xBF>() / + dsl::lit_b_range<0xB2, 0xB4> / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> / + dsl::lit_b_range<0xBC, 0xBF> / lexy::dsl::lit_b<0xD7> / lexy::dsl::lit_b<0xF7>; static constexpr auto data_specifier = windows_1252_data_specifier / windows_1251_data_specifier_additions; static constexpr auto data_char_class = LEXY_CHAR_CLASS("DataSpecifier", data_specifier); + static constexpr auto utf_data_specifier = lexy::dsl::unicode::xid_continue / LEXY_ASCII_ONE_OF("+:@%&'-."); + + static constexpr auto utf_char_class = LEXY_CHAR_CLASS("DataSpecifier", utf_data_specifier); + static constexpr auto escaped_symbols = lexy::symbol_table // .map<'"'>('"') .map<'\''>('\'') @@ -79,50 +106,121 @@ namespace ovdl::v2script::grammar { .map<'r'>('\r') .map<'t'>('\t'); - static constexpr auto id = lexy::dsl::identifier(data_char_class); + static constexpr auto id = lexy::dsl::identifier(ascii); template struct SimpleGrammar { struct StatementListBlock; - struct Identifier { - static constexpr auto rule = lexy::dsl::identifier(data_char_class); - static constexpr auto value = dsl::callback( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create(ovdl::NodeLocation::make_from(lexeme.begin(), lexeme.end()), value); - }); + struct Identifier : lexy::scan_production, + lexy::token_production { + + template + static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + using char_type = typename encoding::char_type; + + std::basic_string value_result; 
+ + auto content_begin = scanner.position(); + do { + if constexpr (std::same_as || std::same_as) { + if (lexy::scan_result> ascii_result; scanner.branch(ascii_result, lexy::dsl::identifier(ascii))) { + value_result.append(ascii_result.value().begin(), ascii_result.value().end()); + continue; + } + + char_type char_array[] { *scanner.position(), char_type {} }; + auto input = lexy::range_input(&char_array[0], &char_array[1]); + auto reader = input.reader(); + convert::map_value val = convert::try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + ConvertErrorHandler::on_invalid_character(state, reader); + continue; + } + + if (!val.is_pass()) { + // non-pass characters are not valid ascii and are mapped to utf8 values + value_result.append(val._value); + scanner.parse(data_char_class); + } else { + break; + } + } else { + auto lexeme_result = scanner.template parse>(lexy::dsl::identifier(utf_char_class)); + if (lexeme_result) { + value_result.append(lexeme_result.value().begin(), lexeme_result.value().size()); + break; + } + } + } while (scanner); + auto content_end = scanner.position(); + + if (value_result.empty()) { + return lexy::scan_failed; + } + + auto value = state.ast().intern(value_result); + return state.ast().template create(ovdl::NodeLocation::make_from(content_begin, content_end), value); + } + + static constexpr auto rule = dsl::peek(data_char_class, utf_char_class) >> lexy::dsl::scan; }; /* REQUIREMENTS: * DAT-633 * DAT-634 */ - struct StringExpression { - static constexpr auto rule = [] { - if constexpr (Options.NoStringEscape) { - auto c = dsl::make_range<0x20, 0xFF>() / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>; - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c); - } else { - // Arbitrary code points that aren't control characters. - auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control; - - // Escape sequences start with a backlash. 
- // They either map one of the symbols, - // or a Unicode code point of the form uXXXX. - auto escape = lexy::dsl::backslash_escape // - .symbol(); - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c, escape); - } - }(); - - static constexpr auto value = - lexy::as_string >> - dsl::callback( - [](ast::ParseState& state, const char* begin, auto&& str, const char* end) { - auto value = state.ast().intern(str.data(), str.length()); - return state.ast().create(ovdl::NodeLocation::make_from(begin, end), value); - }); + struct StringExpression : lexy::scan_production, + lexy::token_production { + + template + static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + if constexpr (Options.NoStringEscape) { + auto c = [] { + if constexpr (std::same_as || std::same_as) { + return dsl::lit_b_range<0x20, 0xFF> / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>; + } else { + return -lexy::dsl::unicode::control; + } + }(); + return lexy::dsl::quoted(c); + } else { + // Arbitrary code points that aren't control characters. + auto c = [] { + if constexpr (std::same_as || std::same_as) { + return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control; + } else { + return -lexy::dsl::unicode::control; + } + }(); + + // Escape sequences start with a backlash. + // They either map one of the symbols, + // or a Unicode code point of the form uXXXX. 
+ auto escape = lexy::dsl::backslash_escape // + .symbol(); + return lexy::dsl::quoted(c, escape); + } + }(); + + auto begin = scanner.position(); + lexy::scan_result str_result; + scanner.parse(str_result, rule); + if (!scanner || !str_result) + return lexy::scan_failed; + auto end = scanner.position(); + auto str = str_result.value(); + auto value = state.ast().intern(str.data(), str.size()); + return state.ast().template create(ovdl::NodeLocation::make_from(begin, end), value); + } + + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open()) >> lexy::dsl::scan; + static constexpr auto value = convert_as_string >> lexy::forward; }; /* REQUIREMENTS: DAT-638 */ @@ -132,59 +230,112 @@ namespace ovdl::v2script::grammar { }; struct SimpleAssignmentStatement { - static constexpr auto rule = - dsl::p >> - (lexy::dsl::equal_sign >> - (lexy::dsl::p | lexy::dsl::recurse_branch)); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto value_expression = lexy::dsl::p; + auto statement_list_expression = lexy::dsl::recurse_branch; + + auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace); + auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover); + + auto identifier = + dsl::p >> + (lexy::dsl::equal_sign >> rhs_try); + + auto recover = lexy::dsl::recover(identifier).limit(right_brace); + return lexy::dsl::try_(identifier, recover); + }(); static constexpr auto value = construct; }; /* REQUIREMENTS: DAT-639 */ struct AssignmentStatement { - static constexpr auto rule = - dsl::p >> + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto value_expression = lexy::dsl::p; + auto statement_list_expression = lexy::dsl::recurse_branch; + + auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace); + auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, 
rhs_recover); + + auto identifier = + dsl::p >> (lexy::dsl::equal_sign >> - (lexy::dsl::p | lexy::dsl::recurse_branch) | - lexy::dsl::else_ >> lexy::dsl::return_) | - dsl::p | - lexy::dsl::recurse_branch; + rhs_try | + lexy::dsl::else_ >> lexy::dsl::return_); + + auto string_expression = dsl::p; + auto statement_list = lexy::dsl::recurse_branch; + + return identifier | string_expression | statement_list; + }(); static constexpr auto value = dsl::callback( - [](ast::ParseState& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { - return state.ast().create(pos, name, initializer); + [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create(pos, name, initializer); }, - [](ast::ParseState& state, const char* pos, ast::Value* left, lexy::nullopt = {}) { - return state.ast().create(pos, left); + [](detail::IsParseState auto& state, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create(pos, name, initializer); }, - [](ast::ParseState& state, ast::Value* left) { - return state.ast().create(state.ast().location_of(left), left); + [](detail::IsParseState auto& state, bool&, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create(pos, name, initializer); + }, + [](detail::IsParseState auto& state, bool&, bool&, const char* pos, ast::Value* name) { + return state.ast().template create(pos, name); + }, + [](detail::IsParseState auto& state, const char* pos, ast::Value* left, lexy::nullopt = {}) { + return state.ast().template create(pos, left); + }, + [](detail::IsParseState auto& state, bool&, const char* pos, ast::Value* left, lexy::nullopt = {}) { + return state.ast().template create(pos, left); + }, + [](detail::IsParseState auto& state, ast::Value* left) -> ast::ValueStatement* { + if (left == nullptr) return nullptr; + return 
state.ast().template create(state.ast().location_of(left), left); + }, + [](detail::IsParseState auto& state, bool&, ast::Value* left) -> ast::ValueStatement* { + if (left == nullptr) return nullptr; + return state.ast().template create(state.ast().location_of(left), left); }); }; /* REQUIREMENTS: DAT-640 */ struct StatementListBlock { - static constexpr auto rule = - dsl::curly_bracketed( - (lexy::dsl::opt(lexy::dsl::list(lexy::dsl::recurse_branch)) + - lexy::dsl::opt(lexy::dsl::semicolon))); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto assign_statement = lexy::dsl::recurse_branch; + + auto assign_try = lexy::dsl::try_(assign_statement); + auto assign_opt = lexy::dsl::opt(lexy::dsl::list(assign_try)); + + auto curly_bracket = dsl::curly_bracketed(assign_opt + lexy::dsl::opt(lexy::dsl::semicolon)); + + return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace)); + }(); static constexpr auto value = lexy::as_list >> dsl::callback( - [](ast::ParseState& state, const char* begin, auto&& list, const char* end) { + [](detail::IsParseState auto& state, const char* begin, auto&& list, const char* end) { if constexpr (std::is_same_v, lexy::nullopt>) { - return state.ast().create(ovdl::NodeLocation::make_from(begin, end)); + return state.ast().template create(ovdl::NodeLocation::make_from(begin, end)); } else { - return state.ast().create(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); + return state.ast().template create(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); } }, - [](ast::ParseState& state, const char* begin, auto&& list, auto&& semicolon, const char* end) { + [](detail::IsParseState auto& state, const char* begin, auto&& list, auto&& semicolon, const char* end) { if constexpr (std::is_same_v, lexy::nullopt>) { - return state.ast().create(ovdl::NodeLocation::make_from(begin, end)); + return state.ast().template create(ovdl::NodeLocation::make_from(begin, end)); } else { - return 
state.ast().create(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); + return state.ast().template create(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); } + }, + [](detail::IsParseState auto& state, lexy::nullopt fail = {}) { + return fail; }); }; }; @@ -198,22 +349,20 @@ namespace ovdl::v2script::grammar { template using SAssignStatement = typename SimpleGrammar::SimpleAssignmentStatement; - template> + template> using keyword_rule = dsl::keyword_rule< - ast::ParseState, id, ast::AssignStatement, Keyword, Production, Value>; - template> + template> using fkeyword_rule = dsl::fkeyword_rule< - ast::ParseState, id, ast::AssignStatement, Keyword, Production, Value>; template - struct File { + struct BasicFile { // Allow arbitrary spaces between individual tokens. static constexpr auto whitespace = whitespace_specifier | comment_specifier; @@ -223,4 +372,6 @@ namespace ovdl::v2script::grammar { static constexpr auto value = lexy::as_list >> construct; }; + + using File = BasicFile; } -- cgit v1.2.3-56-ga3b1