diff options
author | Spartan322 <Megacake1234@gmail.com> | 2024-05-09 16:06:02 +0200 |
---|---|---|
committer | Spartan322 <Megacake1234@gmail.com> | 2024-06-18 01:31:12 +0200 |
commit | b0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch) | |
tree | f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader | |
parent | 7b521d6023113372cf6b02e562828273c4040f0e (diff) |
Add runtime encoding detection and conversion (fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
Add manually-specified encoding fallback
Add default system encoding fallback
Add error recovery to v2script
Add unknown encoding detection warning
Remove csv::Parser templating
Fix lua files dropping data
Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
Remove exclusive reliance on lexy::default_encoding for v2script
Move internal concepts to src/openvic-detail/InternalConcepts.hpp
Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
Move openvic-dataloader/AbstractSyntaxTree.hpp to src
Move DiagnosticLogger.hpp to src
Move File.hpp to src
Move openvic-dataloader/detail/utility files to openvic-dataloader/detail
Add ovdl::utility::type_concat
Add ovdl::utility::type_prepend
Add ovdl::utility::is_instance_of
Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader')
28 files changed, 3852 insertions, 708 deletions
diff --git a/src/openvic-dataloader/AbstractSyntaxTree.cpp b/src/openvic-dataloader/AbstractSyntaxTree.cpp index 11a90dc..d6f58f7 100644 --- a/src/openvic-dataloader/AbstractSyntaxTree.cpp +++ b/src/openvic-dataloader/AbstractSyntaxTree.cpp @@ -1,4 +1,4 @@ -#include <openvic-dataloader/AbstractSyntaxTree.hpp> +#include "AbstractSyntaxTree.hpp" using namespace ovdl; diff --git a/src/openvic-dataloader/AbstractSyntaxTree.hpp b/src/openvic-dataloader/AbstractSyntaxTree.hpp new file mode 100644 index 0000000..a5b8886 --- /dev/null +++ b/src/openvic-dataloader/AbstractSyntaxTree.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include <concepts> +#include <cstdio> +#include <string_view> +#include <utility> + +#include <openvic-dataloader/NodeLocation.hpp> +#include <openvic-dataloader/detail/SymbolIntern.hpp> +#include <openvic-dataloader/detail/Utility.hpp> + +#include <dryad/node.hpp> +#include <dryad/node_map.hpp> +#include <dryad/symbol.hpp> +#include <dryad/tree.hpp> + +#include <fmt/core.h> + +#include "detail/InternalConcepts.hpp" + +namespace ovdl { + struct AbstractSyntaxTree : SymbolIntern { + symbol_type intern(const char* str, std::size_t length); + symbol_type intern(std::string_view str); + const char* intern_cstr(const char* str, std::size_t length); + const char* intern_cstr(std::string_view str); + symbol_interner_type& symbol_interner(); + const symbol_interner_type& symbol_interner() const; + + protected: + symbol_interner_type _symbol_interner; + }; + + template<detail::IsFile FileT, std::derived_from<typename FileT::node_type> RootNodeT> + struct BasicAbstractSyntaxTree : AbstractSyntaxTree { + using file_type = FileT; + using root_node_type = RootNodeT; + using node_type = typename file_type::node_type; + + explicit BasicAbstractSyntaxTree(file_type&& file) : _file { std::move(file) } {} + + template<typename Encoding, typename MemoryResource = void> + explicit BasicAbstractSyntaxTree(lexy::buffer<Encoding, MemoryResource>&& buffer) : _file { 
std::move(buffer) } {} + + void set_location(const node_type* n, NodeLocation loc) { + _file.set_location(n, loc); + } + + NodeLocation location_of(const node_type* n) const { + return _file.location_of(n); + } + + root_node_type* root() { + return _tree.root(); + } + + const root_node_type* root() const { + return _tree.root(); + } + + file_type& file() { + return _file; + } + + const file_type& file() const { + return _file; + } + + template<typename T, typename... Args> + T* create(NodeLocation loc, Args&&... args) { + auto node = _tree.template create<T>(DRYAD_FWD(args)...); + set_location(node, loc); + return node; + } + + template<typename T, typename... Args> + T* create(const char* begin, const char* end, Args&&... args) { + return create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(args)...); + } + + void set_root(root_node_type* node) { + _tree.set_root(node); + } + + protected: + dryad::tree<root_node_type> _tree; + file_type _file; + }; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/DiagnosticLogger.cpp b/src/openvic-dataloader/DiagnosticLogger.cpp index aae3dcb..9fe5e93 100644 --- a/src/openvic-dataloader/DiagnosticLogger.cpp +++ b/src/openvic-dataloader/DiagnosticLogger.cpp @@ -1,4 +1,4 @@ -#include <openvic-dataloader/DiagnosticLogger.hpp> +#include "DiagnosticLogger.hpp" using namespace ovdl; @@ -9,8 +9,7 @@ DiagnosticLogger::operator bool() const { bool DiagnosticLogger::errored() const { return _errored; } bool DiagnosticLogger::warned() const { return _warned; } - NodeLocation DiagnosticLogger::location_of(const error::Error* error) const { auto result = _map.lookup(error); - return result ? *result : NodeLocation{}; + return result ? *result : NodeLocation {}; }
\ No newline at end of file diff --git a/src/openvic-dataloader/DiagnosticLogger.hpp b/src/openvic-dataloader/DiagnosticLogger.hpp new file mode 100644 index 0000000..2a655a9 --- /dev/null +++ b/src/openvic-dataloader/DiagnosticLogger.hpp @@ -0,0 +1,492 @@ +#pragma once + +#include <concepts> // IWYU pragma: keep +#include <cstdio> +#include <ostream> +#include <string> +#include <type_traits> +#include <utility> + +#include <openvic-dataloader/Error.hpp> +#include <openvic-dataloader/NodeLocation.hpp> +#include <openvic-dataloader/detail/CallbackOStream.hpp> +#include <openvic-dataloader/detail/ErrorRange.hpp> +#include <openvic-dataloader/detail/OStreamOutputIterator.hpp> +#include <openvic-dataloader/detail/SymbolIntern.hpp> +#include <openvic-dataloader/detail/Utility.hpp> + +#include <lexy/error.hpp> +#include <lexy/input/base.hpp> +#include <lexy/input/buffer.hpp> +#include <lexy/input_location.hpp> +#include <lexy/visualize.hpp> + +#include <dryad/_detail/config.hpp> +#include <dryad/abstract_node.hpp> +#include <dryad/arena.hpp> +#include <dryad/node.hpp> +#include <dryad/node_map.hpp> +#include <dryad/tree.hpp> + +#include <fmt/core.h> + +#include <lexy_ext/report_error.hpp> + +namespace ovdl { + template<typename ParseState> + struct BasicDiagnosticLogger; + + struct DiagnosticLogger : SymbolIntern { + using AnnotationKind = lexy_ext::annotation_kind; + using DiagnosticKind = lexy_ext::diagnostic_kind; + + using error_range = detail::error_range<error::Root>; + + explicit operator bool() const; + bool errored() const; + bool warned() const; + + NodeLocation location_of(const error::Error* error) const; + + template<std::derived_from<DiagnosticLogger> Logger> + struct ErrorCallback { + ErrorCallback(Logger& logger) : _logger(&logger) {} + + struct sink_t { + using return_type = std::size_t; + + template<typename Input, typename Tag> + void operator()(lexy::error_context<Input> const& context, lexy::error_for<Input, Tag> const& error) { + using Reader = 
lexy::input_reader<Input>; + using Encoding = typename Reader::encoding; + using char_type = typename Encoding::char_type; + error::Error* result; + + std::string production_name = context.production(); + auto left_strip = production_name.find_first_of('<'); + if (left_strip != std::string::npos) { + auto right_strip = production_name.find_first_of('>', left_strip); + if (right_strip != std::string::npos) { + production_name.erase(left_strip, right_strip - left_strip + 1); + } + } + + auto context_location = lexy::get_input_location(context.input(), context.position()); + auto location = lexy::get_input_location(context.input(), error.position(), context_location.anchor()); + + if constexpr (detail::is_instance_of_v<Logger, BasicDiagnosticLogger>) { + lexy_ext::diagnostic_writer impl { context.input() }; + + BasicNodeLocation loc = [&] { + if constexpr (std::is_same_v<Tag, lexy::expected_literal>) { + return BasicNodeLocation<char_type>::make_from(error.position(), error.position() + error.index() + 1); + } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) { + return BasicNodeLocation<char_type>::make_from(error.position(), error.end()); + } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) { + return BasicNodeLocation<char_type>::make_from(error.position(), error.position() + 1); + } else { + return BasicNodeLocation<char_type>::make_from(error.position(), error.end()); + } + }(); + + auto writer = _logger.template parse_error<Tag>(impl, loc, production_name.c_str()); + if (location.line_nr() != context_location.line_nr()) + writer.secondary(BasicNodeLocation { context.position(), lexy::_detail::next(context.position()) }, "beginning here").finish(); + + if constexpr (std::is_same_v<Tag, lexy::expected_literal>) { + auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length()); + writer.primary(loc, "expected '{}'", string.data()) + .finish(); + } else if constexpr (std::is_same_v<Tag, 
lexy::expected_keyword>) { + auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length()); + writer.primary(loc, "expected keyword '{}'", string.data()) + .finish(); + } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) { + writer.primary(loc, "expected {}", error.name()) + .finish(); + } else { + writer.primary(loc, error.message()) + .finish(); + } + result = writer.error(); + } else { + auto production = _logger.intern_cstr(production_name); + if constexpr (std::is_same_v<Tag, lexy::expected_literal>) { + auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length()); + NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1); + auto message = _logger.intern_cstr(fmt::format("expected '{}'", string.data())); + result = _logger.template create<error::ExpectedLiteral>(loc, message, production); + } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) { + auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length()); + NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1); + auto message = _logger.intern_cstr(fmt::format("expected keyword '{}'", string.data())); + result = _logger.template create<error::ExpectedKeyword>(loc, message, production); + } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) { + auto message = _logger.intern_cstr(fmt::format("expected {}", error.name())); + result = _logger.template create<error::ExpectedCharClass>(error.position(), message, production); + } else { + NodeLocation loc = NodeLocation::make_from(error.begin(), error.end()); + auto message = _logger.intern_cstr(error.message()); + result = _logger.template create<error::GenericParseError>(loc, message, production); + } + } + _logger.insert(result); + + _count++; + } + + std::size_t finish() && { + return _count; + } + + Logger& _logger; + 
std::size_t _count; + }; + + constexpr auto sink() const { + return sink_t { *_logger, 0 }; + } + + mutable Logger* _logger; + }; + + template<typename T, typename LocCharT, typename... Args> + T* create(BasicNodeLocation<LocCharT> loc, Args&&... args) { + using node_creator = dryad::node_creator<decltype(DRYAD_DECLVAL(T).kind()), void>; + T* result = _tree.create<T>(DRYAD_FWD(args)...); + _map.insert(result, loc); + return result; + } + + template<typename T> + T* create() { + using node_creator = dryad::node_creator<decltype(DRYAD_DECLVAL(T).kind()), void>; + T* result = _tree.create<T>(); + return result; + } + + error_range get_errors() const { + return _tree.root()->errors(); + } + + protected: + bool _errored = false; + bool _warned = false; + dryad::node_map<const error::Error, NodeLocation> _map; + dryad::tree<error::Root> _tree; + + symbol_interner_type _symbol_interner; + + void insert(error::Error* root) { + _tree.root()->insert_back(root); + } + + public: + symbol_type intern(const char* str, std::size_t length) { + return _symbol_interner.intern(str, length); + } + symbol_type intern(std::string_view str) { + return intern(str.data(), str.size()); + } + const char* intern_cstr(const char* str, std::size_t length) { + return intern(str, length).c_str(_symbol_interner); + } + const char* intern_cstr(std::string_view str) { + return intern_cstr(str.data(), str.size()); + } + symbol_interner_type& symbol_interner() { + return _symbol_interner; + } + const symbol_interner_type& symbol_interner() const { + return _symbol_interner; + } + }; + + template<typename ParseState> + struct BasicDiagnosticLogger : DiagnosticLogger { + using parse_state_type = ParseState; + using file_type = typename parse_state_type::file_type; + + template<typename... 
Args> + using format_str = fmt::basic_format_string<char, fmt::type_identity_t<Args>...>; + + explicit BasicDiagnosticLogger(const file_type& file) + : _file(&file) { + _tree.set_root(_tree.create<error::Root>()); + } + + struct Writer; + + template<typename... Args> + Writer error(format_str<Args...> fmt, Args&&... args) { + return log(DiagnosticKind::error, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer warning(format_str<Args...> fmt, Args&&... args) { + return log(DiagnosticKind::warning, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer note(format_str<Args...> fmt, Args&&... args) { + return log(DiagnosticKind::note, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer info(format_str<Args...> fmt, Args&&... args) { + return log(DiagnosticKind::info, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer debug(format_str<Args...> fmt, Args&&... args) { + return log(DiagnosticKind::debug, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer fixit(format_str<Args...> fmt, Args&&... args) { + return log(DiagnosticKind::fixit, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer help(format_str<Args...> fmt, Args&&... 
args) { + return log(DiagnosticKind::help, fmt, std::forward<Args>(args)...); + } + + Writer error(std::string_view sv) { + return log(DiagnosticKind::error, fmt::runtime(sv)); + } + + Writer warning(std::string_view sv) { + return log(DiagnosticKind::warning, fmt::runtime(sv)); + } + + Writer note(std::string_view sv) { + return log(DiagnosticKind::note, fmt::runtime(sv)); + } + + Writer info(std::string_view sv) { + return log(DiagnosticKind::info, fmt::runtime(sv)); + } + + Writer debug(std::string_view sv) { + return log(DiagnosticKind::debug, fmt::runtime(sv)); + } + + Writer fixit(std::string_view sv) { + return log(DiagnosticKind::fixit, fmt::runtime(sv)); + } + + Writer help(std::string_view sv) { + return log(DiagnosticKind::help, fmt::runtime(sv)); + } + + auto error_callback() { + return ErrorCallback(*this); + } + + template<typename CharT> + static void _write_to_buffer(const CharT* s, std::streamsize n, void* output_str) { + auto* output = reinterpret_cast<std::basic_string<CharT>*>(output_str); + output->append(s, n); + } + + template<typename CharT> + auto make_callback_stream(std::basic_string<CharT>& output) { + return detail::make_callback_stream<CharT>(&_write_to_buffer<CharT>, reinterpret_cast<void*>(&output)); + } + + template<typename CharT> + detail::OStreamOutputIterator make_ostream_iterator(std::basic_ostream<CharT>& stream) { + return detail::OStreamOutputIterator { stream }; + } + + struct Writer { + template<typename LocCharT, typename... Args> + [[nodiscard]] Writer& primary(BasicNodeLocation<LocCharT> loc, format_str<Args...> fmt, Args&&... args) { + return annotation(AnnotationKind::primary, loc, fmt, std::forward<Args>(args)...); + } + + template<typename LocCharT, typename... Args> + [[nodiscard]] Writer& secondary(BasicNodeLocation<LocCharT> loc, format_str<Args...> fmt, Args&&... 
args) { + return annotation(AnnotationKind::secondary, loc, fmt, std::forward<Args>(args)...); + } + + template<typename LocCharT> + [[nodiscard]] Writer& primary(BasicNodeLocation<LocCharT> loc, const char* sv) { + return annotation(AnnotationKind::primary, loc, fmt::runtime(sv)); + } + + template<typename LocCharT> + [[nodiscard]] Writer& secondary(BasicNodeLocation<LocCharT> loc, const char* sv) { + return annotation(AnnotationKind::secondary, loc, fmt::runtime(sv)); + } + + void finish() {} + + template<typename LocCharT, typename... Args> + [[nodiscard]] Writer& annotation(AnnotationKind kind, BasicNodeLocation<LocCharT> loc, format_str<Args...> fmt, Args&&... args) { + std::basic_string<typename decltype(fmt.get())::value_type> output; + + _file.visit_buffer([&](auto&& buffer) { + using char_type = typename std::decay_t<decltype(buffer)>::encoding::char_type; + + BasicNodeLocation<char_type> converted_loc = loc; + + auto begin_loc = lexy::get_input_location(buffer, converted_loc.begin()); + + auto stream = _logger.make_callback_stream(output); + auto iter = _logger.make_ostream_iterator(stream); + + lexy_ext::diagnostic_writer _impl { buffer, { lexy::visualize_fancy } }; + _impl.write_empty_annotation(iter); + _impl.write_annotation(iter, kind, begin_loc, converted_loc.end(), + [&](auto out, lexy::visualization_options) { + return lexy::_detail::write_str(out, fmt::format(fmt, std::forward<Args>(args)...).c_str()); + }); + }); + + error::Annotation* annotation; + auto message = _logger.intern_cstr(output); + switch (kind) { + case AnnotationKind::primary: + annotation = _logger.create<error::PrimaryAnnotation>(loc, message); + break; + case AnnotationKind::secondary: + annotation = _logger.create<error::SecondaryAnnotation>(loc, message); + break; + default: detail::unreachable(); + } + _annotated->push_back(annotation); + return *this; + } + + error::AnnotatedError* error() { + return _annotated; + } + + private: + Writer(BasicDiagnosticLogger& logger, const 
file_type& file, error::AnnotatedError* annotated) + : _file(file), + _logger(logger), + _annotated(annotated) {} + + const file_type& _file; + BasicDiagnosticLogger& _logger; + error::AnnotatedError* _annotated; + + friend BasicDiagnosticLogger; + }; + + template<std::derived_from<error::Error> T, typename Buffer, typename... Args> + void log_with_impl(lexy_ext::diagnostic_writer<Buffer>& impl, T* error, DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { + std::basic_string<typename decltype(fmt.get())::value_type> output; + auto stream = make_callback_stream(output); + auto iter = make_ostream_iterator(stream); + + impl.write_message(iter, kind, + [&](auto out, lexy::visualization_options) { + return lexy::_detail::write_str(out, fmt::format(fmt, std::forward<Args>(args)...).c_str()); + }); + impl.write_path(iter, file().path()); + + auto message = intern_cstr(output); + error->_set_message(message); + if (!error->is_linked_in_tree()) + insert(error); + } + + template<typename Tag, typename Buffer> + Writer parse_error(lexy_ext::diagnostic_writer<Buffer>& impl, NodeLocation loc, const char* production_name) { + std::basic_string<typename Buffer::encoding::char_type> output; + auto stream = make_callback_stream(output); + auto iter = make_ostream_iterator(stream); + + impl.write_message(iter, DiagnosticKind::error, + [&](auto out, lexy::visualization_options) { + return lexy::_detail::write_str(out, fmt::format("while parsing {}", production_name).c_str()); + }); + impl.write_path(iter, file().path()); + + auto production = intern_cstr(production_name); + auto message = intern_cstr(output); + auto* error = [&] { + if constexpr (std::is_same_v<Tag, lexy::expected_literal>) { + return create<error::ExpectedLiteral>(loc, message, production); + } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) { + return create<error::ExpectedKeyword>(loc, message, production); + } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) { + 
return create<error::ExpectedCharClass>(loc, message, production); + } else { + return create<error::GenericParseError>(loc, message, production); + } + }(); + + Writer result(*this, file(), error); + _errored = true; + + return result; + } + + template<std::derived_from<error::Error> T, typename... Args> + void log_with_error(T* error, DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { + file().visit_buffer( + [&](auto&& buffer) { + lexy_ext::diagnostic_writer impl { buffer }; + log_with_impl(impl, error, kind, fmt, std::forward<Args>(args)...); + }); + } + + template<std::derived_from<error::Error> T, typename... Args> + void create_log(DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { + log_with_error(create<T>(), kind, fmt, std::forward<Args>(args)...); + } + + template<typename... Args> + Writer log(DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { + error::Semantic* semantic; + + switch (kind) { + case DiagnosticKind::error: + semantic = create<error::SemanticError>(); + break; + case DiagnosticKind::warning: + semantic = create<error::SemanticWarning>(); + break; + case DiagnosticKind::info: + semantic = create<error::SemanticInfo>(); + break; + case DiagnosticKind::debug: + semantic = create<error::SemanticDebug>(); + break; + case DiagnosticKind::fixit: + semantic = create<error::SemanticFixit>(); + break; + case DiagnosticKind::help: + semantic = create<error::SemanticHelp>(); + break; + default: detail::unreachable(); + } + + Writer result(*this, file(), semantic); + + file().visit_buffer([&](auto&& buffer) { + lexy_ext::diagnostic_writer impl { buffer }; + log_with_impl(impl, semantic, kind, fmt, std::forward<Args>(args)...); + }); + + if (kind == DiagnosticKind::error) + _errored = true; + if (kind == DiagnosticKind::warning) + _warned = true; + + return result; + } + + const auto& file() const { + return *_file; + } + + private: + const file_type* _file; + }; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/File.cpp b/src/openvic-dataloader/File.cpp index 9b27bf0..e4d3773 100644 --- a/src/openvic-dataloader/File.cpp +++ b/src/openvic-dataloader/File.cpp @@ -1,4 +1,10 @@ -#include <openvic-dataloader/File.hpp> +#include "File.hpp" + +#include <cstring> + +#include <openvic-dataloader/detail/Utility.hpp> + +#include <lexy/encoding.hpp> using namespace ovdl; @@ -6,4 +12,8 @@ File::File(const char* path) : _path(path) {} const char* File::path() const noexcept { return _path; +} + +bool File::is_valid() const noexcept { + return _buffer.index() != 0 && !_buffer.valueless_by_exception() && visit_buffer([](auto&& buffer) { return buffer.data() != nullptr; }); }
\ No newline at end of file diff --git a/src/openvic-dataloader/File.hpp b/src/openvic-dataloader/File.hpp new file mode 100644 index 0000000..90fcb11 --- /dev/null +++ b/src/openvic-dataloader/File.hpp @@ -0,0 +1,139 @@ +#pragma once + +#include <cassert> +#include <concepts> // IWYU pragma: keep +#include <type_traits> +#include <variant> + +#include <openvic-dataloader/NodeLocation.hpp> +#include <openvic-dataloader/detail/Utility.hpp> + +#include <lexy/encoding.hpp> +#include <lexy/input/buffer.hpp> + +#include <dryad/node_map.hpp> + +namespace ovdl { + struct File { + using buffer_ids = detail::TypeRegister< + lexy::buffer<lexy::default_encoding, void>, + lexy::buffer<lexy::utf8_char_encoding, void>, + lexy::buffer<lexy::utf8_encoding, void>, + lexy::buffer<lexy::utf16_encoding, void>, + lexy::buffer<lexy::utf32_encoding, void>, + lexy::buffer<lexy::byte_encoding, void>>; + + explicit File(const char* path); + + const char* path() const noexcept; + + bool is_valid() const noexcept; + + template<typename Encoding, typename MemoryResource = void> + constexpr bool is_buffer() const { + return buffer_ids::type_id<lexy::buffer<Encoding, MemoryResource>>() + 1 == _buffer.index(); + } + + template<typename Encoding, typename MemoryResource = void> + lexy::buffer<Encoding, MemoryResource>* try_get_buffer_as() { + return std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer); + } + + template<typename Encoding, typename MemoryResource = void> + const lexy::buffer<Encoding, MemoryResource>* try_get_buffer_as() const { + return std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer); + } + + template<typename Encoding, typename MemoryResource = void> + lexy::buffer<Encoding, MemoryResource>& get_buffer_as() { + assert((is_buffer<Encoding, MemoryResource>())); + return *std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer); + } + + template<typename Encoding, typename MemoryResource = void> + const lexy::buffer<Encoding, MemoryResource>& 
get_buffer_as() const { + assert((is_buffer<Encoding, MemoryResource>())); + return *std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer); + } + +#define SWITCH_LIST \ + X(1) \ + X(2) \ + X(3) \ + X(4) \ + X(5) \ + X(6) + +#define X(NUM) \ + case NUM: \ + return visitor(std::get<NUM>(_buffer)); + + template<typename Visitor> + decltype(auto) visit_buffer(Visitor&& visitor) { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } + + template<typename Return, typename Visitor> + Return visit_buffer(Visitor&& visitor) { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } + + template<typename Visitor> + decltype(auto) visit_buffer(Visitor&& visitor) const { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } + + template<typename Return, typename Visitor> + Return visit_buffer(Visitor&& visitor) const { + switch (_buffer.index()) { + SWITCH_LIST + default: ovdl::detail::unreachable(); + } + } +#undef X +#undef SWITCH_LIST + + protected: + const char* _path; + detail::type_prepend_t<buffer_ids::variant_type, std::monostate> _buffer; + }; + + template<typename NodeT> + struct BasicFile : File { + using node_type = NodeT; + + template<typename Encoding, typename MemoryResource = void> + explicit BasicFile(const char* path, lexy::buffer<Encoding, MemoryResource>&& buffer) + : File(path) { + _buffer = static_cast<std::remove_reference_t<decltype(buffer)>&&>(buffer); + } + + template<typename Encoding, typename MemoryResource = void> + explicit BasicFile(lexy::buffer<Encoding, MemoryResource>&& buffer) + : File("") { + _buffer = static_cast<std::remove_reference_t<decltype(buffer)>&&>(buffer); + } + + void set_location(const node_type* n, NodeLocation loc) { + _map.insert(n, loc); + } + + NodeLocation location_of(const node_type* n) const { + auto result = _map.lookup(n); + DRYAD_ASSERT(result != nullptr, "every Node should have a NodeLocation"); + 
return *result; + } + + protected: + dryad::node_map<const node_type, NodeLocation> _map; + }; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/NodeLocation.cpp b/src/openvic-dataloader/NodeLocation.cpp deleted file mode 100644 index 9e4f669..0000000 --- a/src/openvic-dataloader/NodeLocation.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include <openvic-dataloader/NodeLocation.hpp> - -using namespace ovdl; - -NodeLocation::NodeLocation() = default; -NodeLocation::NodeLocation(const char* pos) : _begin(pos), - _end(pos) {} -NodeLocation::NodeLocation(const char* begin, const char* end) : _begin(begin), - _end(end) {} - -NodeLocation::NodeLocation(const NodeLocation&) noexcept = default; -NodeLocation& NodeLocation::operator=(const NodeLocation&) = default; - -NodeLocation::NodeLocation(NodeLocation&&) = default; -NodeLocation& NodeLocation::operator=(NodeLocation&&) = default; - -const char* NodeLocation::begin() const { return _begin; } -const char* NodeLocation::end() const { return _end; } - -bool NodeLocation::is_synthesized() const { return _begin == nullptr && _end == nullptr; } - -NodeLocation NodeLocation::make_from(const char* begin, const char* end) { - end++; - if (begin >= end) return NodeLocation(begin); - return NodeLocation(begin, end); -} diff --git a/src/openvic-dataloader/ParseState.hpp b/src/openvic-dataloader/ParseState.hpp new file mode 100644 index 0000000..806829c --- /dev/null +++ b/src/openvic-dataloader/ParseState.hpp @@ -0,0 +1,105 @@ +#pragma once + +#include <utility> + +#include <openvic-dataloader/detail/Encoding.hpp> + +#include <lexy/encoding.hpp> +#include <lexy/input/buffer.hpp> + +#include <dryad/tree.hpp> + +#include "DiagnosticLogger.hpp" +#include "detail/InternalConcepts.hpp" + +namespace ovdl { + struct BasicParseState { + explicit BasicParseState(detail::Encoding encoding = detail::Encoding::Unknown) : _encoding(encoding) {} + + detail::Encoding encoding() const { + return _encoding; + } + + protected: + detail::Encoding _encoding; + }; + + template<detail::IsAst AstT> + struct ParseState : BasicParseState { + 
using ast_type = AstT; + using file_type = typename ast_type::file_type; + using diagnostic_logger_type = BasicDiagnosticLogger<ParseState>; + + ParseState(typename ast_type::file_type&& file, detail::Encoding encoding) + : _ast { std::move(file) }, + _logger { this->ast().file() }, + BasicParseState(encoding) {} + + template<typename Encoding, typename MemoryResource = void> + ParseState(lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding) + : ParseState(typename ast_type::file_type { std::move(buffer) }, encoding) {} + + template<typename Encoding, typename MemoryResource = void> + ParseState(const char* path, lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding) + : ParseState(typename ast_type::file_type { path, std::move(buffer) }, encoding) {} + + ast_type& ast() { + return _ast; + } + + const ast_type& ast() const { + return _ast; + } + + diagnostic_logger_type& logger() { + return _logger; + } + + const diagnostic_logger_type& logger() const { + return _logger; + } + + private: + ast_type _ast; + diagnostic_logger_type _logger; + }; + + template<detail::IsFile FileT> + struct FileParseState : BasicParseState { + using file_type = FileT; + using diagnostic_logger_type = BasicDiagnosticLogger<FileParseState>; + + FileParseState(file_type&& file, detail::Encoding encoding) + : _file { std::move(file) }, + _logger { this->file() }, + BasicParseState(encoding) {} + + template<typename Encoding, typename MemoryResource = void> + FileParseState(lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding) + : FileParseState(file_type { std::move(buffer) }, encoding) {} + + template<typename Encoding, typename MemoryResource = void> + FileParseState(const char* path, lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding) + : FileParseState(file_type { path, std::move(buffer) }, encoding) {} + + file_type& file() { + return _file; + } + + const file_type& file() const { + return _file; + 
} + + diagnostic_logger_type& logger() { + return _logger; + } + + const diagnostic_logger_type& logger() const { + return _logger; + } + + private: + file_type _file; + diagnostic_logger_type _logger; + }; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp index 5451f26..19aee54 100644 --- a/src/openvic-dataloader/csv/CsvGrammar.hpp +++ b/src/openvic-dataloader/csv/CsvGrammar.hpp @@ -9,22 +9,20 @@ #include <openvic-dataloader/csv/LineObject.hpp> #include <openvic-dataloader/csv/Parser.hpp> +#include <lexy/_detail/config.hpp> #include <lexy/callback.hpp> +#include <lexy/callback/string.hpp> #include <lexy/dsl.hpp> +#include <lexy/dsl/ascii.hpp> +#include <lexy/dsl/option.hpp> +#include <lexy/encoding.hpp> +#include "detail/Convert.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" // Grammar Definitions // namespace ovdl::csv::grammar { - using EncodingType = ovdl::csv::EncodingType; - - template<typename T> - concept ParseChars = requires() { - { T::character }; - { T::control }; - }; - - template<ParseChars T> struct ParseOptions { /// @brief Seperator character char SepChar; @@ -33,12 +31,34 @@ namespace ovdl::csv::grammar { /// @brief Paradox-style localization escape characters /// @note Is ignored if SupportStrings is true char EscapeChar; + }; - static constexpr auto parse_chars = T {}; - static constexpr auto character = parse_chars.character; - static constexpr auto control = parse_chars.control; + struct ConvertErrorHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) { + state.logger().warning("invalid character value '{}' found", static_cast<int>(reader.peek())) // + .primary(BasicNodeLocation { reader.position() }, "here") + .finish(); + } }; + constexpr bool IsUtf8(auto encoding) { + return std::same_as<std::decay_t<decltype(encoding)>, lexy::utf8_char_encoding>; + } + + template<ParseOptions Options, typename String> + constexpr auto convert_as_string = convert::convert_as_string< + String, + ConvertErrorHandler>; + + constexpr auto ansi_character = lexy::dsl::ascii::character / dsl::lit_b_range<0x80, 0xFF>; + 
constexpr auto ansi_control = + lexy::dsl::ascii::control / + lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / + lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>; + + constexpr auto utf_character = lexy::dsl::unicode::character; + constexpr auto utf_control = lexy::dsl::unicode::control; + constexpr auto escaped_symbols = lexy::symbol_table<char> // .map<'"'>('"') .map<'\''>('\'') @@ -55,38 +75,95 @@ namespace ovdl::csv::grammar { template<ParseOptions Options> struct CsvGrammar { - struct StringValue { - static constexpr auto rule = [] { - // Arbitrary code points - auto c = Options.character - Options.control; + struct StringValue : lexy::scan_production<std::string>, + lexy::token_production { + + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsFileParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + // Arbitrary code points + auto c = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + return ansi_character - ansi_control; + } else { + return utf_character - utf_control; + } + }(); - auto back_escape = lexy::dsl::backslash_escape // - .symbol<escaped_symbols>(); + auto back_escape = lexy::dsl::backslash_escape // + .symbol<escaped_symbols>(); - auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // - .template symbol<escaped_quote>(); + auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // + .template symbol<escaped_quote>(); - return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); - }(); + return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); + }(); + + lexy::scan_result<std::string> str_result = scanner.template 
parse<std::string>(rule); + if (!scanner || !str_result) + return lexy::scan_failed; + return str_result.value(); + } - static constexpr auto value = lexy::as_string<std::string>; + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::lit_c<'"'>) >> lexy::dsl::scan; + + static constexpr auto value = convert_as_string<Options, std::string> >> lexy::forward<std::string>; }; - struct PlainValue { - static constexpr auto rule = [] { + struct PlainValue : lexy::scan_production<std::string>, + lexy::token_production { + + template<auto character> + static constexpr auto _escape_check = character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline); + + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsFileParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + constexpr auto character = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + return ansi_character; + } else { + return utf_character; + } + }(); + + if constexpr (Options.SupportStrings) { + return lexy::dsl::identifier(character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline)); + } else { + auto escape_check_char = _escape_check<character>; + auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; + auto id_segment = lexy::dsl::identifier(id_check_char); + auto escape_segement = lexy::dsl::token(escape_check_char); + auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement); + auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; + return lexy::dsl::list(id_segment | escape_rule); + } + }(); + if constexpr (Options.SupportStrings) { - return lexy::dsl::identifier(Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline)); + auto lexeme_result = scanner.template parse<lexy::lexeme<Reader>>(rule); + if (!scanner || !lexeme_result) 
+ return lexy::scan_failed; + return std::string { lexeme_result.value().begin(), lexeme_result.value().end() }; } else { - auto escape_check_char = Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline); - auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>; - auto id_segment = lexy::dsl::identifier(id_check_char); - auto escape_segement = lexy::dsl::token(escape_check_char); - auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement); - auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym; - return lexy::dsl::list(id_segment | escape_rule); + lexy::scan_result<std::string> str_result = scanner.template parse<std::string>(rule); + if (!scanner || !str_result) + return lexy::scan_failed; + return str_result.value(); } - }(); - static constexpr auto value = lexy::as_string<std::string>; + } + + static constexpr auto rule = + dsl::peek( + _escape_check<ansi_character>, + _escape_check<utf_character>) >> + lexy::dsl::scan; + + static constexpr auto value = convert_as_string<Options, std::string> >> lexy::forward<std::string>; }; struct Value { @@ -114,17 +191,17 @@ namespace ovdl::csv::grammar { static constexpr auto rule = lexy::dsl::list(lexy::dsl::p<Value>, lexy::dsl::trailing_sep(lexy::dsl::p<Seperator>)); static constexpr auto value = lexy::fold_inplace<ovdl::csv::LineObject>( std::initializer_list<ovdl::csv::LineObject::value_type> {}, - [](ovdl::csv::LineObject& result, auto&& arg) { - if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, std::size_t>) { - // Count seperators, adds to previous value, making it a position - using position_type = ovdl::csv::LineObject::position_type; - result.emplace_back(static_cast<position_type>(arg + result.back().first), ""); + [](ovdl::csv::LineObject& result, std::size_t&& arg) { + // Count seperators, adds to previous value, making it a position + using position_type = ovdl::csv::LineObject::position_type; + result.emplace_back(static_cast<position_type>(arg + 
result.back().first), ""); + }, + [](ovdl::csv::LineObject& result, std::string&& arg) { + if (result.empty()) { + result.emplace_back(0u, LEXY_MOV(arg)); } else { - if (result.empty()) result.emplace_back(0u, LEXY_MOV(arg)); - else { - auto& [pos, value] = result.back(); - value = arg; - } + auto& [pos, value] = result.back(); + value = LEXY_MOV(arg); } }); }; @@ -169,74 +246,17 @@ namespace ovdl::csv::grammar { static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>; }; - template<ParseChars T> - using CommaFile = File<ParseOptions<T> { ',', false, '$' }>; - template<ParseChars T> - using ColonFile = File<ParseOptions<T> { ':', false, '$' }>; - template<ParseChars T> - using SemiColonFile = File<ParseOptions<T> { ';', false, '$' }>; - template<ParseChars T> - using TabFile = File<ParseOptions<T> { '\t', false, '$' }>; - template<ParseChars T> - using BarFile = File<ParseOptions<T> { '|', false, '$' }>; - - namespace strings { - template<ParseChars T> - using CommaFile = File<ParseOptions<T> { ',', true, '$' }>; - template<ParseChars T> - using ColonFile = File<ParseOptions<T> { ':', true, '$' }>; - template<ParseChars T> - using SemiColonFile = File<ParseOptions<T> { ';', true, '$' }>; - template<ParseChars T> - using TabFile = File<ParseOptions<T> { '\t', true, '$' }>; - template<ParseChars T> - using BarFile = File<ParseOptions<T> { '|', true, '$' }>; - } -} - -namespace ovdl::csv::grammar::windows1252 { - struct windows1252_t { - static constexpr auto character = dsl::make_range<0x01, 0xFF>(); - static constexpr auto control = - lexy::dsl::ascii::control / - lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / - lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>; - }; - - using CommaFile = CommaFile<windows1252_t>; - using ColonFile = ColonFile<windows1252_t>; - using SemiColonFile = SemiColonFile<windows1252_t>; - using TabFile = TabFile<windows1252_t>; - using BarFile = BarFile<windows1252_t>; - - namespace strings 
{ - using CommaFile = grammar::strings::CommaFile<windows1252_t>; - using ColonFile = grammar::strings::ColonFile<windows1252_t>; - using SemiColonFile = grammar::strings::SemiColonFile<windows1252_t>; - using TabFile = grammar::strings::TabFile<windows1252_t>; - using BarFile = grammar::strings::BarFile<windows1252_t>; - - } -} - -namespace ovdl::csv::grammar::utf8 { - struct unicode_t { - static constexpr auto character = lexy::dsl::unicode::character; - static constexpr auto control = lexy::dsl::unicode::control; - }; - - using CommaFile = CommaFile<unicode_t>; - using ColonFile = ColonFile<unicode_t>; - using SemiColonFile = SemiColonFile<unicode_t>; - using TabFile = TabFile<unicode_t>; - using BarFile = BarFile<unicode_t>; + using CommaFile = File<ParseOptions { ',', false, '$' }>; + using ColonFile = File<ParseOptions { ':', false, '$' }>; + using SemiColonFile = File<ParseOptions { ';', false, '$' }>; + using TabFile = File<ParseOptions { '\t', false, '$' }>; + using BarFile = File<ParseOptions { '|', false, '$' }>; namespace strings { - using CommaFile = grammar::strings::CommaFile<unicode_t>; - using ColonFile = grammar::strings::ColonFile<unicode_t>; - using SemiColonFile = grammar::strings::SemiColonFile<unicode_t>; - using TabFile = grammar::strings::TabFile<unicode_t>; - using BarFile = grammar::strings::BarFile<unicode_t>; - + using CommaFile = File<ParseOptions { ',', true, '$' }>; + using ColonFile = File<ParseOptions { ':', true, '$' }>; + using SemiColonFile = File<ParseOptions { ';', true, '$' }>; + using TabFile = File<ParseOptions { '\t', true, '$' }>; + using BarFile = File<ParseOptions { '|', true, '$' }>; } }
\ No newline at end of file diff --git a/src/openvic-dataloader/csv/CsvParseState.hpp b/src/openvic-dataloader/csv/CsvParseState.hpp index 2390453..ee60c34 100644 --- a/src/openvic-dataloader/csv/CsvParseState.hpp +++ b/src/openvic-dataloader/csv/CsvParseState.hpp @@ -1,28 +1,16 @@ #pragma once -#include <openvic-dataloader/File.hpp> -#include <openvic-dataloader/ParseState.hpp> #include <openvic-dataloader/csv/LineObject.hpp> #include <openvic-dataloader/csv/Parser.hpp> #include <lexy/encoding.hpp> -template<ovdl::csv::EncodingType> -struct LexyEncodingFrom { -}; +#include "File.hpp" +#include "ParseState.hpp" +#include "detail/InternalConcepts.hpp" -template<> -struct LexyEncodingFrom<ovdl::csv::EncodingType::Windows1252> { - using encoding = lexy::default_encoding; -}; +namespace ovdl::csv { + using CsvParseState = ovdl::FileParseState<ovdl::BasicFile<std::vector<ovdl::csv::LineObject>>>; -template<> -struct LexyEncodingFrom<ovdl::csv::EncodingType::Utf8> { - using encoding = lexy::utf8_char_encoding; -}; - -template<ovdl::csv::EncodingType Encoding> -using CsvFile = ovdl::BasicFile<typename LexyEncodingFrom<Encoding>::encoding, std::vector<ovdl::csv::LineObject>>; - -template<ovdl::csv::EncodingType Encoding> -using CsvParseState = ovdl::FileParseState<CsvFile<Encoding>>;
\ No newline at end of file + static_assert(detail::IsFileParseState<CsvParseState>, "CsvParseState failed IsFileParseState concept"); +}
\ No newline at end of file diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp index 361f6ad..5dbee32 100644 --- a/src/openvic-dataloader/csv/Parser.cpp +++ b/src/openvic-dataloader/csv/Parser.cpp @@ -1,11 +1,14 @@ +#include <iostream> +#include <optional> +#include <type_traits> #include <vector> -#include <openvic-dataloader/File.hpp> +#include <openvic-dataloader/NodeLocation.hpp> #include <openvic-dataloader/csv/LineObject.hpp> #include <openvic-dataloader/csv/Parser.hpp> -#include <openvic-dataloader/detail/LexyReportError.hpp> +#include <openvic-dataloader/detail/Encoding.hpp> #include <openvic-dataloader/detail/OStreamOutputIterator.hpp> -#include <openvic-dataloader/detail/utility/Utility.hpp> +#include <openvic-dataloader/detail/Utility.hpp> #include <lexy/action/parse.hpp> #include <lexy/encoding.hpp> @@ -22,15 +25,27 @@ using namespace ovdl::csv; /// ParseHandler /// -template<EncodingType Encoding> -struct Parser<Encoding>::ParseHandler final : detail::BasicFileParseHandler<CsvParseState<Encoding>> { +struct Parser::ParseHandler final : detail::BasicFileParseHandler<CsvParseState> { template<typename Node> std::optional<DiagnosticLogger::error_range> parse() { - auto result = lexy::parse<Node>(this->buffer(), *this->_parse_state, this->_parse_state->logger().error_callback()); + auto result = [&] { + switch (parse_state().encoding()) { + using enum detail::Encoding; + case Ascii: + case Utf8: + return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback()); + case Unknown: + case Windows1251: + case Windows1252: + return lexy::parse<Node>(buffer<lexy::default_encoding>(), parse_state(), parse_state().logger().error_callback()); + default: + ovdl::detail::unreachable(); + } + }(); if (!result) { - return this->_parse_state->logger().get_errors(); + return this->parse_state().logger().get_errors(); } - _lines = std::move(result.value()); + _lines = 
LEXY_MOV(result).value(); return std::nullopt; } @@ -42,55 +57,45 @@ private: std::vector<csv::LineObject> _lines; }; -/// BufferHandler /// +/// ParserHandler /// -template<EncodingType Encoding> -Parser<Encoding>::Parser() +Parser::Parser() : _parse_handler(std::make_unique<ParseHandler>()) { set_error_log_to_null(); } -template<EncodingType Encoding> -Parser<Encoding>::Parser(std::basic_ostream<char>& error_stream) +Parser::Parser(std::basic_ostream<char>& error_stream) : _parse_handler(std::make_unique<ParseHandler>()) { set_error_log_to(error_stream); } -template<EncodingType Encoding> -Parser<Encoding>::Parser(Parser&&) = default; -template<EncodingType Encoding> -Parser<Encoding>& Parser<Encoding>::operator=(Parser&&) = default; -template<EncodingType Encoding> -Parser<Encoding>::~Parser() = default; +Parser::Parser(Parser&&) = default; +Parser& Parser::operator=(Parser&&) = default; +Parser::~Parser() = default; -template<EncodingType Encoding> -Parser<Encoding> Parser<Encoding>::from_buffer(const char* data, std::size_t size) { +Parser Parser::from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(data, size)); + return std::move(result.load_from_buffer(data, size, encoding_fallback)); } -template<EncodingType Encoding> -Parser<Encoding> Parser<Encoding>::from_buffer(const char* start, const char* end) { +Parser Parser::from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(start, end)); + return std::move(result.load_from_buffer(start, end, encoding_fallback)); } -template<EncodingType Encoding> -Parser<Encoding> Parser<Encoding>::from_string(const std::string_view string) { +Parser Parser::from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_string(string)); 
+ return std::move(result.load_from_string(string, encoding_fallback)); } -template<EncodingType Encoding> -Parser<Encoding> Parser<Encoding>::from_file(const char* path) { +Parser Parser::from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } -template<EncodingType Encoding> -Parser<Encoding> Parser<Encoding>::from_file(const std::filesystem::path& path) { +Parser Parser::from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } /// @@ -106,9 +111,8 @@ Parser<Encoding> Parser<Encoding>::from_file(const std::filesystem::path& path) /// @param func /// @param args /// -template<EncodingType Encoding> template<typename... Args> -constexpr void Parser<Encoding>::_run_load_func(detail::LoadCallback<ParseHandler, Args...> auto func, Args... args) { +constexpr void Parser::_run_load_func(detail::LoadCallback<ParseHandler, Args...> auto func, Args... args) { _has_fatal_error = false; auto error = func(_parse_handler.get(), std::forward<Args>(args)...); auto error_message = _parse_handler->make_error_from(error); @@ -122,82 +126,66 @@ constexpr void Parser<Encoding>::_run_load_func(detail::LoadCallback<ParseHandle } } -template<EncodingType Encoding> -constexpr Parser<Encoding>& Parser<Encoding>::load_from_buffer(const char* data, std::size_t size) { +constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) { // Type can't be deduced? 
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback); return *this; } -template<EncodingType Encoding> -constexpr Parser<Encoding>& Parser<Encoding>::load_from_buffer(const char* start, const char* end) { +constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) { // Type can't be deduced? - _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback); return *this; } -template<EncodingType Encoding> -constexpr Parser<Encoding>& Parser<Encoding>::load_from_string(const std::string_view string) { - return load_from_buffer(string.data(), string.size()); +constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) { + return load_from_buffer(string.data(), string.size(), encoding_fallback); } -template<EncodingType Encoding> -Parser<Encoding>& Parser<Encoding>::load_from_file(const char* path) { +Parser& Parser::load_from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) { set_file_path(path); // Type can be deduced?? 
- _run_load_func(std::mem_fn(&ParseHandler::load_file), path); + _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback); return *this; } -template<EncodingType Encoding> -Parser<Encoding>& Parser<Encoding>::load_from_file(const std::filesystem::path& path) { - return load_from_file(path.string().c_str()); +Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) { + return load_from_file(path.string().c_str(), encoding_fallback); } -template<EncodingType Encoding> -bool Parser<Encoding>::parse_csv(bool handle_strings) { +bool Parser::parse_csv(bool handle_strings) { if (!_parse_handler->is_valid()) { return false; } - std::optional<Parser<Encoding>::error_range> errors; - // auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream }); - if constexpr (Encoding == EncodingType::Windows1252) { + std::optional<Parser::error_range> errors = [&] { if (handle_strings) - errors = _parse_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>(); + return _parse_handler->template parse<csv::grammar::strings::SemiColonFile>(); else - errors = _parse_handler->template parse<csv::grammar::windows1252::SemiColonFile>(); - } else { - if (handle_strings) - errors = _parse_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>(); - else - errors = _parse_handler->template parse<csv::grammar::utf8::SemiColonFile>(); - } + return _parse_handler->template parse<csv::grammar::SemiColonFile>(); + }(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!errors->empty()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); } return false; } - _lines = std::move(_parse_handler->get_lines()); return true; } -template<EncodingType Encoding> -const 
std::vector<csv::LineObject>& Parser<Encoding>::get_lines() const { - return _lines; +const std::vector<csv::LineObject>& Parser::get_lines() const { + return _parse_handler->get_lines(); } -template<EncodingType Encoding> -typename Parser<Encoding>::error_range Parser<Encoding>::get_errors() const { +typename Parser::error_range Parser::get_errors() const { return _parse_handler->parse_state().logger().get_errors(); } -template<EncodingType Encoding> -const FilePosition Parser<Encoding>::get_error_position(const error::Error* error) const { +const FilePosition Parser::get_error_position(const error::Error* error) const { if (!error || !error->is_linked_in_tree()) { return {}; } @@ -206,18 +194,27 @@ const FilePosition Parser<Encoding>::get_error_position(const error::Error* erro return {}; } - auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin()); - FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; - if (err_location.begin() < err_location.end()) { - auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor()); - result.end_line = loc_end.line_nr(); - result.end_column = loc_end.column_nr(); - } - return result; +// TODO: Remove reinterpret_cast +// WARNING: This almost certainly breaks on utf16 and utf32 encodings, luckily we don't parse in that format +// This is purely to silence the node_location errors because char8_t is useless +#define REINTERPRET_IT(IT) reinterpret_cast<const std::decay_t<decltype(buffer)>::encoding::char_type*>((IT)) + + return _parse_handler->parse_state().file().visit_buffer( + [&](auto&& buffer) -> FilePosition { + auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin())); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; + if (err_location.begin() < err_location.end()) { + auto loc_end = 
lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor()); + result.end_line = loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; + }); + +#undef REINTERPRET_IT } -template<EncodingType Encoding> -void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const { +void Parser::print_errors_to(std::basic_ostream<char>& stream) const { auto errors = get_errors(); if (errors.empty()) return; for (const auto error : errors) { @@ -226,19 +223,9 @@ void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const { [&](const error::BufferError* buffer_error) { stream << "buffer error: " << buffer_error->message() << '\n'; }, - [&](const error::ParseError* parse_error) { - auto position = get_error_position(parse_error); - std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n'; - }, - [&](dryad::child_visitor<error::ErrorKind> visitor, const error::Semantic* semantic) { - auto position = get_error_position(semantic); - std::string pos_str = ": "; - if (!position.is_empty()) { - pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - } - stream << _file_path << pos_str << semantic->message() << '\n'; - auto annotations = semantic->annotations(); + [&](dryad::child_visitor<error::ErrorKind> visitor, const error::AnnotatedError* annotated_error) { + stream << annotated_error->message() << '\n'; + auto annotations = annotated_error->annotations(); for (auto annotation : annotations) { visitor(annotation); } @@ -250,7 +237,4 @@ void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const { stream << secondary->message() << '\n'; }); } -} - -template class ovdl::csv::Parser<EncodingType::Windows1252>; -template class ovdl::csv::Parser<EncodingType::Utf8>;
\ No newline at end of file +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp new file mode 100644 index 0000000..5d9fca0 --- /dev/null +++ b/src/openvic-dataloader/detail/Convert.hpp @@ -0,0 +1,577 @@ +#pragma once + +#include <cstddef> +#include <string_view> +#include <type_traits> + +#include <lexy/_detail/config.hpp> +#include <lexy/callback/string.hpp> +#include <lexy/code_point.hpp> +#include <lexy/dsl/option.hpp> +#include <lexy/dsl/symbol.hpp> +#include <lexy/encoding.hpp> +#include <lexy/input/base.hpp> +#include <lexy/input/file.hpp> +#include <lexy/input/string_input.hpp> +#include <lexy/lexeme.hpp> + +#include "openvic-dataloader/detail/Encoding.hpp" + +#include "ParseState.hpp" // IWYU pragma: keep +#include "detail/InternalConcepts.hpp" +#include "detail/dsl.hpp" +#include "v2script/ParseState.hpp" + +namespace ovdl::convert { + struct MappedChar { + char value; + std::string_view utf8; + + constexpr bool is_invalid() const { return value == 0; } + constexpr bool is_pass() const { return value == 1; } + }; + constexpr MappedChar invalid_map { 0, "" }; + constexpr MappedChar pass_map { 1, "" }; + + struct map_value { + std::string_view _value; + + constexpr map_value() noexcept : _value("") {} + constexpr map_value(std::nullptr_t) noexcept : _value("\0") {} + constexpr explicit map_value(std::string_view val) noexcept : _value(val) {} + + constexpr bool is_invalid() const { + return !_value.empty() && _value[0] == '\0'; + } + + constexpr bool is_pass() const { + return _value.empty(); + } + + constexpr bool is_valid() const noexcept { + return !_value.empty() && _value[0] != '\0'; + } + + constexpr explicit operator bool() const noexcept { + return is_valid(); + } + }; + + template<typename T> + concept IsConverter = requires(unsigned char c, lexy::_pr<lexy::deduce_encoding<char>>& reader) { + { T::try_parse(reader) } -> std::same_as<map_value>; + }; + + struct Utf8 { + static constexpr auto map = 
lexy::symbol_table<std::string_view>; + + template<typename Reader> + static constexpr map_value try_parse(Reader& reader) { + return {}; + } + }; + static_assert(IsConverter<Utf8>); + + struct Windows1252 { + static constexpr auto map = lexy::symbol_table<std::string_view> // + .map<'\x80'>("€") + .map<'\x82'>("‚") + .map<'\x83'>("ƒ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("ˆ") + .map<'\x89'>("‰") + .map<'\x8A'>("Š") + .map<'\x8B'>("‹") + .map<'\x8C'>("Œ") + .map<'\x8E'>("Ž") + + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x98'>("˜") + .map<'\x99'>("™") + .map<'\x9A'>("š") + .map<'\x9B'>("›") + .map<'\x9C'>("œ") + .map<'\x9E'>("ž") + .map<'\x9F'>("Ÿ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("¡") + .map<'\xA2'>("¢") + .map<'\xA3'>("£") + .map<'\xA4'>("¤") + .map<'\xA5'>("¥") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("¨") + .map<'\xA9'>("©") + .map<'\xAA'>("ª") + .map<'\xAB'>("«") + .map<'\xAC'>("¬") + .map<'\xAD'>("") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("¯") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("²") + .map<'\xB3'>("³") + .map<'\xB4'>("´") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("¸") + .map<'\xB9'>("¹") + .map<'\xBA'>("º") + .map<'\xBB'>("»") + .map<'\xBC'>("¼") + .map<'\xBD'>("½") + .map<'\xBE'>("¾") + .map<'\xBF'>("¿") + + .map<'\xC0'>("À") + .map<'\xC1'>("Á") + .map<'\xC2'>("Â") + .map<'\xC3'>("Ã") + .map<'\xC4'>("Ä") + .map<'\xC5'>("Å") + .map<'\xC6'>("Æ") + .map<'\xC7'>("Ç") + .map<'\xC8'>("È") + .map<'\xC9'>("É") + .map<'\xCA'>("Ê") + .map<'\xCB'>("Ë") + .map<'\xCC'>("Ì") + .map<'\xCD'>("Í") + .map<'\xCE'>("Î") + .map<'\xCF'>("Ï") + + .map<'\xD0'>("Ð") + .map<'\xD1'>("Ñ") + .map<'\xD2'>("Ò") + .map<'\xD3'>("Ó") + .map<'\xD4'>("Ô") + .map<'\xD5'>("Õ") + .map<'\xD6'>("Ö") + .map<'\xD7'>("×") + .map<'\xD8'>("Ø") + 
.map<'\xD9'>("Ù") + .map<'\xDA'>("Ú") + .map<'\xDB'>("Û") + .map<'\xDC'>("Ü") + .map<'\xDD'>("Ý") + .map<'\xDE'>("Þ") + .map<'\xDF'>("ß") + + .map<'\xE0'>("à") + .map<'\xE1'>("á") + .map<'\xE2'>("â") + .map<'\xE3'>("ã") + .map<'\xE4'>("ä") + .map<'\xE5'>("å") + .map<'\xE6'>("æ") + .map<'\xE7'>("ç") + .map<'\xE8'>("è") + .map<'\xE9'>("é") + .map<'\xEA'>("ê") + .map<'\xEB'>("ë") + .map<'\xEC'>("ì") + .map<'\xED'>("í") + .map<'\xEE'>("î") + .map<'\xEF'>("ï") + + .map<'\xF0'>("ð") + .map<'\xF1'>("ñ") + .map<'\xF2'>("ò") + .map<'\xF3'>("ó") + .map<'\xF4'>("ô") + .map<'\xF5'>("õ") + .map<'\xF6'>("ö") + .map<'\xF7'>("÷") + .map<'\xF8'>("ø") + .map<'\xF9'>("ù") + .map<'\xFA'>("ú") + .map<'\xFB'>("û") + .map<'\xFC'>("ü") + .map<'\xFD'>("ý") + .map<'\xFE'>("þ") + .map<'\xFF'>("ÿ"); + + template<typename Reader> + static constexpr map_value try_parse(Reader& reader) { + auto index = map.try_parse(reader); + if (index) { + return map_value(map[index]); + } + return {}; + } + }; + static_assert(IsConverter<Windows1252>); + + struct Windows1251 { + static constexpr auto map = lexy::symbol_table<std::string_view> // + .map<'\x80'>("Ђ") + .map<'\x81'>("Ѓ") + .map<'\x82'>("‚") + .map<'\x83'>("ѓ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("€") + .map<'\x89'>("‰") + .map<'\x8A'>("Љ") + .map<'\x8B'>("‹") + .map<'\x8C'>("Њ") + .map<'\x8D'>("Ќ") + .map<'\x8E'>("Ћ") + .map<'\x8F'>("Џ") + + .map<'\x90'>("ђ") + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x99'>("™") + .map<'\x9A'>("љ") + .map<'\x9B'>("›") + .map<'\x9C'>("њ") + .map<'\x9D'>("ќ") + .map<'\x9E'>("ћ") + .map<'\x9F'>("џ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("Ў") + .map<'\xA2'>("ў") + .map<'\xA3'>("Ј") + .map<'\xA4'>("¤") + .map<'\xA5'>("Ґ") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("Ё") + .map<'\xA9'>("©") + .map<'\xAA'>("Є") + .map<'\xAB'>("«") + 
.map<'\xAC'>("¬") + .map<'\xAD'>("") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("Ї") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("І") + .map<'\xB3'>("і") + .map<'\xB4'>("ґ") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("ё") + .map<'\xB9'>("№") + .map<'\xBA'>("є") + .map<'\xBB'>("»") + .map<'\xBC'>("ј") + .map<'\xBD'>("Ѕ") + .map<'\xBE'>("ѕ") + .map<'\xBF'>("ї") + + .map<'\xC0'>("А") + .map<'\xC1'>("Б") + .map<'\xC2'>("В") + .map<'\xC3'>("Г") + .map<'\xC4'>("Д") + .map<'\xC5'>("Е") + .map<'\xC6'>("Ж") + .map<'\xC7'>("З") + .map<'\xC8'>("И") + .map<'\xC9'>("Й") + .map<'\xCA'>("К") + .map<'\xCB'>("Л") + .map<'\xCC'>("М") + .map<'\xCD'>("Н") + .map<'\xCE'>("О") + .map<'\xCF'>("П") + + .map<'\xD0'>("Р") + .map<'\xD1'>("С") + .map<'\xD2'>("Т") + .map<'\xD3'>("У") + .map<'\xD4'>("Ф") + .map<'\xD5'>("Х") + .map<'\xD6'>("Ц") + .map<'\xD7'>("Ч") + .map<'\xD8'>("Ш") + .map<'\xD9'>("Щ") + .map<'\xDA'>("Ъ") + .map<'\xDB'>("Ы") + .map<'\xDC'>("Ь") + .map<'\xDD'>("Э") + .map<'\xDE'>("Ю") + .map<'\xDF'>("Я") + + .map<'\xE0'>("а") + .map<'\xE1'>("б") + .map<'\xE2'>("в") + .map<'\xE3'>("г") + .map<'\xE4'>("д") + .map<'\xE5'>("е") + .map<'\xE6'>("ж") + .map<'\xE7'>("з") + .map<'\xE8'>("и") + .map<'\xE9'>("й") + .map<'\xEA'>("к") + .map<'\xEB'>("л") + .map<'\xEC'>("м") + .map<'\xED'>("н") + .map<'\xEE'>("о") + .map<'\xEF'>("п") + + .map<'\xF0'>("р") + .map<'\xF1'>("с") + .map<'\xF2'>("т") + .map<'\xF3'>("у") + .map<'\xF4'>("ф") + .map<'\xF5'>("х") + .map<'\xF6'>("ц") + .map<'\xF7'>("ч") + .map<'\xF8'>("ш") + .map<'\xF9'>("щ") + .map<'\xFA'>("ъ") + .map<'\xFB'>("ы") + .map<'\xFC'>("ь") + .map<'\xFD'>("э") + .map<'\xFE'>("ю") + .map<'\xFF'>("я"); + + template<typename Reader> + static constexpr map_value try_parse(Reader& reader) { + auto index = map.try_parse(reader); + if (index) { + return map_value(map[index]); + } + return {}; + } + }; + static_assert(IsConverter<Windows1251>); + + template<typename Reader> + constexpr map_value 
try_parse_map(detail::Encoding&& encoding, Reader& reader) { + switch (encoding) { + case detail::Encoding::Unknown: + case detail::Encoding::Ascii: + case detail::Encoding::Utf8: return Utf8::try_parse(reader); + case detail::Encoding::Windows1251: return Windows1251::try_parse(reader); + case detail::Encoding::Windows1252: return Windows1252::try_parse(reader); + } + ovdl::detail::unreachable(); + } + + template<typename String> + using _string_char_type = LEXY_DECAY_DECLTYPE(LEXY_DECLVAL(String)[0]); + + template<typename T, typename CharT> + concept IsErrorHandler = + std::is_convertible_v<CharT, char> // + && requires(T t, ovdl::v2script::ast::ParseState& state, lexy::_pr<lexy::deduce_encoding<CharT>> reader) { + { T::on_invalid_character(state, reader) }; + }; + + struct EmptyHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) {} + }; + + template<typename String, + IsErrorHandler<_string_char_type<String>> Error = EmptyHandler> + constexpr auto convert_as_string = + dsl::sink<String>( + lexy::fold_inplace<String>( + std::initializer_list<_string_char_type<String>> {}, // + []<typename CharT, typename = decltype(LEXY_DECLVAL(String).push_back(CharT()))>(String& result, detail::IsStateType auto& state, CharT c) { + if constexpr (std::is_convertible_v<CharT, char>) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + map_value val = {}; + CharT char_array[] { c, CharT() }; + auto input = lexy::range_input(&char_array[0], &char_array[1]); + auto reader = input.reader(); + + // prefer preserving unknown conversion maps, least things will work, they'll just probably display wrong + // map = make_map_from(state.encoding(), c); + val = try_parse_map(state.encoding(), reader); + + // Invalid characters are dropped + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + return; + } + + // non-pass 
characters are not valid ascii and are mapped to utf8 values + if (!val.is_pass()) { + result.append(val._value); + return; + } + + break; + } + } + } + + result.push_back(c); // + }, // + [](String& result, detail::IsStateType auto& state, String&& str) { + if constexpr (std::is_convertible_v<typename String::value_type, char>) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + auto input = lexy::string_input(str); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(str.size()); }) { + result.reserve(str.size()); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + break; + } + } + } + + result.append(LEXY_MOV(str)); // + }, // + []<typename Str = String, typename Iterator>(String& result, detail::IsStateType auto& state, Iterator begin, Iterator end) // + -> decltype(void(LEXY_DECLVAL(Str).append(begin, end))) { + if constexpr (std::is_convertible_v<typename String::value_type, char>) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + auto input = lexy::range_input(begin, end); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(end - begin); }) { + 
result.reserve(end - begin); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + break; + } + } + } + + result.append(begin, end); // + }, // + []<typename Reader>(String& result, detail::IsStateType auto& state, lexy::lexeme<Reader> lex) { + using encoding = typename Reader::encoding; + using _char_type = _string_char_type<String>; + static_assert(lexy::char_type_compatible_with_reader<Reader, _char_type>, + "cannot convert lexeme to this string type"); + + if constexpr ((std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) && + std::convertible_to<typename String::value_type, char>) { + auto input = lexy::range_input(lex.begin(), lex.end()); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(lex.end() - lex.begin()); }) { + result.reserve(lex.end() - lex.begin()); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + } + + 
result.append(lex.begin(), lex.end()); // + })); +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp new file mode 100644 index 0000000..1516fc7 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.cpp @@ -0,0 +1,351 @@ +#include "detail/Detect.hpp" + +using namespace ovdl; +using namespace ovdl::encoding_detect; + +static constexpr int64_t INVALID_CLASS = 255; + +std::optional<int64_t> Utf8Canidate::read(const std::span<const cbyte>& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size()); + if (is_utf8(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional<int64_t> AsciiCanidate::read(const std::span<const cbyte>& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size()); + if (is_ascii(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional<int64_t> NonLatinCasedCanidate::read(const std::span<const cbyte>& buffer) { + static constexpr cbyte LATIN_LETTER = 1; + static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20; + static constexpr int64_t NON_LATIN_ALL_CAPS_PENALTY = -40; + static constexpr int64_t NON_LATIN_CAPITALIZATION_BONUS = 40; + static constexpr int64_t LATIN_ADJACENCY_PENALTY = -50; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_ascii == 0 && ascii; + const bool non_ascii_alphabetic = score_data.is_non_latin_alphabetic(caseless_class); + + if (caseless_class == LATIN_LETTER) { + case_state = CaseState::Mix; + } else if (!non_ascii_alphabetic) { + switch (case_state) { + default: break; + case CaseState::UpperLower: + score += NON_LATIN_CAPITALIZATION_BONUS; + 
break; + case CaseState::AllCaps: + // pass + break; + case CaseState::Mix: + score += NON_LATIN_MIXED_CASE_PENALTY * current_word_len; + break; + } + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Lower; + break; + case CaseState::Upper: + case_state = CaseState::UpperLower; + break; + case CaseState::AllCaps: + case_state = CaseState::Mix; + break; + } + } else { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case_state = CaseState::AllCaps; + break; + case CaseState::Lower: + case CaseState::UpperLower: + case_state = CaseState::Mix; + break; + } + } + + if (non_ascii_alphabetic) { + current_word_len += 1; + } else { + if (current_word_len > longest_word) { + longest_word = current_word_len; + } + current_word_len = 0; + } + + const bool is_a0 = b == 0xA0; + + if (!ascii_pair) { + // 0xA0 is no-break space in many other encodings, so avoid + // assigning score to IBM866 when 0xA0 occurs next to itself + // or a space-like byte. 
+ if (!(ibm866 && ((is_a0 && (prev_was_a0 || prev == 0)) || caseless_class == 0 && prev_was_a0))) { + score += score_data.score(caseless_class, prev); + } + + if (prev == LATIN_LETTER && + non_ascii_alphabetic) { + score += LATIN_ADJACENCY_PENALTY; + } else if (caseless_class == LATIN_LETTER && score_data.is_non_latin_alphabetic(prev)) { + score += LATIN_ADJACENCY_PENALTY; + } + } + + prev_ascii = ascii; + prev = caseless_class; + prev_was_a0 = is_a0; + } + return score; +} + +std::optional<int64_t> LatinCanidate::read(const std::span<const cbyte>& buffer) { + static constexpr int64_t IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY = -180; + static constexpr int64_t ORDINAL_BONUS = 300; + static constexpr int64_t COPYRIGHT_BONUS = 222; + static constexpr int64_t IMPLAUSIBILITY_PENALTY = -220; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_non_ascii == 0 && ascii; + + int16_t non_ascii_penalty = -200; + switch (prev_non_ascii) { + case 0: + case 1: + case 2: + non_ascii_penalty = 0; + break; + case 3: + non_ascii_penalty = -5; + break; + case 4: + non_ascii_penalty = 20; + break; + } + score += non_ascii_penalty; + + if (!score_data.is_latin_alphabetic(caseless_class)) { + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + if (case_state == CaseState::AllCaps && !ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + case_state = CaseState::Lower; + } else { + switch (case_state) { + case CaseState::Lower: + if (!ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + [[fallthrough]]; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case CaseState::AllCaps: + case_state = CaseState::AllCaps; + break; + } + } + + bool ascii_ish_pair = 
ascii_pair || (ascii && prev == 0) || (caseless_class == 0 && prev_non_ascii == 0); + + if (!ascii_ish_pair) { + score += score_data.score(caseless_class, prev); + } + + if (windows1252) { + switch (ordinal_state) { + case OrdinalState::Other: + if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } + break; + case OrdinalState::Space: + if (caseless_class == 0) { + // pass + } else if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (b == 'M' || b == 'D' || b == 'S') { + ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; + } else if (b == 'N') { + // numero or Nuestra + ordinal_state = OrdinalState::UpperN; + } else if (b == 'n') { + // numero + ordinal_state = OrdinalState::LowerN; + } else if (caseless_class == ASCII_DIGIT) { + ordinal_state = OrdinalState::Digit; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24) + /* X */ + { + ordinal_state = OrdinalState::Roman; + } else if (b == 0xA9) { + ordinal_state = OrdinalState::Copyright; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpace: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + case OrdinalState::OrdinalExpectingSpaceUndoImplausibility: + if (caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigit: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily: + if 
(caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::UpperN: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::LowerN: + if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::FeminineAbbreviationStartLetter: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Digit: + if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Roman: + if (b == 0xAA || b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ 
|| caseless_class == 24) + /* X */ + { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::PeriodAfterN: + if (b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Copyright: + if (caseless_class == 0) { + score += COPYRIGHT_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + } + } + + if (ascii) { + prev_non_ascii = 0; + } else { + prev_non_ascii += 1; + } + prev = caseless_class; + } + return score; +} + +template struct ovdl::encoding_detect::DetectUtf8<true>; +template struct ovdl::encoding_detect::DetectUtf8<false>; diff --git a/src/openvic-dataloader/detail/Detect.hpp b/src/openvic-dataloader/detail/Detect.hpp new file mode 100644 index 0000000..ad36d04 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.hpp @@ -0,0 +1,627 @@ +/// Based heavily on https://github.com/hsivonen/chardetng/tree/143dadde20e283a46ef33ba960b517a3283a3d22 + +#pragma once + +#include <array> +#include <cassert> +#include <cstdint> +#include <optional> +#include <span> +#include <type_traits> +#include <variant> +#include <vector> + +#include <openvic-dataloader/detail/Encoding.hpp> + +#include <lexy/action/match.hpp> +#include <lexy/callback/constant.hpp> +#include <lexy/dsl.hpp> +#include <lexy/dsl/ascii.hpp> +#include <lexy/dsl/newline.hpp> +#include <lexy/encoding.hpp> +#include <lexy/input/buffer.hpp> + +#include "detail/dsl.hpp" + +namespace ovdl::encoding_detect { + using cbyte = char; + using ubyte = unsigned char; + + using Encoding = detail::Encoding; + + struct DetectAscii { + // & 0b10000000 == 0b00000000 + static constexpr auto rule = lexy::dsl::while_(lexy::dsl::ascii::character) + lexy::dsl::eol; + static constexpr auto value = lexy::constant(true); + }; + + template<bool IncludeAscii> + 
struct DetectUtf8 { + struct not_utf8 { + static constexpr auto name = "not utf8"; + }; + + static constexpr auto rule = [] { + constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>; + + // & 0b10000000 == 0b00000000 + constexpr auto ascii_values = lexy::dsl::ascii::character; + // & 0b11100000 == 0b11000000 + constexpr auto two_byte = dsl::lit_b_range<0b11000000, 0b11011111>; + // & 0b11110000 == 0b11100000 + constexpr auto three_byte = dsl::lit_b_range<0b11100000, 0b11101111>; + // & 0b11111000 == 0b11110000 + constexpr auto four_byte = dsl::lit_b_range<0b11110000, 0b11110111>; + // & 0b11000000 == 0b10000000 + constexpr auto check_bytes = dsl::lit_b_range<0b10000000, 0b10111111>; + + constexpr auto utf8_check = + ((four_byte >> lexy::dsl::times<3>(check_bytes)) | + (three_byte >> lexy::dsl::times<2>(check_bytes)) | + (two_byte >> lexy::dsl::times<1>(check_bytes))) >> + is_not_ascii_flag.set(); + + return is_not_ascii_flag.template create<IncludeAscii>() + + lexy::dsl::while_(utf8_check | ascii_values) + + lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8> + lexy::dsl::eof; + }(); + + static constexpr auto value = lexy::constant(true); + }; + + extern template struct DetectUtf8<true>; + extern template struct DetectUtf8<false>; + + template<typename Input> + constexpr bool is_ascii(const Input& input) { + return lexy::match<DetectAscii>(input); + } + + template<typename Input> + constexpr bool is_utf8_no_ascii(const Input& input) { + return lexy::match<DetectUtf8<false>>(input); + } + + template<typename Input> + constexpr bool is_utf8(const Input& input) { + return lexy::match<DetectUtf8<true>>(input); + } + + struct DetectorData { + static constexpr std::array latin_ascii = std::to_array<ubyte>({ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, // + 
0, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 0, 0, 0, 0, 0, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, // + }); + + static constexpr std::array non_latin_ascii = std::to_array<ubyte>({ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, // + 0, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, // + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 0, 0, 0, 0, 0, // + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // + }); + + static constexpr std::array windows_1251 = std::to_array<ubyte>({ + 131, 130, 0, 2, 0, 0, 0, 0, 0, 0, 132, 0, 133, 130, 134, 135, // + 3, 0, 0, 0, 0, 0, 0, 0, 255, 0, 4, 0, 5, 2, 6, 7, // + 0, 136, 8, 140, 47, 130, 46, 47, 138, 49, 139, 49, 50, 46, 48, 141, // + 49, 50, 137, 9, 2, 49, 48, 46, 10, 47, 11, 48, 12, 130, 2, 13, // + 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, // + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, // + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, // + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, // + }); + + static constexpr std::array windows_1252 = std::to_array<ubyte>({ + 0, 255, 0, 60, 0, 0, 0, 0, 0, 0, 156, 0, 157, 255, 185, 255, // + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 29, 255, 57, 186, // + 0, 62, 60, 60, 60, 60, 59, 60, 60, 62, 60, 59, 63, 59, 61, 60, // + 62, 63, 61, 61, 60, 62, 61, 59, 60, 61, 60, 59, 62, 62, 62, 62, // + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, // + 188, 174, 175, 176, 177, 178, 179, 63, 180, 181, 182, 183, 184, 188, 
188, 27, // + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, // + 60, 46, 47, 48, 49, 50, 51, 63, 52, 53, 54, 55, 56, 60, 60, 58, // + }); + + // clang-format off + static constexpr std::array cyrillic = std::to_array<ubyte>({ + 0, 0, 0, 0, 1, 0, 16, 38, 0, 2, 5, 10,121, 4, 20, 25, 26, 53, 9, 5, 61, 23, 20, 26, 15, 95, 60, 2, 26, 15, 25, 29, 0, 14, 6, 6, 25, 1, 0, 27, 25, 8, 5, 39, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, // ѓ, + 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ђ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 4, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // љ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255, 5, 0, 0, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // њ, + 0, 0,255, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ћ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // џ, + 7, 0, 0,255,255,255,255,255, 0, 1, 0,255,255,255, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 1, 0, 0, 0, 1, // ў, + 12, 0, 0,255,255, 0,255,255, 0, 2, 0, 0, 0, 0, 2, 3, 15, 5, 5, 0, 0, 4, 0, 0, 21, 15, 10, 17, 0, 6, 14, 4, 6, 0, 3, 1, 8, 1, 0, 0, 0, 2, 0, 0, 0, 0, // і, + 0, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ё, + 6, 0, 0,255,255,255,255,255, 0, 
0,255, 5,255, 0, 1, 7, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // є, + 12, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 1, 0, 0, 0, 2, 0, 0, 20,255, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, // ј, + 9, 0, 0,255,255,255,255,255,255, 5,255, 0, 0, 13, 3, 3, 0, 4, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 1, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ї, + 32, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 28, 0, 23, 22, 26, 22, 19, 0, 3, 12, 5, 0, 44, 38, 18, 58, 1, 21, 44, 17, 54, 1, 2, 28, 5, 8, 3, 1, 9, 0, 12, 0, 0, 0, // а, + 40, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7, 0, 0, 0, 1, 7, 0, 1, 1, 0, 0, 7, 4, 1, 9, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // б, + 31, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 3, 0, 0, 19, 0, 0, 1, 1, 6, 0, 2, 6, 0, 1, 0, 1, 0, 32, 0, 2, 2, 23, 9, 0, 0, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, // в, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 1, 20, 0, 0, 1, 0, 9, 0, 0, 9, 7, 0, 5, 2, 18, 11, 0, 8, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 13, 0, 3, // г, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 2, 0, 2, 19, 0, 1, 5, 0, 13, 2, 2, 3, 2, 0, 6, 1, 12, 30, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, // д, + 12, 0, 0, 1, 4, 5, 0, 0, 0, 0, 0, 0, 24, 1, 5, 7, 11, 3, 12, 1, 6, 6, 11, 0, 3, 15, 14, 14, 4, 8, 25, 14, 29, 0, 1, 1, 4, 8, 8, 2, 0, 3, 1, 0, 0, 0, // е, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 3, 2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ж, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 6, 0, 0, 0, 11, 8, 0, 0, 8, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, // з, + 24, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 1, 0, 1, 10, 16, 21, 22, 0, 6, 5, 6, 1, 15, 15, 8, 38, 2, 4, 27, 9, 15, 0, 3, 8, 12, 7, 6, 1, 0, 0, 0, 0, 0, 0, // и, + 6, 0, 0, 0,255,255,255,255, 0, 7, 0, 0,255, 4, 21, 0, 0, 0, 0, 5, 0, 0, 39, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0, // й, + 54, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 
0, 10, 0, 1, 0, 1, 11, 0, 0, 12, 0, 1, 2, 0, 4, 8, 0, 2, 23, 2, 4, 0, 2, 3, 3, 8, 0, 0, 3, 16, 1, 4, 3, // к, + 12, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 4, 29, 12, 4, 5, 2, 18, 0, 0, 17, 4, 5, 11, 0, 0, 21, 2, 3, 4, 1, 15, 1, 0, 0, 0, 0, 0, 4, 3, 2, 12, 0, 2, // л, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 17, 1, 0, 0, 0, 7, 0, 1, 13, 2, 0, 0, 0, 0, 13, 0, 2, 4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 4, 2, 4, 1, 1, // м, + 42, 0, 0, 0, 0, 0, 0, 0, 4, 12, 6, 7, 1, 7, 76, 0, 22, 1, 4, 27, 1, 3, 34, 30, 0, 7, 1, 13, 24, 1, 3, 5, 3, 4, 0, 1, 0, 4, 1, 0, 2, 18, 7, 16, 0, 4, // н, + 37, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 10, 27, 22, 15, 1, 2, 3, 7, 5, 32, 11, 7, 38, 8, 21, 24, 11, 23, 0, 2, 10, 2, 2, 3, 2, 0, 0, 1, 0, 0, 0, // о, + 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 2, 0, 1, 2, 4, 0, 0, 2, 0, 6, 0, 0, 5, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // п, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 8, 0, 5, 47, 4, 6, 6, 5, 23, 0, 0, 5, 2, 6, 0, 0, 0, 23, 22, 0, 1, 14, 9, 1, 0, 1, 0, 0, 0, 7, 2, 8, 16, 0, 3, // р, + 53, 0, 0, 0, 0, 0, 0, 0, 4, 9, 2, 0, 1, 2, 21, 1, 4, 1, 2, 11, 0, 0, 12, 2, 4, 7, 1, 13, 15, 1, 4, 6, 3, 6, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 0, 1, // с, + 28, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 1, 0, 1, 32, 0, 1, 3, 0, 12, 0, 1, 22, 1, 4, 7, 1, 6, 23, 0, 14, 41, 14, 3, 0, 1, 1, 1, 21, 0, 2, 2, 6, 2, 1, 4, // т, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 2, 4, 2, 4, 6, 3, 0, 2, 0, 0, 6, 5, 6, 3, 0, 3, 7, 4, 7, 18, 1, 6, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, // у, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ф, + 41, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 30, 0, 2, 0, 0, 11, 0, 0, 5, 1, 14, 3, 0, 3, 6, 0, 7, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 4, 3, 5, 0, 0, // х, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 7, 1, 0, 1, 0, 2, 1, 0, 0, 9, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, // ц, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 1, 5, 0, 2, 0, 0, 6, 
0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 1, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, // ч, + 12, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 17, 0, 0, 1, 0, 2, 0, 0, 26, 0, 0, 0, 0, 0, 22, 2, 6, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, // ш, + 2, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, // щ, + 0, 0,255,255,255,255, 0,255, 0, 0, 0,255,255,255, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 11, 0, 1, 0, 0, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ъ, + 1, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 3, 11, 0, 4, 0, 2, 1, 0, 0, 0, 3, 1, 16, 0, 0, 22, 2, 10, 0, 0, 0, 8, 6, 3, 0, 0, 0, 0, 0, 0, 0, // ы, + 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 7, 3, 0, 1, 13, 7, 7, 0, 35, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, // ь, + 10, 0, 0,255,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 1, 1, 10, 11, 0, 2, 2, 0, 0, 0, 9, 3, 9, 0, 0, 7, 6, 9, 0, 0, 8, 3, 2, 1, 0, 0, 0, 0, 17, 0, 0, // э, + 14, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ю, + 5, 0, 0,255,255,255,255,255, 0, 9, 0, 0,255, 0, 11, 0, 3, 0, 0, 0, 0, 2, 24, 0, 0, 5, 2, 14, 1, 0, 2, 3, 1, 0, 0, 1, 3, 0, 0, 0, 0, 16, 1, 0, 0, 0, // я, + // , a, ѓ, ђ, љ, њ, ћ, џ, ў, і, ё, є, ј, ї, а, б, в, г, д, е, ж, з, и, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ъ, ы, ь, э, ю, я, + }); + // clang-format on + + // clang-format off + static constexpr std::array western = std::to_array<ubyte>({ + 18, 3, 0,254, 74, 0, 5,254,254, 2, 25,254,149, 4,254, 66,148,254, 0,254,122,238, 8, 1, 20, 13,254, 35, 20, 3, 1, 0, // , + 0, 3, 0, 0, 0, 0, 0, 5, 2, 0, 86, 9, 76, 0, 0, 0,241, 0, 0, 49, 0, 0, 0, 0, 11, 2, 0, 34, 0, 1, 2, 0, // a, + 19, 0, 0, 5, 5, 0, 0, 8, 13, 5, 0, 34, 22, 0, 0, 0, 4, 0, 0, 0, 6, 1, 3, 3, 42, 37, 8, 8, 0, 67, 0, 0, // b, + 0, 0, 0, 9, 6, 1, 0, 22, 10, 1, 0, 19, 54, 1, 0, 1, 18, 3, 1, 2, 40, 7, 0, 0, 6, 0, 3, 5, 1, 
34, 0, 0, // c, + 0, 0, 0, 5, 5, 0, 0, 12, 45, 16, 1, 6, 42, 0, 13, 3, 10, 0, 2, 0, 66, 11, 5, 8, 33,104, 3, 4, 0, 19, 0, 0, // d, + 63, 5, 0, 0, 0, 0, 2, 33, 15, 1, 3, 0, 87, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 49, 1, 11, 0, 3, 0, 9, 1, 0, // e, + 0, 0, 0, 8, 8, 0, 0, 10, 2, 7, 0,162, 23, 0, 13, 0, 4, 0, 0, 0, 1, 3, 0, 0, 15, 4, 0, 0, 0, 4, 0, 0, // f, + 1, 0, 0, 14, 16, 24, 0, 29, 11, 41, 0, 13, 86, 0, 14, 9, 3, 0, 0, 0, 20, 8, 7, 7, 13, 37, 14, 0, 0, 12, 0, 0, // g, + 1, 0, 0, 0, 0, 0, 0, 47, 2, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 29, 20, 0, 0, 0, 0, 45, 0, 0, // h, + 5, 4, 0,166,120, 0, 0,144, 0, 2, 3, 88,254, 0, 0, 0, 0, 0, 0, 3, 28,107, 0,112, 8, 2, 44, 32, 0, 3, 3, 0, // i, + 0, 0, 0, 0, 0, 0, 0, 39, 9, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 16, 18, 44, 0, 0, 0, 0, 0,255, // j, + 0, 2, 0, 0, 1, 0, 0, 48, 31, 32, 1, 60, 1, 0, 4, 0, 1, 0, 0, 0, 1, 3, 0, 2, 20, 47, 0, 0, 0, 20, 0, 0, // k, + 4, 0, 0, 12, 16, 0, 0, 54, 40, 48, 0, 64, 36, 0, 39, 6, 12, 3, 0, 0, 27, 9, 3, 24, 42, 33, 2, 9, 7, 77, 0, 0, // l, + 0, 0, 0, 14, 5, 4, 0, 60, 11, 4, 3, 48, 30, 7, 28, 1, 10, 1, 0, 0, 24, 41, 3, 3, 19, 24, 1, 8, 2, 36, 0, 0, // m, + 1, 1, 0, 24, 91, 16, 0,132, 62, 73, 1, 56, 71, 33, 78, 7, 35, 2, 3, 0, 94,254, 10, 21, 33, 38, 24, 21, 1, 61, 0, 0, // n, + 0, 1, 0, 0, 0, 0,254, 6, 0, 1, 27, 0, 13, 0, 0, 84,127, 0, 0, 62, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, // o, + 0, 0, 0, 5, 2, 0, 0, 9, 15, 0, 0, 4, 34, 0, 6, 0, 6, 0, 0, 0, 20, 12, 9, 28, 10, 22, 0, 3, 0, 7, 0, 0, // p, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 33, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // q, + 0, 0, 0, 83, 62, 1, 0,198,139,125, 0,229, 94, 54,190, 38, 18, 1, 0, 0,176, 24, 16, 29,193,181, 13, 13, 2,131, 0, 0, // r, + 1, 0, 0, 41, 34, 0, 0, 41, 24, 42, 0, 68,113, 15,159, 6, 43, 19, 4, 58, 14, 18, 1, 4, 48, 42, 4, 12, 9, 20, 0, 0, // s, + 7, 1, 0, 14, 20, 8, 0, 56, 37, 31, 0,104, 67, 14,113, 3, 50, 9, 5, 0, 89, 7, 19, 22, 13, 14, 40, 12, 15, 18, 0, 0, // t, + 0, 1, 5, 1, 2, 0, 0, 30, 0, 0, 
1, 15, 2, 0, 1, 0, 1, 0, 0, 2, 4, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, // u, + 0, 2, 0, 1, 6, 0, 0, 29, 33, 13, 0, 19, 46, 0, 15, 0, 7, 0, 1, 31, 2, 2, 3, 1, 32, 27, 0, 0, 1, 1, 0, 0, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,255, // w, + 0, 0, 0, 1, 16, 0, 0, 23, 0, 0, 0, 3, 14, 0, 0, 0, 2, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, // x, + 0, 0, 0, 0, 0, 0, 0, 58, 8, 0, 0, 1, 1, 62, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 82, 0, 0, 0, 0, 0,255, // y, + 0, 0, 0, 0, 2, 0, 0, 0, 14, 0, 0, 7, 3, 0, 6, 0, 3, 5, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, // z, + 0, 29, 0, 0, 0, 15, 0, 0, 0, 11, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 4, 0, 0,255,255, 0,255, 0,255, 0, 0,255,255,255, 0, 0, 0, 8, 0,255, 0, 0, 2, 0, 0, // ß, + 6, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 1, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // š, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255, // œ, + 107, 0, 22, 16, 18, 14, 6, 24, 46, 15, 2, 0, 42, 18, 17, 0, 36, 0, 34, 4,254, 1, 2, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, // à, + 41, 0, 10, 8, 21, 34, 5, 5, 60, 18, 5, 1, 29, 42, 26, 2, 16, 0, 27, 9, 43, 28, 7, 0, 0, 1, 4, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0,255, // á, + 24, 0, 1, 2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 5, 0, 1, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 2, 0,255, 0,255, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255, 0,255, // â, + 0, 0, 0, 1, 2, 3, 0, 1, 2, 12, 0, 0, 1, 7, 29, 4, 1,255, 11, 66, 11, 0, 1, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255,127,255,255,255,255,255, 0, 0,255, 0, 
0,255,255, 0,255,255,255,255,255,255,255,255, // ã, + 134, 1, 11, 0, 25, 6, 15, 11, 61, 24,123, 95,114, 68, 53, 1, 49, 0, 60, 98,198, 0, 88, 29, 0, 6, 12, 0, 0,255, 0,255, 0, 0,118, 0,255, 0,255, 0,255, 0,255, 0,255,255, 0,255,255, 0,255, 2,255,255,255, 0, 0, 0,255, // ä, + 156, 0, 12, 14, 19, 3, 12, 47, 17, 3, 12, 5, 30, 47, 22, 0,205, 0,184, 70, 19, 0, 22, 8, 0, 6, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // å, + 26, 0, 7, 0, 4, 0, 23, 8, 15, 0, 18, 19, 56, 23, 24, 0, 9, 0, 82, 37, 24, 0, 71, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255, 0,255, // æ, + 17,112, 0, 2, 0, 15, 0, 0, 0, 35, 0, 0, 2, 0, 59, 9, 1, 0, 36, 0, 0, 8, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // ç, + 254, 0, 9, 14, 20, 0, 15, 6, 70,144, 14, 45, 47, 92, 16, 3,123, 0, 38, 23,115, 52, 22, 42, 2, 80, 19,255, 0,255, 0, 0,255,255, 0,255,255, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 1,255,255, // è, + 152, 2, 19, 24, 85, 0, 29, 23, 26, 25, 2, 9, 43, 60, 62, 1, 32, 0,122, 45,169, 15, 13, 30, 7, 4, 8, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, // é, + 5, 0, 0, 3, 7, 0, 0, 10, 2, 3, 0, 26, 6, 6, 20, 1, 2, 0, 20, 1, 11, 5, 5, 2, 0, 0, 1,255, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0,255, // ê, + 36, 2, 23, 15, 36,143, 5, 23, 52, 52, 66, 48, 92, 57,216, 10,125, 35, 89, 58,254, 9, 24, 14, 0, 0, 8,255, 0,255, 0,255,255,255, 0, 0,255, 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0, 0,255, // ë, + 12, 0, 1, 4, 6, 0, 3, 21, 10, 0, 0, 0, 18, 8, 4, 0, 1, 0, 65, 35, 8, 3, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0,255, // ì, + 40, 72, 7, 10, 16, 2, 23, 10, 34, 
0, 0, 1, 34, 15, 21, 1, 3, 0,203, 28, 58, 23, 11, 0, 10, 0, 2, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0,255,255, 0,255, 2, 0,255, // í, + 6, 5, 1, 9, 5, 0, 0, 0, 22, 0, 9, 8, 8, 6, 9, 1, 10, 0, 20, 6,182, 0, 13, 0, 0, 24, 1,255, 0,255,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255,255,255, // î, + 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, // ï, + 0,254, 0, 0, 0, 26, 0, 0, 0, 61, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ñ, + 20, 0, 56, 43, 8,162, 14, 3, 23, 19, 2,118, 31, 26, 46, 0, 20, 0, 23, 6, 24, 19, 6, 21, 5, 27, 63,255, 0,255, 0, 0,255,255,255,255,255, 3, 0,255,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, // ò, + 67, 0, 12, 15, 9, 7, 8, 66, 13,254, 3, 23, 14, 16, 16, 0, 8, 0, 29, 11, 26, 0, 5, 5, 1, 10, 13,255, 0,255,255, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0, 0,255, 0, 1, 0, 0, 0, 0,255,255,255, 0,255,255, 0,255, // ó, + 18, 3, 3, 12, 1, 0, 2, 0, 7, 0, 1, 0, 2, 2, 8, 0, 6, 0, 6, 7, 4, 0, 2, 0, 0, 0, 1,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255,255,255, // ô, + 29, 2, 0, 0, 0, 0, 0, 0, 5, 2, 22, 30, 25, 38, 19, 0, 33,255, 4, 39, 24, 0, 88, 0, 0, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 36,255,255,255,255,255, 0,255,255, 0,255, 0, 0, 6, 0,255,255,255, 0, 0, 0,255, // õ, + 44, 0, 33, 0, 25, 0,142, 5, 46, 10, 25, 32, 26, 13, 6, 0, 3, 0, 30, 8, 35, 0, 25, 5, 0, 44, 7, 0, 0,255,255, 0,255,255, 73, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, 39, 0,255,255,255, 0, 0, 0, // ö, + 52, 0, 21, 0, 57, 0,119, 12, 47, 3, 59, 33, 45, 15, 12, 0, 3, 0, 52, 82, 49, 1, 11, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 
0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ø, + 25, 0, 4, 3, 53, 0, 0, 2, 12, 72, 0, 0, 30, 0, 0,254, 0, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, // ù, + 19, 2, 1, 7, 9, 1, 12, 5, 9, 41, 1, 0, 10, 7, 9, 0, 8, 0, 12, 28, 8, 0, 0, 0, 0, 1, 0,255, 0,255,255, 0,255,255,255,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255, // ú, + 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 45, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0,255, 0, 0,255, 0, // û, + 95, 2, 19, 0, 6, 2,121, 9, 15, 1, 5, 44, 18, 26, 7, 0, 11, 2, 68, 49, 20, 0, 2, 17, 0, 0, 6, 0, 0,255, 0,255,255,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255, 0, 0, 0, 31, 0, 0, // ü, + 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // ž, + 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, 0,255, 0,255,255,255, 0, 0, 0, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, œ, à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ž, ÿ, + + }); + // clang-format on + }; + + namespace class_size { + constexpr std::size_t cyrillic_ascii = 2; + constexpr std::size_t cyrillic_non_ascii = 44; + constexpr std::size_t western_ascii = 27; + constexpr std::size_t western_non_ascii = 32; + } + + constexpr std::size_t ASCII_DIGIT = 100; + + struct ByteScore { + const Encoding encoding; + const std::array<ubyte, 128>& lower; + const std::array<ubyte, 128>& upper; + const std::span<const ubyte> probabilities; + const std::size_t 
ascii; + const std::size_t non_ascii; + + static inline constexpr std::optional<std::size_t> compute_index(std::size_t x, std::size_t y, std::size_t ascii_classes, std::size_t non_ascii_classes) { + if (x == 0 && y == 0) { + return std::nullopt; + } + + if (x < ascii_classes && y < ascii_classes) { + return std::nullopt; + } + + if (y >= ascii_classes) { + return (ascii_classes * non_ascii_classes) + (ascii_classes + non_ascii_classes) * (y - ascii_classes) + x; + } + + return y * non_ascii_classes + x - ascii_classes; + } + + inline constexpr cbyte classify(cbyte byte) const { + cbyte high = byte >> 7; + cbyte low = byte & 0x7F; + if (high == 0) { + return lower[low]; + } + + return upper[low]; + } + + inline constexpr bool is_latin_alphabetic(cbyte caseless_class) const { + return caseless_class > 0 && caseless_class < (ascii + non_ascii); + } + + inline constexpr bool is_non_latin_alphabetic(cbyte caseless_class) const { + return caseless_class > 1 && caseless_class < (ascii + non_ascii); + } + + inline constexpr int64_t score(cbyte current_class, cbyte previous_class) const { + constexpr std::size_t IMPLAUSABILITY_PENALTY = -220; + + constexpr std::size_t PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 0; + constexpr std::size_t IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 1; + constexpr std::size_t IMPLAUSIBLE_BEFORE_ALPHABETIC = 2; + constexpr std::size_t IMPLAUSIBLE_AFTER_ALPHABETIC = 3; + constexpr std::size_t PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE = 4; + constexpr std::size_t PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE = 5; + + std::size_t stored_boundary = ascii + non_ascii; + if (current_class < stored_boundary) { + if (previous_class < stored_boundary) { + if (auto index = compute_index(previous_class, current_class, ascii, non_ascii); index) { + ubyte b = probabilities[index.value()]; + if (b == 255) { + return IMPLAUSABILITY_PENALTY; + } + return b; + } + return 0; + } + + if (current_class == 0 || current_class == ASCII_DIGIT) { 
+ return 0; + } + + std::size_t previous_unstored = previous_class - stored_boundary; + switch (previous_unstored) { + case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_AFTER_ALPHABETIC: + return 0; + case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_BEFORE_ALPHABETIC: + return IMPLAUSABILITY_PENALTY; + case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (current_class < ascii) { + return IMPLAUSABILITY_PENALTY; + } + return 0; + case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (current_class < ascii) { + return 0; + } + return IMPLAUSABILITY_PENALTY; + default: + assert(previous_class == ASCII_DIGIT); + return 0; + } + } + + if (previous_class < stored_boundary) { + if (previous_class == 0 || previous_class == ASCII_DIGIT) { + return 0; + } + + std::size_t current_unstored = current_class - stored_boundary; + switch (current_unstored) { + case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_BEFORE_ALPHABETIC: + return 0; + case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_AFTER_ALPHABETIC: + return IMPLAUSABILITY_PENALTY; + case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (previous_class < ascii) { + return IMPLAUSABILITY_PENALTY; + } + return 0; + case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (previous_class < ascii) { + return 0; + } + return IMPLAUSABILITY_PENALTY; + default: + assert(current_class == ASCII_DIGIT); + return 0; + } + } + + if (current_class == ASCII_DIGIT || previous_class == ASCII_DIGIT) { + return 0; + } + + return IMPLAUSABILITY_PENALTY; + } + }; + + enum class ScoreIndex { + Windows1251, + Windows1252 + }; + + static constexpr std::array byte_scores { + ByteScore { + .encoding = Encoding::Windows1251, + .lower = DetectorData::non_latin_ascii, + .upper = DetectorData::windows_1251, + .probabilities = DetectorData::cyrillic, + .ascii = class_size::cyrillic_ascii, + .non_ascii = class_size::cyrillic_non_ascii }, + 
ByteScore { + .encoding = Encoding::Windows1252, + .lower = DetectorData::latin_ascii, + .upper = DetectorData::windows_1252, + .probabilities = DetectorData::western, + .ascii = class_size::western_ascii, + .non_ascii = class_size::western_non_ascii } + }; + + constexpr const ByteScore& get_byte_score(ScoreIndex index) { + return byte_scores[static_cast<std::underlying_type_t<ScoreIndex>>(index)]; + } + + struct Utf8Canidate { + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + struct AsciiCanidate { + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + struct NonLatinCasedCanidate { + enum class CaseState { + Space, + Upper, + Lower, + UpperLower, + AllCaps, + Mix, + }; + + const ByteScore& score_data; + cbyte prev {}; + CaseState case_state = CaseState::Space; + bool prev_ascii = true; + uint64_t current_word_len {}; + uint64_t longest_word {}; + bool ibm866 = false; + bool prev_was_a0 = false; + + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + struct LatinCanidate { + enum class CaseState { + Space, + Upper, + Lower, + AllCaps, + }; + + enum class OrdinalState { + Other, + Space, + PeriodAfterN, + OrdinalExpectingSpace, + OrdinalExpectingSpaceUndoImplausibility, + OrdinalExpectingSpaceOrDigit, + OrdinalExpectingSpaceOrDigitUndoImplausibily, + UpperN, + LowerN, + FeminineAbbreviationStartLetter, + Digit, + Roman, + Copyright, + }; + + const ByteScore& score_data; + cbyte prev {}; + CaseState case_state = CaseState::Space; + uint32_t prev_non_ascii {}; + OrdinalState ordinal_state = OrdinalState::Space; // Used only when `windows1252 == true` + bool windows1252; + + constexpr LatinCanidate(const ByteScore& data) : score_data(data) { + windows1252 = data.encoding == Encoding::Windows1252; + } + + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + using InnerCanidate = std::variant<NonLatinCasedCanidate, LatinCanidate, Utf8Canidate, AsciiCanidate>; + + 
template<class... Ts> + struct overloaded : Ts... { + using Ts::operator()...; + }; + + template<class... Ts> + overloaded(Ts...) -> overloaded<Ts...>; + + struct Canidate { + InnerCanidate inner; + std::optional<int64_t> score_value; + + template<typename CanidateT> + static constexpr Canidate create_canidate() { + return { + .inner = CanidateT(), + .score_value = 0 + }; + } + + template<typename CanidateT> + static constexpr Canidate create_canidate(const ByteScore& score) { + return { + .inner = CanidateT { score }, + .score_value = 0 + }; + } + + static constexpr Canidate new_utf8() { + return create_canidate<Utf8Canidate>(); + } + + static constexpr Canidate new_ascii() { + return create_canidate<AsciiCanidate>(); + } + + static constexpr Canidate new_latin(ScoreIndex index) { + return create_canidate<LatinCanidate>(get_byte_score(index)); + } + + static constexpr Canidate new_non_latin_cased(ScoreIndex index) { + return create_canidate<NonLatinCasedCanidate>(get_byte_score(index)); + } + + constexpr std::optional<int64_t> score(const std::span<const cbyte>& buffer, std::size_t encoding, bool expectation_is_valid) { + if (auto old_score = score_value) { + auto new_score = std::visit([&](auto& inner) { + return inner.read(buffer); + }, + inner); + if (new_score) { + score_value = old_score.value() + new_score.value(); + } else { + score_value = std::nullopt; + } + } + + if (auto nlcc = std::get_if<NonLatinCasedCanidate>(&inner)) { + if (nlcc->longest_word < 2) { + return std::nullopt; + } + } + return score_value; + } + + constexpr Encoding encoding() const { + return std::visit( + overloaded { + [](const Utf8Canidate& canidate) { + return Encoding::Utf8; + }, + [](const AsciiCanidate& canidate) { + return Encoding::Ascii; + }, + [](const LatinCanidate& canidate) { + return canidate.score_data.encoding; + }, + [](const NonLatinCasedCanidate& canidate) { + return canidate.score_data.encoding; + } }, + inner); + } + }; + + struct Detector { + 
std::vector<Canidate> canidates { + Canidate::new_ascii(), + Canidate::new_utf8(), + Canidate::new_latin(ScoreIndex::Windows1252), + Canidate::new_non_latin_cased(ScoreIndex::Windows1251), + }; + + Encoding default_fallback = Encoding::Unknown; + + constexpr std::pair<Encoding, bool> detect_assess(std::span<const cbyte> buffer, bool allow_utf8 = true) { + int64_t max = 0; + Encoding encoding = default_fallback; // Presumes fallback, defaults to Unknown encoding if unknown (which skips conversion) + std::size_t i = 0; + for (Canidate& canidate : canidates) { + if (!allow_utf8 && canidate.encoding() == Encoding::Utf8) { + continue; + } + + if (auto score = canidate.score(buffer, i, false)) { + switch (canidate.encoding()) { + using enum Encoding; + case Ascii: + case Utf8: + return { canidate.encoding(), true }; + default: break; + } + + auto value = score.value(); + if (value > max) { + max = value; + encoding = canidate.encoding(); + } + } + i++; + } + return { encoding, max >= 0 }; + } + + constexpr Encoding detect(std::span<const cbyte> buffer, bool allow_utf8 = true) { + return detect_assess(buffer, allow_utf8).first; + } + + template<typename BufferEncoding> + std::pair<Encoding, bool> detect_assess(const lexy::buffer<BufferEncoding, void>& buffer, bool allow_utf8 = true) { + auto span = std::span<const cbyte>(buffer.data(), buffer.size()); + return detect_assess(span); + } + + template<typename BufferEncoding> + constexpr Encoding detect(const lexy::buffer<BufferEncoding, void>& buffer, bool allow_utf8 = true) { + return detect_assess(buffer, allow_utf8).first; + } + }; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp deleted file mode 100644 index e9d0350..0000000 --- a/src/openvic-dataloader/detail/DetectUtf8.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include <lexy/action/match.hpp> -#include <lexy/dsl.hpp> - -#include "detail/dsl.hpp" - -namespace ovdl::detail { - namespace detect_utf8 { - - template<bool INCLUDE_ASCII> - struct DetectUtf8 { - struct not_utf8 { - static constexpr auto name = "not utf8"; - }; - - static constexpr auto rule = [] { - constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>; - - // & 0b10000000 == 0b00000000 - constexpr auto ascii_values = dsl::make_range<0b00000000, 0b01111111>(); - // & 0b11100000 == 0b11000000 - constexpr auto two_byte = dsl::make_range<0b11000000, 0b11011111>(); - // & 0b11110000 == 0b11100000 - constexpr auto three_byte = dsl::make_range<0b11100000, 0b11101111>(); - // & 0b11111000 == 0b11110000 - constexpr auto four_byte = dsl::make_range<0b11110000, 0b11110111>(); - // & 0b11000000 == 0b10000000 - constexpr auto check_bytes = dsl::make_range<0b10000000, 0b10111111>(); - - constexpr auto utf8_check = - ((four_byte >> lexy::dsl::times<3>(check_bytes)) | - (three_byte >> lexy::dsl::times<2>(check_bytes)) | - (two_byte >> lexy::dsl::times<1>(check_bytes))) >> - is_not_ascii_flag.set(); - - return is_not_ascii_flag.template create<INCLUDE_ASCII>() + - lexy::dsl::while_(utf8_check | ascii_values) + - lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>; - }(); - }; - } - - template<typename Input> - constexpr bool is_utf8_no_ascii(const Input& input) { - return lexy::match<detect_utf8::DetectUtf8<false>>(input); - } - - template<typename Input> - constexpr bool is_utf8(const Input& input) { - return lexy::match<detect_utf8::DetectUtf8<true>>(input); - } -}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Errors.hpp b/src/openvic-dataloader/detail/Errors.hpp deleted file mode 100644 index fbebcc5..0000000 --- a/src/openvic-dataloader/detail/Errors.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include <string_view> - -#include <openvic-dataloader/ParseError.hpp> - -namespace ovdl::errors { - inline const ParseError make_no_file_error(std::string_view file_path) { - std::string message; - if (file_path.empty()) { - message = "File path not specified."; - } else { - message = "File '" + std::string(file_path) + "' was not found."; - } - - return ParseError { ParseError::Type::Fatal, message, 1 }; - } -} - -namespace ovdl::v2script::errors { - -} - -namespace ovdl::ovscript::errors { -}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/InternalConcepts.hpp b/src/openvic-dataloader/detail/InternalConcepts.hpp new file mode 100644 index 0000000..0c7913d --- /dev/null +++ b/src/openvic-dataloader/detail/InternalConcepts.hpp @@ -0,0 +1,127 @@ +#pragma once + +#include <concepts> +#include <utility> + +#include <openvic-dataloader/NodeLocation.hpp> +#include <openvic-dataloader/detail/Encoding.hpp> +#include <openvic-dataloader/detail/SymbolIntern.hpp> + +#include <lexy/encoding.hpp> +#include <lexy/input/buffer.hpp> + +#include <fmt/core.h> + +#include <lexy_ext/report_error.hpp> + +namespace ovdl::detail { + template<typename T> + concept IsFile = + requires(T t, const typename T::node_type* node, NodeLocation location) { + typename T::node_type; + { t.set_location(node, location) } -> std::same_as<void>; + { t.location_of(node) } -> std::same_as<NodeLocation>; + }; + + template<typename T> + concept IsAst = + requires( + T t, + const T ct, + const typename T::node_type* node, + NodeLocation loc // + ) { + requires IsFile<typename T::file_type>; + typename T::root_node_type; + typename T::node_type; + requires std::derived_from<typename T::root_node_type, typename T::node_type>; + { t.set_location(node, loc) } -> std::same_as<void>; + { t.location_of(node) } -> std::same_as<NodeLocation>; + { t.root() } -> std::same_as<typename T::root_node_type*>; + { ct.root() } -> std::same_as<const typename T::root_node_type*>; + { t.file() } -> std::same_as<typename T::file_type&>; + { ct.file() } -> std::same_as<const typename T::file_type&>; + }; + + template<typename T> + concept IsDiagnosticLogger = requires( + T t, + const T ct, + const char* str, + std::size_t length, + std::string_view sv, + lexy_ext::diagnostic_kind diag_kind // + ) { + typename T::error_range; + typename T::Writer; + { static_cast<bool>(ct) } -> std::same_as<bool>; + { ct.errored() } -> std::same_as<bool>; + { ct.warned() } -> std::same_as<bool>; + { ct.get_errors() 
} -> std::same_as<typename T::error_range>; + { t.intern(str, length) } -> std::same_as<ovdl::SymbolIntern::symbol_type>; + { t.intern(sv) } -> std::same_as<ovdl::SymbolIntern::symbol_type>; + { t.intern_cstr(str, length) } -> std::same_as<const char*>; + { t.intern_cstr(sv) } -> std::same_as<const char*>; + { t.symbol_interner() } -> std::same_as<SymbolIntern::symbol_interner_type&>; + { ct.symbol_interner() } -> std::same_as<const SymbolIntern::symbol_interner_type&>; + { t.error(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.warning(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.note(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.info(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.debug(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.fixit(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.help(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + { t.error(sv) } -> std::same_as<typename T::Writer>; + { t.warning(sv) } -> std::same_as<typename T::Writer>; + { t.note(sv) } -> std::same_as<typename T::Writer>; + { t.info(sv) } -> std::same_as<typename T::Writer>; + { t.debug(sv) } -> std::same_as<typename T::Writer>; + { t.fixit(sv) } -> std::same_as<typename T::Writer>; + { t.help(sv) } -> std::same_as<typename T::Writer>; + { std::move(t.error_callback().sink()).finish() } -> std::same_as<std::size_t>; + { t.log(diag_kind, std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>; + }; + + template<typename T> + concept IsParseState = requires( + T t, + const T ct, + typename T::ast_type::file_type&& file, + lexy::buffer<lexy::default_encoding>&& buffer, + ovdl::detail::Encoding encoding, + const char* path // 
+ ) { + requires IsAst<typename T::ast_type>; + requires IsDiagnosticLogger<typename T::diagnostic_logger_type>; + { T { std::move(file), encoding } } -> std::same_as<T>; + { T { std::move(buffer), encoding } } -> std::same_as<T>; + { T { path, std::move(buffer), encoding } } -> std::same_as<T>; + { t.ast() } -> std::same_as<typename T::ast_type&>; + { ct.ast() } -> std::same_as<const typename T::ast_type&>; + { t.logger() } -> std::same_as<typename T::diagnostic_logger_type&>; + { ct.logger() } -> std::same_as<const typename T::diagnostic_logger_type&>; + }; + + template<typename T> + concept IsFileParseState = requires( + T t, + const T ct, + typename T::file_type&& file, + lexy::buffer<lexy::default_encoding>&& buffer, + ovdl::detail::Encoding encoding, + const char* path // + ) { + requires IsFile<typename T::file_type>; + requires IsDiagnosticLogger<typename T::diagnostic_logger_type>; + { T { std::move(file), encoding } } -> std::same_as<T>; + { T { std::move(buffer), encoding } } -> std::same_as<T>; + { T { path, std::move(buffer), encoding } } -> std::same_as<T>; + { t.file() } -> std::same_as<typename T::file_type&>; + { ct.file() } -> std::same_as<const typename T::file_type&>; + { t.logger() } -> std::same_as<typename T::diagnostic_logger_type&>; + { ct.logger() } -> std::same_as<const typename T::diagnostic_logger_type&>; + }; + + template<typename T> + concept IsStateType = IsParseState<T> || IsFileParseState<T>; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/ParseHandler.cpp b/src/openvic-dataloader/detail/ParseHandler.cpp new file mode 100644 index 0000000..3818433 --- /dev/null +++ b/src/openvic-dataloader/detail/ParseHandler.cpp @@ -0,0 +1,347 @@ +#include "ParseHandler.hpp" + +#include <algorithm> +#include <cstddef> +#include <cstdlib> +#include <string_view> +#include <type_traits> + +#include <openvic-dataloader/detail/Encoding.hpp> + +using namespace ovdl::detail; + +#ifdef _WIN32 +#include <array> +#include <cstdint> +#include <utility> + +#define WIN32_LEAN_AND_MEAN +#include <Windows.h> +#undef WIN32_LEAN_AND_MEAN +#endif + +template<size_t N> +struct LangCodeLiteral { + char value[N]; + + constexpr LangCodeLiteral(const char (&str)[N]) { + std::copy_n(str, N, value); + } + + static constexpr std::integral_constant<std::size_t, N - 1> size = {}; + + constexpr const char& operator[](std::size_t index) const noexcept { + return value[index]; + } + + constexpr operator std::string_view() const noexcept { + return std::string_view(value, size()); + } + + constexpr bool operator==(const std::string_view view) const noexcept { + return view.size() >= size() + 1 && view.starts_with(*this) && view[size()] == '_'; + } +}; + +struct LangCodeView { + std::string_view view; + bool is_valid; + + constexpr LangCodeView() = default; + + template<std::size_t N> + constexpr LangCodeView(const char (&str)[N]) : view(str), is_valid(true) {} + + constexpr LangCodeView(char* str) : view(str) { + is_valid = view.find('_') != std::string_view::npos; + } + + constexpr std::size_t size() const noexcept { + return view.size(); + } + + constexpr const char& operator[](std::size_t index) const noexcept { + return view[index]; + } + + constexpr operator std::string_view() const noexcept { + return view; + } + + template<std::size_t N> + constexpr bool operator==(const LangCodeLiteral<N>& literal) { + return is_valid && size() >= LangCodeLiteral<N>::size() && 
view.starts_with(literal); + } +}; + +struct FallbackSetter { + std::optional<Encoding>& fallback; + + template<Encoding _Encoding, LangCodeLiteral LangCode> + constexpr bool encoded(auto&& view) const { + if (view == LangCode) { + fallback = _Encoding; + return true; + } + return false; + }; +}; + +void ParseHandler::_detect_system_fallback_encoding() { + _system_fallback_encoding = Encoding::Unknown; + LangCodeView lang_code; + +#ifdef _WIN32 + using namespace std::string_view_literals; + + // Every Windows language id mapped to a language code according to https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/63d3d639-7fd2-4afb-abbe-0d5b5551eef8 + constexpr std::array lang_id_to_lang_code = std::to_array<std::pair<std::uint8_t, LangCodeView>>({ + { 0x0001, "ar" }, + { 0x0002, "bg" }, + { 0x0003, "ca" }, + { 0x0004, "zh" }, + { 0x0005, "cs" }, + { 0x0006, "da" }, + { 0x0007, "de" }, + { 0x0008, "el" }, + { 0x0009, "en" }, + { 0x000A, "es" }, + { 0x000B, "fi" }, + { 0x000C, "fr" }, + { 0x000D, "he" }, + { 0x000E, "hu" }, + { 0x000F, "is" }, + { 0x0010, "it" }, + { 0x0011, "ja" }, + { 0x0012, "ko" }, + { 0x0013, "nl" }, + { 0x0014, "no" }, + { 0x0015, "pl" }, + { 0x0016, "pt" }, + { 0x0017, "rm" }, + { 0x0018, "ro" }, + { 0x0019, "ru" }, + { 0x001A, "hr" }, + { 0x001B, "sk" }, + { 0x001C, "sq" }, + { 0x001D, "sv" }, + { 0x001E, "th" }, + { 0x001F, "tr" }, + { 0x0020, "ur" }, + { 0x0021, "id" }, + { 0x0022, "uk" }, + { 0x0023, "be" }, + { 0x0024, "sl" }, + { 0x0025, "et" }, + { 0x0026, "lv" }, + { 0x0027, "lt" }, + { 0x0028, "tg" }, + { 0x0029, "fa" }, + { 0x002A, "vi" }, + { 0x002B, "hy" }, + { 0x002C, "az" }, + { 0x002D, "eu" }, + { 0x002E, "hsb" }, + { 0x002F, "mk" }, + { 0x0030, "st" }, + { 0x0031, "ts" }, + { 0x0032, "tn" }, + { 0x0033, "ve" }, + { 0x0034, "xh" }, + { 0x0035, "zu" }, + { 0x0036, "af" }, + { 0x0037, "ka" }, + { 0x0038, "fo" }, + { 0x0039, "hi" }, + { 0x003A, "mt" }, + { 0x003B, "se" }, + { 0x003C, "ga" }, + { 0x003D, "yi" }, + { 
0x003E, "ms" }, + { 0x003F, "kk" }, + { 0x0040, "ky" }, + { 0x0041, "sw" }, + { 0x0042, "tk" }, + { 0x0043, "uz" }, + { 0x0044, "tt" }, + { 0x0045, "bn" }, + { 0x0046, "pa" }, + { 0x0047, "gu" }, + { 0x0048, "or" }, + { 0x0049, "ta" }, + { 0x004A, "te" }, + { 0x004B, "kn" }, + { 0x004C, "ml" }, + { 0x004D, "as" }, + { 0x004E, "mr" }, + { 0x004F, "sa" }, + { 0x0050, "mn" }, + { 0x0051, "bo" }, + { 0x0052, "cy" }, + { 0x0053, "km" }, + { 0x0054, "lo" }, + { 0x0055, "my" }, + { 0x0056, "gl" }, + { 0x0057, "kok" }, + { 0x0058, "mni" }, + { 0x0059, "sd" }, + { 0x005A, "syr" }, + { 0x005B, "si" }, + { 0x005C, "chr" }, + { 0x005D, "iu" }, + { 0x005E, "am" }, + { 0x005F, "tzm" }, + { 0x0060, "ks" }, + { 0x0061, "ne" }, + { 0x0062, "fy" }, + { 0x0063, "ps" }, + { 0x0064, "fil" }, + { 0x0065, "dv" }, + { 0x0066, "bin" }, + { 0x0067, "ff" }, + { 0x0068, "ha" }, + { 0x0069, "ibb" }, + { 0x006A, "yo" }, + { 0x006B, "quz" }, + { 0x006C, "nso" }, + { 0x006D, "ba" }, + { 0x006E, "lb" }, + { 0x006F, "kl" }, + { 0x0070, "ig" }, + { 0x0071, "kr" }, + { 0x0072, "om" }, + { 0x0073, "ti" }, + { 0x0074, "gn" }, + { 0x0075, "haw" }, + { 0x0076, "la" }, + { 0x0077, "so" }, + { 0x0078, "ii" }, + { 0x0079, "pap" }, + { 0x007A, "arn" }, + { 0x007C, "moh" }, + { 0x007E, "br" }, + { 0x0080, "ug" }, + { 0x0081, "mi" }, + { 0x0082, "oc" }, + { 0x0083, "co" }, + { 0x0084, "gsw" }, + { 0x0085, "sah" }, + { 0x0086, "qut" }, + { 0x0087, "rw" }, + { 0x0088, "wo" }, + { 0x008C, "prs" }, + { 0x0091, "gd" }, + { 0x0092, "ku" }, + { 0x0093, "quc" } // + }); + +#pragma pack(push, 1) + struct LocaleStruct { + struct { + uint8_t language_id; + uint8_t country_id; + } language_country; + uint8_t sort_id : 4; + uint16_t reserved : 12; + }; +#pragma pack(pop) + + std::uint32_t locale_int = GetSystemDefaultLCID(); + LocaleStruct locale_id; + std::memcpy(&locale_id, &locale_int, sizeof(locale_id)); + // first 16 bytes are language-country id, next 4 are sort id, last 12 bytes are reserved + // first 8 are the 
language id, last 8 bytes are a country id + const std::uint8_t& lang_id = locale_id.language_country.language_id; + + for (const auto& map : lang_id_to_lang_code) { + if (map.first != lang_id) continue; + lang_code = map.second; + break; + } +#else + lang_code = std::getenv("LANG"); +#endif + + constexpr FallbackSetter setter { _system_fallback_encoding }; + + if (lang_code.size() < 2) { + _system_fallback_encoding = Encoding::Unknown; + return; + } + +#define WIN1251(LANG_CODE) \ + if (setter.encoded<Encoding::Windows1251, #LANG_CODE>(lang_code)) return; + +#define WIN1252(LANG_CODE) \ + if (setter.encoded<Encoding::Windows1252, #LANG_CODE>(lang_code)) return; + + // More common, prefer + WIN1252(en); + WIN1252(es); + WIN1252(fr); + WIN1252(de); + + WIN1251(ru); + + WIN1252(af); + WIN1252(sq); + WIN1252(eu); + WIN1252(br); + WIN1252(co); + WIN1252(fo); + WIN1252(gl); + WIN1252(is); + WIN1252(io); + WIN1252(ga); + WIN1252(id); + WIN1252(in); + WIN1252(it); + WIN1252(lb); + WIN1252(ms); + WIN1252(gv); + WIN1252(no); + WIN1252(oc); + WIN1252(pt); + WIN1252(gd); + WIN1252(sw); + WIN1252(fi); + WIN1252(da); + WIN1252(et); + WIN1252(tn); + WIN1252(ca); + WIN1252(rm); + WIN1252(nl); + WIN1252(sl); + WIN1252(cy); + WIN1252(hu); + + WIN1251(be); + WIN1251(uk); + WIN1251(bg); + WIN1251(kk); + WIN1251(tg); + WIN1251(sr); + WIN1251(ky); + WIN1251(mn); + WIN1251(mk); + WIN1251(mo); + + if (lang_code.size() < 3) { + return; + } + + WIN1251(mol); + + WIN1252(ast); + WIN1252(jbo); + WIN1252(gla); + WIN1252(sco); + WIN1252(sma); + WIN1252(roo); + WIN1252(swa); + WIN1252(tsn); + WIN1252(tok); + +#undef WIN1251 +#undef WIN1252 +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/ParseHandler.hpp b/src/openvic-dataloader/detail/ParseHandler.hpp index fbec0d7..9666a5b 100644 --- a/src/openvic-dataloader/detail/ParseHandler.hpp +++ b/src/openvic-dataloader/detail/ParseHandler.hpp @@ -1,20 +1,26 @@ #pragma once +#include <cstddef> +#include <optional> +#include <string> #include <utility> -#include <openvic-dataloader/ParseState.hpp> -#include <openvic-dataloader/detail/utility/Concepts.hpp> +#include <openvic-dataloader/detail/Concepts.hpp> #include <lexy/encoding.hpp> #include <lexy/input/buffer.hpp> #include <lexy/input/file.hpp> +#include "openvic-dataloader/detail/Encoding.hpp" +#include "openvic-dataloader/detail/Utility.hpp" + #include "detail/BufferError.hpp" +#include "detail/Detect.hpp" +#include "detail/InternalConcepts.hpp" namespace ovdl::detail { - template<typename Derived> struct ParseHandler { - std::string make_error_from(buffer_error error) { + std::string make_error_from(buffer_error error) const { switch (error) { using enum ovdl::detail::buffer_error; case buffer_is_null: @@ -30,116 +36,179 @@ namespace ovdl::detail { } } - template<typename... Args> - constexpr void _run_load_func(detail::LoadCallback<Derived, Args...> auto func, Args... 
args); - }; - - template<IsFileParseState ParseState, typename MemoryResource = void> - struct BasicFileParseHandler : ParseHandler<BasicFileParseHandler<ParseState, MemoryResource>> { - using parse_state_type = ParseState; - using encoding_type = typename parse_state_type::file_type::encoding_type; - constexpr bool is_valid() const { - if (!_parse_state) return false; - return buffer().data() != nullptr; + return is_valid_impl(); } - constexpr buffer_error load_buffer_size(const char* data, std::size_t size) { - lexy::buffer<encoding_type, MemoryResource> buffer(data, size); + buffer_error load_buffer_size(const char* data, std::size_t size, std::optional<Encoding> fallback) { + lexy::buffer<lexy::default_encoding> buffer(data, size); if (buffer.data() == nullptr) return buffer_error::buffer_is_null; - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + return load_buffer_impl(std::move(buffer), "", fallback); } - constexpr buffer_error load_buffer(const char* start, const char* end) { - lexy::buffer<encoding_type, MemoryResource> buffer(start, end); + buffer_error load_buffer(const char* start, const char* end, std::optional<Encoding> fallback) { + lexy::buffer<lexy::default_encoding> buffer(start, end); if (buffer.data() == nullptr) return buffer_error::buffer_is_null; - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? 
buffer_error::success : buffer_error::buffer_is_null; + return load_buffer_impl(std::move(buffer), "", fallback); } - buffer_error load_file(const char* path) { - lexy::read_file_result file = lexy::read_file<encoding_type, lexy::encoding_endianness::bom, MemoryResource>(path); + buffer_error load_file(const char* path, std::optional<Encoding> fallback) { + lexy::read_file_result file = lexy::read_file<lexy::default_encoding, lexy::encoding_endianness::bom>(path); + if (!file) { - _parse_state.reset(new parse_state_type { path, lexy::buffer<typename parse_state_type::file_type::encoding_type>() }); return ovdl::detail::from_underlying<buffer_error>(ovdl::detail::to_underlying(file.error())); } - _parse_state.reset(new parse_state_type { path, std::move(file).buffer() }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + + return load_buffer_impl(std::move(file).buffer(), path, fallback); } const char* path() const { + return path_impl(); + } + + static Encoding get_system_fallback() { + return _system_fallback_encoding.value_or(Encoding::Unknown); + } + + virtual ~ParseHandler() = default; + + protected: + constexpr virtual bool is_valid_impl() const = 0; + constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path = "", std::optional<Encoding> fallback = std::nullopt) = 0; + virtual const char* path_impl() const = 0; + + template<detail::IsStateType State, detail::IsEncoding BufferEncoding> + static constexpr auto generate_state = [](std::optional<State>* state, const char* path, auto&& buffer, Encoding encoding) { + if (path[0] != '\0') { + state->emplace( + path, + lexy::buffer<BufferEncoding>(std::move(buffer)), + encoding); + return; + } + state->emplace(lexy::buffer<BufferEncoding>(std::move(buffer)), encoding); + }; + + template<detail::IsStateType State> + static void create_state(std::optional<State>* state, const char* path, lexy::buffer<lexy::default_encoding>&& buffer, 
std::optional<Encoding> fallback) { + if (!_system_fallback_encoding.has_value()) { + _detect_system_fallback_encoding(); + } + bool is_bad_fallback = false; + if (fallback.has_value()) { + is_bad_fallback = fallback.value() == Encoding::Ascii || fallback.value() == Encoding::Utf8; + if (is_bad_fallback) + fallback = _system_fallback_encoding.value(); + } else { + fallback = _system_fallback_encoding.value(); + } + auto [encoding, is_alone] = encoding_detect::Detector { .default_fallback = fallback.value() }.detect_assess(buffer); + switch (encoding) { + using enum Encoding; + case Ascii: + case Utf8: { + generate_state<State, lexy::utf8_char_encoding>(state, path, std::move(buffer), encoding); + break; + } + case Unknown: + case Windows1251: + case Windows1252: { + generate_state<State, lexy::default_encoding>(state, path, std::move(buffer), encoding); + break; + } + default: + ovdl::detail::unreachable(); + } + + if (!is_alone) { + (*state)->logger().info("encoding type could not be distinguished"); + } + + if (is_bad_fallback) { + (*state)->logger().warning("fallback encoding cannot be ascii or utf8"); + } + + if (encoding == ovdl::detail::Encoding::Unknown) { + (*state)->logger().warning("could not detect encoding"); + } + } + + private: + inline static std::optional<Encoding> _system_fallback_encoding = std::nullopt; + static void _detect_system_fallback_encoding(); + }; + + template<detail::IsFileParseState ParseState> + struct BasicFileParseHandler : ParseHandler { + using parse_state_type = ParseState; + + virtual constexpr bool is_valid_impl() const { + if (!_parse_state) return false; + return _parse_state.value().file().is_valid(); + } + + constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path, std::optional<Encoding> fallback) { + if (buffer.data() == nullptr) return buffer_error::buffer_is_null; + create_state(&_parse_state, path, std::move(buffer), fallback); + return is_valid_impl() ? 
buffer_error::success : buffer_error::buffer_is_null; + } + + virtual const char* path_impl() const { if (!_parse_state) return ""; - return _parse_state->file().path(); + return _parse_state.value().file().path(); } parse_state_type& parse_state() { - return *_parse_state; + return _parse_state.value(); } const parse_state_type& parse_state() const { - return *_parse_state; + return _parse_state.value(); } + template<typename Encoding> constexpr const auto& buffer() const { - return _parse_state->file().buffer(); + return _parse_state.value().file().template get_buffer_as<Encoding>(); } protected: - std::unique_ptr<parse_state_type> _parse_state; + std::optional<parse_state_type> _parse_state; }; - template<IsParseState ParseState, typename MemoryResource = void> - struct BasicStateParseHandler : ParseHandler<BasicStateParseHandler<ParseState, MemoryResource>> { + template<detail::IsParseState ParseState> + struct BasicStateParseHandler : ParseHandler { using parse_state_type = ParseState; - using encoding_type = typename parse_state_type::ast_type::file_type::encoding_type; - constexpr bool is_valid() const { + virtual constexpr bool is_valid_impl() const { if (!_parse_state) return false; - return buffer().data() != nullptr; - } - - constexpr buffer_error load_buffer_size(const char* data, std::size_t size) { - lexy::buffer<encoding_type, MemoryResource> buffer(data, size); - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; - } - - constexpr buffer_error load_buffer(const char* start, const char* end) { - lexy::buffer<encoding_type, MemoryResource> buffer(start, end); - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? 
buffer_error::success : buffer_error::buffer_is_null; + return _parse_state.value().ast().file().is_valid(); } - buffer_error load_file(const char* path) { - lexy::read_file_result file = lexy::read_file<encoding_type, lexy::encoding_endianness::bom, MemoryResource>(path); - if (!file) { - _parse_state.reset(new parse_state_type { path, lexy::buffer<typename parse_state_type::ast_type::file_type::encoding_type>() }); - return ovdl::detail::from_underlying<buffer_error>(ovdl::detail::to_underlying(file.error())); - } - - _parse_state.reset(new parse_state_type { path, std::move(file).buffer() }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path, std::optional<Encoding> fallback) { + if (buffer.data() == nullptr) return buffer_error::buffer_is_null; + create_state(&_parse_state, path, std::move(buffer), fallback); + return is_valid_impl() ? buffer_error::success : buffer_error::buffer_is_null; } - const char* path() const { + virtual const char* path_impl() const { if (!_parse_state) return ""; - return _parse_state->ast().file().path(); + return _parse_state.value().ast().file().path(); } parse_state_type& parse_state() { - return *_parse_state; + return _parse_state.value(); } const parse_state_type& parse_state() const { - return *_parse_state; + return _parse_state.value(); } + template<typename Encoding> constexpr const auto& buffer() const { - return _parse_state->ast().file().buffer(); + return _parse_state.value().ast().file().template get_buffer_as<Encoding>(); } protected: - std::unique_ptr<parse_state_type> _parse_state; + std::optional<parse_state_type> _parse_state; }; }
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Warnings.hpp b/src/openvic-dataloader/detail/Warnings.hpp index ab718bc..3a0a239 100644 --- a/src/openvic-dataloader/detail/Warnings.hpp +++ b/src/openvic-dataloader/detail/Warnings.hpp @@ -1,18 +1,17 @@ #pragma once +#include <string> #include <string_view> -#include <openvic-dataloader/ParseWarning.hpp> - namespace ovdl::v2script::warnings { inline const std::string make_utf8_warning(std::string_view file_path) { - constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding."; + constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding:"; std::string message; if (file_path.empty()) { - message = "Buffer is a UTF-8 encoded string. " + std::string(message_suffix); + message = "Buffer is UTF-8 encoded. " + std::string(message_suffix); } else { - message = "File '" + std::string(file_path) + "' is a UTF-8 encoded file. " + std::string(message_suffix); + message = "File is UTF-8 encoded. 
" + std::string(message_suffix); } return message; diff --git a/src/openvic-dataloader/detail/dsl.hpp b/src/openvic-dataloader/detail/dsl.hpp index ccc1af6..fd8981a 100644 --- a/src/openvic-dataloader/detail/dsl.hpp +++ b/src/openvic-dataloader/detail/dsl.hpp @@ -1,16 +1,20 @@ #pragma once +#include <concepts> // IWYU pragma: keep #include <type_traits> #include <openvic-dataloader/NodeLocation.hpp> -#include <openvic-dataloader/ParseState.hpp> +#include <lexy/_detail/config.hpp> #include <lexy/callback/adapter.hpp> #include <lexy/callback/bind.hpp> #include <lexy/callback/container.hpp> #include <lexy/callback/fold.hpp> #include <lexy/dsl.hpp> +#include <lexy/dsl/literal.hpp> +#include <lexy/encoding.hpp> +#include "detail/InternalConcepts.hpp" #include "detail/StringLiteral.hpp" namespace ovdl::dsl { @@ -20,10 +24,46 @@ namespace ovdl::dsl { } template<typename Sink> - constexpr auto sink(Sink sink) { + constexpr auto bind_sink(Sink sink) { return lexy::bind_sink(sink, lexy::parse_state); } + template<typename ReturnT, typename Sink> + struct _sink_with_state { + using return_type = ReturnT; + + LEXY_EMPTY_MEMBER Sink _sink_cb; + + template<detail::IsStateType StateType, typename SinkCallback> + struct _sink_callback { + StateType& _state; + SinkCallback _sink_cb; + + using return_type = decltype(LEXY_MOV(_sink_cb).finish()); + + template<typename... Args> + constexpr void operator()(Args&&... args) { + lexy::_detail::invoke(_sink_cb, _state, LEXY_FWD(args)...); + } + + constexpr return_type finish() && { return LEXY_MOV(_sink_cb).finish(); } + }; + + template<typename... Args> + constexpr auto operator()(detail::IsStateType auto& state, Args... 
args) const -> decltype(_sink_cb(state, LEXY_FWD(args)...)) { + return _sink_cb(state, LEXY_FWD(args)...); + } + + constexpr auto sink(detail::IsStateType auto& state) const { + return _sink_callback<std::decay_t<decltype(state)>, decltype(_sink_cb.sink())> { state, _sink_cb.sink() }; + } + }; + + template<typename ReturnT, typename Sink> + constexpr auto sink(Sink&& sink) { + return bind_sink(_sink_with_state<ReturnT, Sink> { LEXY_FWD(sink) }); + } + template<typename Container, typename Callback> constexpr auto collect(Callback callback) { return sink(lexy::collect<Container>(callback)); @@ -34,49 +74,76 @@ namespace ovdl::dsl { return sink(lexy::collect(callback)); } - template<IsParseState StateType, typename T> + template<typename T> constexpr auto construct = callback<T*>( - [](StateType& state, ovdl::NodeLocation loc, auto&& arg) { - if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, lexy::nullopt>) + [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&& arg) { + if constexpr (std::same_as<std::decay_t<decltype(arg)>, lexy::nullopt>) return state.ast().template create<T>(loc); else return state.ast().template create<T>(loc, DRYAD_FWD(arg)); }, - [](StateType& state, ovdl::NodeLocation loc, auto&&... args) { + [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&&... 
args) { return state.ast().template create<T>(loc, DRYAD_FWD(args)...); }); - template<IsParseState StateType, typename T, typename ListType, bool DisableEmpty = false> + template<typename T, typename ListType, bool DisableEmpty = false> constexpr auto construct_list = callback<T*>( - [](StateType& state, const char* begin, ListType&& arg, const char* end) { + [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(arg)); }, - [](StateType& state, const char* begin, lexy::nullopt, const char* end) { + [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end)); }, - [](StateType& state, const char* begin, const char* end) { + [](detail::IsParseState auto& state, const char* begin, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end)); + }, + [](detail::IsParseState auto& state) { + return nullptr; }); - template<IsParseState StateType, typename T, typename ListType> - constexpr auto construct_list<StateType, T, ListType, true> = callback<T*>( - [](StateType& state, const char* begin, ListType&& arg, const char* end) { + template<typename T, typename ListType> + constexpr auto construct_list<T, ListType, true> = callback<T*>( + [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(arg)); }, - [](StateType& state, const char* begin, lexy::nullopt, const char* end) { + [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end)); }); - template<unsigned char LOW, unsigned char HIGH> - consteval auto make_range() { - if constexpr (LOW == HIGH) { - return ::lexy::dsl::lit_c<LOW>; - } else 
if constexpr (LOW == (HIGH - 1)) { - return ::lexy::dsl::lit_c<LOW> / ::lexy::dsl::lit_c<HIGH>; - } else { - return ::lexy::dsl::lit_c<LOW> / make_range<LOW + 1, HIGH>(); + template<typename CharT, CharT LowC, CharT HighC> + struct _crange : lexyd::char_class_base<_crange<CharT, LowC, HighC>> { + static_assert(LowC >= 0, "LowC cannot be less than 0"); + static_assert(HighC - LowC > 0, "LowC must be less than HighC"); + + static constexpr auto char_class_unicode() { + return LowC <= 0x7F && HighC <= 0x7F; } - } + + static LEXY_CONSTEVAL auto char_class_name() { + return "range"; + } + + static LEXY_CONSTEVAL auto char_class_ascii() { + lexy::_detail::ascii_set result; + if constexpr (LowC <= 0x7F && HighC <= 0x7F) + for (auto c = LowC; c <= HighC; c++) + result.insert(c); + return result; + } + + static constexpr auto char_class_match_cp([[maybe_unused]] char32_t cp) { + if constexpr (LowC <= 0x7F && HighC <= 0x7F) + return std::false_type {}; + else + return LowC <= cp && cp <= HighC; + } + }; + + template<auto LowC, decltype(LowC) HighC> + constexpr auto lit_c_range = _crange<LEXY_DECAY_DECLTYPE(LowC), LowC, HighC> {}; + + template<unsigned char LowC, unsigned char HighC> + constexpr auto lit_b_range = _crange<unsigned char, LowC, HighC> {}; template<auto Open, auto Close> constexpr auto position_brackets = lexy::dsl::brackets(lexy::dsl::position(lexy::dsl::lit_c<Open>), lexy::dsl::position(lexy::dsl::lit_c<Close>)); @@ -89,14 +156,13 @@ namespace ovdl::dsl { template<typename Production> constexpr auto p = lexy::dsl::position(lexy::dsl::p<Production>); - template<IsParseState ParseType, typename ReturnType, ovdl::detail::string_literal Keyword> + template<typename ReturnType, ovdl::detail::string_literal Keyword> static constexpr auto default_kw_value = dsl::callback<ReturnType*>( - [](ParseType& state, NodeLocation loc) { + [](detail::IsParseState auto& state, NodeLocation loc) { return state.ast().template create<ReturnType>(loc, 
state.ast().intern(Keyword.data(), Keyword.size())); }); template< - IsParseState ParseType, auto Identifier, typename RuleValue, ovdl::detail::string_literal Keyword, @@ -109,18 +175,17 @@ namespace ovdl::dsl { static constexpr auto value = Value; }; static constexpr auto rule = dsl::p<rule_t> >> Production; - static constexpr auto value = construct<ParseType, RuleValue>; + static constexpr auto value = construct<RuleValue>; }; template< - IsParseState ParseType, auto Identifier, typename RuleValue, ovdl::detail::string_literal Keyword, auto Production, auto Value> - struct fkeyword_rule : keyword_rule<ParseType, Identifier, RuleValue, Keyword, Production, Value> { - using base_type = keyword_rule<ParseType, Identifier, RuleValue, Keyword, Production, Value>; + struct fkeyword_rule : keyword_rule<Identifier, RuleValue, Keyword, Production, Value> { + using base_type = keyword_rule<Identifier, RuleValue, Keyword, Production, Value>; struct context_t; struct rule_t : base_type::rule_t { static constexpr auto flag = lexy::dsl::context_flag<context_t>; @@ -139,7 +204,7 @@ namespace ovdl::dsl { static constexpr auto make_flag = rule_t::flag.create(); static constexpr auto rule = dsl::p<rule_t> >> (rule_t::must >> rule_t::flag.set()) >> Production; - static constexpr auto value = construct<ParseType, RuleValue>; + static constexpr auto value = construct<RuleValue>; }; template<typename... 
Args> @@ -147,4 +212,71 @@ namespace ovdl::dsl { static constexpr auto flags = (Args::make_flag + ...); static constexpr auto p = (lexy::dsl::p<Args> | ...); }; + + template<typename Rule, typename RuleUtf, typename Tag> + struct _peek : lexyd::branch_base { + template<typename Reader> + struct bp { + typename Reader::iterator begin; + typename Reader::marker end; + + constexpr bool try_parse(const void*, Reader reader) { + using encoding = typename Reader::encoding; + + auto parser = [&] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + // We need to match the entire rule. + return lexy::token_parser_for<decltype(lexy::dsl::token(Rule {})), Reader> { reader }; + } else { + // We need to match the entire rule. + return lexy::token_parser_for<decltype(lexy::dsl::token(RuleUtf {})), Reader> { reader }; + } + }(); + + begin = reader.position(); + auto result = parser.try_parse(reader); + end = parser.end; + + return result; + } + + template<typename Context> + constexpr void cancel(Context& context) { + context.on(lexyd::_ev::backtracked {}, begin, end.position()); + } + + template<typename NextParser, typename Context, typename... Args> + LEXY_PARSER_FUNC bool finish(Context& context, Reader& reader, Args&&... args) { + context.on(lexyd::_ev::backtracked {}, begin, end.position()); + return NextParser::parse(context, reader, LEXY_FWD(args)...); + } + }; + + template<typename NextParser> + struct p { + template<typename Context, typename Reader, typename... Args> + LEXY_PARSER_FUNC static bool parse(Context& context, Reader& reader, Args&&... args) { + bp<Reader> impl {}; + if (!impl.try_parse(context.control_block, reader)) { + // Report that we've failed. 
+ using tag = lexy::_detail::type_or<Tag, lexy::peek_failure>; + auto err = lexy::error<Reader, tag>(impl.begin, impl.end.position()); + context.on(lexyd::_ev::error {}, err); + + // But recover immediately, as we wouldn't have consumed anything either way. + } + + context.on(lexyd::_ev::backtracked {}, impl.begin, impl.end); + return NextParser::parse(context, reader, LEXY_FWD(args)...); + } + }; + + template<typename Error> + static constexpr _peek<Rule, RuleUtf, Error> error = {}; + }; + + template<typename Rule, typename RuleUtf> + constexpr auto peek(Rule, RuleUtf) { + return _peek<Rule, RuleUtf, void> {}; + } }
\ No newline at end of file diff --git a/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp b/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp index abade40..5a98b40 100644 --- a/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp +++ b/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp @@ -1,8 +1,7 @@ -#include <stddef.h> - -#include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp> +#include "openvic-dataloader/v2script/AbstractSyntaxTree.hpp" #include <lexy/dsl/option.hpp> +#include <lexy/encoding.hpp> #include <lexy/input_location.hpp> #include <dryad/node.hpp> @@ -23,6 +22,15 @@ ListValue::ListValue(dryad::node_ctor ctor, StatementList statements) } } +ListValue::ListValue(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) { + insert_child_list_after(nullptr, statements); + if (statements.empty()) { + _last_statement = nullptr; + } else { + _last_statement = statements.back(); + } +} + FileTree::FileTree(dryad::node_ctor ctor, StatementList statements) : node_base(ctor) { insert_child_list_after(nullptr, statements); if (statements.empty()) { @@ -32,29 +40,22 @@ FileTree::FileTree(dryad::node_ctor ctor, StatementList statements) : node_base( } } -// static void _handle_string_characters(std::string& string, bool allow_newline) { -// size_t position = 0; -// for (auto& c : string) { -// switch (c) { -// case '\r': -// case '\n': -// if (allow_newline) goto END_LOOP; -// c = ' '; -// break; -// default: break; -// } -// END_LOOP: -// position++; -// } -// } - -std::string AbstractSyntaxTree::make_list_visualizer() const { +FileTree::FileTree(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) { + insert_child_list_after(nullptr, statements); + if (statements.empty()) { + _last_node = nullptr; + } else { + _last_node = statements.back(); + } +} + +std::string FileAbstractSyntaxTree::make_list_visualizer() const { const int INDENT_SIZE = 2; std::string result; unsigned int level = 0; - for (auto 
[event, node] : dryad::traverse(_tree)) { + for (auto [event, node] : dryad::traverse(this->_tree)) { if (event == dryad::traverse_event::exit) { --level; continue; @@ -66,7 +67,7 @@ std::string AbstractSyntaxTree::make_list_visualizer() const { dryad::visit_node( node, [&](const FlatValue* value) { - result.append(value->value(_symbol_interner)); + result.append(value->value(this->_symbol_interner)); }, [&](const ListValue* value) { }, @@ -89,19 +90,19 @@ std::string AbstractSyntaxTree::make_list_visualizer() const { return result; } -std::string AbstractSyntaxTree::make_native_visualizer() const { +std::string FileAbstractSyntaxTree::make_native_visualizer() const { constexpr int INDENT_SIZE = 2; std::string result; unsigned int level = 0; dryad::visit_tree( - _tree, + this->_tree, [&](const IdentifierValue* value) { - result.append(value->value(_symbol_interner)); + result.append(value->value(this->_symbol_interner)); }, [&](const StringValue* value) { - result.append(1, '"').append(value->value(_symbol_interner)).append(1, '"'); + result.append(1, '"').append(value->value(this->_symbol_interner)).append(1, '"'); }, [&](dryad::child_visitor<NodeKind> visitor, const ValueStatement* statement) { visitor(statement->value()); diff --git a/src/openvic-dataloader/v2script/EventGrammar.hpp b/src/openvic-dataloader/v2script/EventGrammar.hpp index 27f6459..130a233 100644 --- a/src/openvic-dataloader/v2script/EventGrammar.hpp +++ b/src/openvic-dataloader/v2script/EventGrammar.hpp @@ -11,8 +11,8 @@ #include "openvic-dataloader/NodeLocation.hpp" -#include "ParseState.hpp" #include "SimpleGrammar.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" #include "v2script/AiBehaviorGrammar.hpp" #include "v2script/EffectGrammar.hpp" @@ -28,7 +28,7 @@ namespace ovdl::v2script::grammar { struct MonthValue { static constexpr auto rule = lexy::dsl::p<Identifier<StringEscapeOption>>; static constexpr auto value = dsl::callback<ast::IdentifierValue*>( - 
[](ast::ParseState& state, ast::IdentifierValue* value) { + [](detail::IsParseState auto& state, ast::IdentifierValue* value) { bool is_number = true; for (auto* current = value->value(state.ast().symbol_interner()); *current; current++) { is_number = is_number && std::isdigit(*current); @@ -94,7 +94,7 @@ namespace ovdl::v2script::grammar { static constexpr auto value = dsl::callback<ast::EventStatement*>( - [](ast::ParseState& state, NodeLocation loc, ast::IdentifierValue* name, ast::ListValue* list) { + [](detail::IsParseState auto& state, NodeLocation loc, ast::IdentifierValue* name, ast::ListValue* list) { static auto country_decl = state.ast().intern_cstr("country_event"); static auto province_decl = state.ast().intern_cstr("province_event"); @@ -104,7 +104,7 @@ namespace ovdl::v2script::grammar { .finish(); } - return state.ast().create<ast::EventStatement>(loc, name->value(state.ast().symbol_interner()) == province_decl, list); + return state.ast().template create<ast::EventStatement>(loc, name->value(state.ast().symbol_interner()) == province_decl, list); }); }; diff --git a/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp b/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp index 96cce99..885413c 100644 --- a/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp +++ b/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp @@ -4,9 +4,12 @@ #include <lexy/_detail/config.hpp> #include <lexy/dsl.hpp> +#include <lexy/dsl/delimited.hpp> +#include <lexy/dsl/recover.hpp> +#include <lexy/dsl/unicode.hpp> -#include "ParseState.hpp" #include "SimpleGrammar.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" namespace ovdl::v2script::lua::grammar { @@ -21,90 +24,118 @@ namespace ovdl::v2script::lua::grammar { template<typename T> constexpr auto construct_list = v2script::grammar::construct_list<T>; - struct ParseOptions { - }; - - template<ParseOptions Options> struct StatementListBlock; static constexpr auto comment_specifier = 
LEXY_LIT("--") >> lexy::dsl::until(lexy::dsl::newline).or_eof(); - template<ParseOptions Options> struct Identifier { static constexpr auto rule = lexy::dsl::identifier(lexy::dsl::ascii::alpha_underscore, lexy::dsl::ascii::alpha_digit_underscore); - static constexpr auto value = callback<ast::IdentifierValue*>( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value); - }); + static constexpr auto value = + callback<ast::IdentifierValue*>( + [](detail::IsParseState auto& state, auto lexeme) { + auto value = state.ast().intern(lexeme.data(), lexeme.size()); + return state.ast().template create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value); + }); }; - template<ParseOptions Options> struct Value { static constexpr auto rule = lexy::dsl::identifier(lexy::dsl::ascii::digit / lexy::dsl::lit_c<'.'> / lexy::dsl::lit_c<'-'>); - static constexpr auto value = callback<ast::IdentifierValue*>( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value); - }); - }; - - template<ParseOptions Options> - struct String { - static constexpr auto rule = [] { - // Arbitrary code points that aren't control characters. 
- auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control; - - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c) | lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'\''>))(c); - }(); - static constexpr auto value = - lexy::as_string<std::string> >> - callback<ast::StringValue*>( - [](ast::ParseState& state, const char* begin, const std::string& str, const char* end) { - auto value = state.ast().intern(str.data(), str.length()); - return state.ast().create<ast::StringValue>(begin, end, value); + callback<ast::IdentifierValue*>( + [](detail::IsParseState auto& state, auto lexeme) { + auto value = state.ast().intern(lexeme.data(), lexeme.size()); + return state.ast().template create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value); }); }; - template<ParseOptions Options> + struct String : lexy::scan_production<ast::StringValue*>, + lexy::token_production { + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto c = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + // Arbitrary code points that aren't control characters. 
+ return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control; + } else { + return -lexy::dsl::unicode::control; + } + }(); + auto rule = lexy::dsl::quoted(c) | lexy::dsl::single_quoted(c); + auto begin = scanner.position(); + lexy::scan_result<std::string> str_result; + scanner.parse(str_result, rule); + if (!scanner || !str_result) + return lexy::scan_failed; + auto end = scanner.position(); + auto str = str_result.value(); + auto value = state.ast().intern(str.data(), str.size()); + return state.ast().template create<ast::StringValue>(begin, end, value); + } + + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open() | lexy::dsl::single_quoted.open()) >> lexy::dsl::scan; + static constexpr auto value = ovdl::v2script::grammar::convert_as_string<std::string> >> lexy::forward<ast::StringValue*>; + }; + struct Expression { - static constexpr auto rule = lexy::dsl::p<Value<Options>> | lexy::dsl::p<String<Options>>; + static constexpr auto rule = lexy::dsl::p<Value> | lexy::dsl::p<String>; static constexpr auto value = lexy::forward<ast::Value*>; }; - template<ParseOptions Options> struct AssignmentStatement { - static constexpr auto rule = - dsl::p<Identifier<Options>> >> - lexy::dsl::equal_sign >> - (lexy::dsl::p<Expression<Options>> | lexy::dsl::recurse_branch<StatementListBlock<Options>>); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto expression = lexy::dsl::p<Expression>; + auto statement_list = lexy::dsl::recurse_branch<StatementListBlock>; + + auto rhs_recover = lexy::dsl::recover(expression, statement_list).limit(right_brace); + auto rhs_try = lexy::dsl::try_(expression | statement_list, rhs_recover); + + auto identifier = dsl::p<Identifier> >> lexy::dsl::equal_sign + rhs_try; + + auto recover = lexy::dsl::recover(identifier).limit(right_brace); + return lexy::dsl::try_(identifier, recover); + }(); static constexpr auto value = callback<ast::AssignStatement*>( - [](ast::ParseState& state, const 
char* pos, ast::IdentifierValue* name, ast::Value* initializer) { - return state.ast().create<ast::AssignStatement>(pos, name, initializer); + [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) -> ast::AssignStatement* { + if (initializer == nullptr) return nullptr; + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); + }, + [](detail::IsParseState auto& state, ast::Value*) { + return nullptr; + }, + [](detail::IsParseState auto& state) { + return nullptr; }); }; - template<ParseOptions Options> struct StatementListBlock { - static constexpr auto rule = - dsl::curly_bracketed( - lexy::dsl::opt( - lexy::dsl::list( - lexy::dsl::recurse_branch<AssignmentStatement<Options>>, - lexy::dsl::trailing_sep(lexy::dsl::lit_c<','>)))); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + auto comma = lexy::dsl::lit_c<','>; + + auto assign_statement = lexy::dsl::recurse_branch<AssignmentStatement>; + auto assign_try = lexy::dsl::try_(assign_statement); + + auto curly_bracket = dsl::curly_bracketed.opt_list( + assign_try, + lexy::dsl::trailing_sep(comma)); + + return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace)); + }(); static constexpr auto value = lexy::as_list<ast::AssignStatementList> >> construct_list<ast::ListValue>; }; - template<ParseOptions Options = ParseOptions {}> struct File { // Allow arbitrary spaces between individual tokens. 
static constexpr auto whitespace = ovdl::v2script::grammar::whitespace_specifier | comment_specifier; - static constexpr auto rule = lexy::dsl::position + lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p<AssignmentStatement<Options>>); + static constexpr auto rule = lexy::dsl::position + lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p<AssignmentStatement>); static constexpr auto value = lexy::as_list<ast::AssignStatementList> >> construct<ast::FileTree>; }; diff --git a/src/openvic-dataloader/v2script/ModifierGrammar.hpp b/src/openvic-dataloader/v2script/ModifierGrammar.hpp index 22592d4..122a8c7 100644 --- a/src/openvic-dataloader/v2script/ModifierGrammar.hpp +++ b/src/openvic-dataloader/v2script/ModifierGrammar.hpp @@ -10,9 +10,9 @@ #include "openvic-dataloader/NodeLocation.hpp" -#include "ParseState.hpp" #include "SimpleGrammar.hpp" #include "TriggerGrammar.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" namespace ovdl::v2script::grammar { @@ -22,9 +22,9 @@ namespace ovdl::v2script::grammar { struct FactorStatement { static constexpr auto rule = lexy::dsl::position(factor_keyword) >> (lexy::dsl::equal_sign + lexy::dsl::p<Identifier<StringEscapeOption>>); static constexpr auto value = dsl::callback<ast::AssignStatement*>( - [](ast::ParseState& state, NodeLocation loc, ast::IdentifierValue* value) { - auto* factor = state.ast().create<ast::IdentifierValue>(loc, state.ast().intern("factor")); - return state.ast().create<ast::AssignStatement>(loc, factor, value); + [](detail::IsParseState auto& state, NodeLocation loc, ast::IdentifierValue* value) { + auto* factor = state.ast().template create<ast::IdentifierValue>(loc, state.ast().intern("factor")); + return state.ast().template create<ast::AssignStatement>(loc, factor, value); }); }; @@ -49,9 +49,9 @@ namespace ovdl::v2script::grammar { lexy::dsl::position(modifier_keyword) >> lexy::dsl::equal_sign >> lexy::dsl::p<ModifierList>; static constexpr auto value = 
dsl::callback<ast::AssignStatement*>( - [](ast::ParseState& state, NodeLocation loc, ast::ListValue* list) { - auto* factor = state.ast().create<ast::IdentifierValue>(loc, state.ast().intern("modifier")); - return state.ast().create<ast::AssignStatement>(loc, factor, list); + [](detail::IsParseState auto& state, NodeLocation loc, ast::ListValue* list) { + auto* factor = state.ast().template create<ast::IdentifierValue>(loc, state.ast().intern("modifier")); + return state.ast().template create<ast::AssignStatement>(loc, factor, list); }); }; }
\ No newline at end of file diff --git a/src/openvic-dataloader/v2script/ParseState.hpp b/src/openvic-dataloader/v2script/ParseState.hpp index 8e29bf5..954e39d 100644 --- a/src/openvic-dataloader/v2script/ParseState.hpp +++ b/src/openvic-dataloader/v2script/ParseState.hpp @@ -1,23 +1,24 @@ #pragma once -#include <openvic-dataloader/File.hpp> -#include <openvic-dataloader/ParseState.hpp> #include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp> #include <lexy/encoding.hpp> +#include "../openvic-dataloader/ParseState.hpp" +#include "AbstractSyntaxTree.hpp" +#include "File.hpp" +#include "detail/InternalConcepts.hpp" + namespace ovdl::v2script::ast { - using File = ovdl::BasicFile<lexy::default_encoding, Node>; - struct AbstractSyntaxTree : ovdl::BasicAbstractSyntaxTree<File, FileTree> { - using BasicAbstractSyntaxTree::BasicAbstractSyntaxTree; + + struct FileAbstractSyntaxTree : ovdl::BasicAbstractSyntaxTree<ovdl::BasicFile<Node>, FileTree> { + using ovdl::BasicAbstractSyntaxTree<ovdl::BasicFile<Node>, FileTree>::BasicAbstractSyntaxTree; std::string make_list_visualizer() const; std::string make_native_visualizer() const; }; - using ParseState = ovdl::ParseState<AbstractSyntaxTree>; + using ParseState = ovdl::ParseState<FileAbstractSyntaxTree>; - static_assert(IsFile<ast::File>, "File failed IsFile concept"); - static_assert(IsAst<ast::AbstractSyntaxTree>, "AbstractSyntaxTree failed IsAst concept"); - static_assert(IsParseState<ast::ParseState>, "ParseState failed IsParseState concept"); + static_assert(detail::IsParseState<ast::ParseState>, "ParseState failed IsParseState concept"); }
\ No newline at end of file diff --git a/src/openvic-dataloader/v2script/Parser.cpp b/src/openvic-dataloader/v2script/Parser.cpp index eb491d5..23dada7 100644 --- a/src/openvic-dataloader/v2script/Parser.cpp +++ b/src/openvic-dataloader/v2script/Parser.cpp @@ -4,16 +4,15 @@ #include <iostream> #include <optional> #include <string> +#include <type_traits> #include <utility> -#include <openvic-dataloader/DiagnosticLogger.hpp> +#include <openvic-dataloader/Error.hpp> #include <openvic-dataloader/NodeLocation.hpp> -#include <openvic-dataloader/ParseError.hpp> -#include <openvic-dataloader/ParseWarning.hpp> -#include <openvic-dataloader/detail/LexyReportError.hpp> +#include <openvic-dataloader/detail/Concepts.hpp> +#include <openvic-dataloader/detail/Encoding.hpp> #include <openvic-dataloader/detail/OStreamOutputIterator.hpp> -#include <openvic-dataloader/detail/utility/Concepts.hpp> -#include <openvic-dataloader/detail/utility/Utility.hpp> +#include <openvic-dataloader/detail/Utility.hpp> #include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp> #include <lexy/action/parse.hpp> @@ -29,10 +28,8 @@ #include <fmt/core.h> -#include "openvic-dataloader/Error.hpp" - +#include "DiagnosticLogger.hpp" #include "ParseState.hpp" -#include "detail/DetectUtf8.hpp" #include "detail/NullBuff.hpp" #include "detail/ParseHandler.hpp" #include "detail/Warnings.hpp" @@ -44,29 +41,46 @@ using namespace ovdl; using namespace ovdl::v2script; -/// BufferHandler /// +/// ParseHandler /// struct Parser::ParseHandler final : detail::BasicStateParseHandler<v2script::ast::ParseState> { - constexpr bool is_exclusive_utf8() const { - return detail::is_utf8_no_ascii(buffer()); - } - template<typename Node> std::optional<DiagnosticLogger::error_range> parse() { - auto result = lexy::parse<Node>(buffer(), *_parse_state, _parse_state->logger().error_callback()); + if (parse_state().encoding() == ovdl::detail::Encoding::Utf8) { + parse_state().logger().warning(warnings::make_utf8_warning(path())); + 
} + + auto result = [&] { + switch (parse_state().encoding()) { + using enum detail::Encoding; + case Ascii: + case Utf8: + return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback()); + case Unknown: + case Windows1251: + case Windows1252: + return lexy::parse<Node>(buffer<lexy::default_encoding>(), parse_state(), parse_state().logger().error_callback()); + default: + ovdl::detail::unreachable(); + } + }(); if (!result) { - return _parse_state->logger().get_errors(); + return parse_state().logger().get_errors(); } - _parse_state->ast().set_root(result.value()); + parse_state().ast().set_root(result.value()); return std::nullopt; } ast::FileTree* root() { - return _parse_state->ast().root(); + return parse_state().ast().root(); + } + + Parser::error_range get_errors() { + return parse_state().logger().get_errors(); } }; -/// BufferHandler /// +/// ParseHandler /// Parser::Parser() : _parse_handler(std::make_unique<ParseHandler>()) { @@ -82,29 +96,29 @@ Parser::Parser(Parser&&) = default; Parser& Parser::operator=(Parser&&) = default; Parser::~Parser() = default; -Parser Parser::from_buffer(const char* data, std::size_t size) { +Parser Parser::from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(data, size)); + return std::move(result.load_from_buffer(data, size, encoding_fallback)); } -Parser Parser::from_buffer(const char* start, const char* end) { +Parser Parser::from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_buffer(start, end)); + return std::move(result.load_from_buffer(start, end, encoding_fallback)); } -Parser Parser::from_string(const std::string_view string) { +Parser Parser::from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return 
std::move(result.load_from_string(string)); + return std::move(result.load_from_string(string, encoding_fallback)); } -Parser Parser::from_file(const char* path) { +Parser Parser::from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } -Parser Parser::from_file(const std::filesystem::path& path) { +Parser Parser::from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) { Parser result; - return std::move(result.load_from_file(path)); + return std::move(result.load_from_file(path, encoding_fallback)); } /// @@ -128,38 +142,38 @@ constexpr void Parser::_run_load_func(detail::LoadCallback<Parser::ParseHandler* if (!error_message.empty()) { _has_error = true; _has_fatal_error = true; - _parse_handler->parse_state().logger().create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message)); + _parse_handler->parse_state().logger().template create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message)); } if (has_error() && &_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream.get()); } } -constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size) { +constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) { // Type can't be deduced? - _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback); return *this; } -constexpr Parser& Parser::load_from_buffer(const char* start, const char* end) { +constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) { // Type can't be deduced? 
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end); + _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback); return *this; } -constexpr Parser& Parser::load_from_string(const std::string_view string) { - return load_from_buffer(string.data(), string.size()); +constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) { + return load_from_buffer(string.data(), string.size(), encoding_fallback); } -Parser& Parser::load_from_file(const char* path) { +Parser& Parser::load_from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) { set_file_path(path); // Type can be deduced?? - _run_load_func(std::mem_fn(&ParseHandler::load_file), path); + _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback); return *this; } -Parser& Parser::load_from_file(const std::filesystem::path& path) { - return load_from_file(path.string().c_str()); +Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) { + return load_from_file(path.string().c_str(), encoding_fallback); } /* REQUIREMENTS: @@ -173,11 +187,7 @@ bool Parser::simple_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse<grammar::File<grammar::NoStringEscapeOption>>(); + std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::File>(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { @@ -196,14 +206,11 @@ bool Parser::event_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = 
_parse_handler->parse<grammar::EventFile>(); + std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::EventFile>(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); @@ -218,14 +225,11 @@ bool Parser::decision_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse<grammar::DecisionFile>(); + std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::DecisionFile>(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); @@ -240,14 +244,11 @@ bool Parser::lua_defines_parse() { return false; } - if (_parse_handler->is_exclusive_utf8()) { - _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path)); - } - - auto errors = _parse_handler->parse<lua::grammar::File<>>(); + std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<lua::grammar::File>(); _has_error = _parse_handler->parse_state().logger().errored(); _has_warning = _parse_handler->parse_state().logger().warned(); if (!_parse_handler->root()) { + _has_error = true; _has_fatal_error = true; if (&_error_stream.get() != &detail::cnull) { print_errors_to(_error_stream); @@ -273,48 +274,66 @@ std::string Parser::make_list_string() const { return _parse_handler->parse_state().ast().make_list_visualizer(); } +// TODO: Remove reinterpret_cast +// WARNING: This almost certainly breaks on utf16 and utf32 encodings, 
luckily we don't parse in that format +// This is purely to silence the node_location errors because char8_t is useless +#define REINTERPRET_IT(IT) reinterpret_cast<const std::decay_t<decltype(buffer)>::encoding::char_type*>((IT)) + const FilePosition Parser::get_position(const ast::Node* node) const { if (!node || !node->is_linked_in_tree()) { return {}; } - auto node_location = _parse_handler->parse_state().ast().location_of(node); + + NodeLocation node_location; + + node_location = _parse_handler->parse_state().ast().location_of(node); + if (node_location.is_synthesized()) { - return {}; + return FilePosition {}; } - auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), node_location.begin()); - FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; - if (node_location.begin() < node_location.end()) { - auto loc_end = lexy::get_input_location(_parse_handler->buffer(), node_location.end(), loc_begin.anchor()); - result.end_line = loc_end.line_nr(); - result.end_column = loc_end.column_nr(); - } - return result; + return _parse_handler->parse_state().ast().file().visit_buffer( + [&](auto&& buffer) -> FilePosition { + auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.begin())); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; + if (node_location.begin() < node_location.end()) { + auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.end()), loc_begin.anchor()); + result.end_line = loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; + }); } Parser::error_range Parser::get_errors() const { - return _parse_handler->parse_state().logger().get_errors(); + return _parse_handler->get_errors(); } const FilePosition Parser::get_error_position(const error::Error* error) const { if (!error || !error->is_linked_in_tree()) { return {}; } + auto err_location = 
_parse_handler->parse_state().logger().location_of(error); if (err_location.is_synthesized()) { - return {}; + return FilePosition {}; } - auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin()); - FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; - if (err_location.begin() < err_location.end()) { - auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor()); - result.end_line = loc_end.line_nr(); - result.end_column = loc_end.column_nr(); - } - return result; + return _parse_handler->parse_state().ast().file().visit_buffer( + [&](auto&& buffer) -> FilePosition { + auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin())); + FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() }; + if (err_location.begin() < err_location.end()) { + auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor()); + result.end_line = loc_end.line_nr(); + result.end_column = loc_end.column_nr(); + } + return result; + }); } +#undef REINTERPRET_IT + void Parser::print_errors_to(std::basic_ostream<char>& stream) const { auto errors = get_errors(); if (errors.empty()) return; @@ -324,19 +343,9 @@ void Parser::print_errors_to(std::basic_ostream<char>& stream) const { [&](const error::BufferError* buffer_error) { stream << "buffer error: " << buffer_error->message() << '\n'; }, - [&](const error::ParseError* parse_error) { - auto position = get_error_position(parse_error); - std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n'; - }, - [&](dryad::child_visitor<error::ErrorKind> visitor, const error::Semantic* semantic) { - auto position = 
get_error_position(semantic); - std::string pos_str = ": "; - if (!position.is_empty()) { - pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column); - } - stream << _file_path << pos_str << semantic->message() << '\n'; - auto annotations = semantic->annotations(); + [&](dryad::child_visitor<error::ErrorKind> visitor, const error::AnnotatedError* annotated_error) { + stream << annotated_error->message() << '\n'; + auto annotations = annotated_error->annotations(); for (auto annotation : annotations) { visitor(annotation); } diff --git a/src/openvic-dataloader/v2script/SimpleGrammar.hpp b/src/openvic-dataloader/v2script/SimpleGrammar.hpp index 37e295f..d42ce07 100644 --- a/src/openvic-dataloader/v2script/SimpleGrammar.hpp +++ b/src/openvic-dataloader/v2script/SimpleGrammar.hpp @@ -5,10 +5,22 @@ #include <lexy/callback.hpp> #include <lexy/dsl.hpp> +#include <lexy/dsl/any.hpp> #include <lexy/dsl/identifier.hpp> +#include <lexy/dsl/option.hpp> +#include <lexy/dsl/peek.hpp> +#include <lexy/dsl/punctuator.hpp> +#include <lexy/dsl/recover.hpp> +#include <lexy/dsl/scan.hpp> #include <lexy/dsl/symbol.hpp> - -#include "ParseState.hpp" +#include <lexy/dsl/unicode.hpp> +#include <lexy/encoding.hpp> +#include <lexy/input/base.hpp> +#include <lexy/input/buffer.hpp> +#include <lexy/lexeme.hpp> + +#include "detail/Convert.hpp" +#include "detail/InternalConcepts.hpp" #include "detail/dsl.hpp" // Grammar Definitions // @@ -23,17 +35,28 @@ */ namespace ovdl::v2script::grammar { template<typename T> - constexpr auto construct = dsl::construct<ast::ParseState, T>; + constexpr auto construct = dsl::construct<T>; template<typename T, bool DisableEmpty = false, typename ListType = ast::AssignStatementList> - constexpr auto construct_list = dsl::construct_list<ast::ParseState, T, ListType, DisableEmpty>; + constexpr auto construct_list = dsl::construct_list<T, ListType, DisableEmpty>; + + struct ConvertErrorHandler { + static constexpr void 
on_invalid_character(detail::IsStateType auto& state, auto reader) { + state.logger().warning("invalid character value '{}' found.", static_cast<int>(reader.peek())) // + .primary(BasicNodeLocation { reader.position() }, "here") + .finish(); + } + }; + + template<typename String> + constexpr auto convert_as_string = convert::convert_as_string<String, ConvertErrorHandler>; struct ParseOptions { /// @brief Makes string parsing avoid string escapes bool NoStringEscape; }; - static constexpr ParseOptions NoStringEscapeOption = ParseOptions { true }; - static constexpr ParseOptions StringEscapeOption = ParseOptions { false }; + static constexpr auto NoStringEscapeOption = ParseOptions { true }; + static constexpr auto StringEscapeOption = ParseOptions { false }; /* REQUIREMENTS: DAT-630 */ static constexpr auto whitespace_specifier = lexy::dsl::ascii::blank / lexy::dsl::ascii::newline; @@ -50,24 +73,28 @@ namespace ovdl::v2script::grammar { ascii / lexy::dsl::lit_b<0x8A> / lexy::dsl::lit_b<0x8C> / lexy::dsl::lit_b<0x8E> / lexy::dsl::lit_b<0x92> / lexy::dsl::lit_b<0x97> / lexy::dsl::lit_b<0x9A> / lexy::dsl::lit_b<0x9C> / - dsl::make_range<0x9E, 0x9F>() / - dsl::make_range<0xC0, 0xD6>() / - dsl::make_range<0xD8, 0xF6>() / - dsl::make_range<0xF8, 0xFF>(); + dsl::lit_b_range<0x9E, 0x9F> / + dsl::lit_b_range<0xC0, 0xD6> / + dsl::lit_b_range<0xD8, 0xF6> / + dsl::lit_b_range<0xF8, 0xFF>; static constexpr auto windows_1251_data_specifier_additions = - dsl::make_range<0x80, 0x81>() / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / + dsl::lit_b_range<0x80, 0x81> / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D> / lexy::dsl::lit_b<0x9F> / - dsl::make_range<0xA1, 0xA3>() / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> / + dsl::lit_b_range<0xA1, 0xA3> / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> / lexy::dsl::lit_b<0xAF> / - 
dsl::make_range<0xB2, 0xB4>() / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> / - dsl::make_range<0xBC, 0xBF>() / + dsl::lit_b_range<0xB2, 0xB4> / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> / + dsl::lit_b_range<0xBC, 0xBF> / lexy::dsl::lit_b<0xD7> / lexy::dsl::lit_b<0xF7>; static constexpr auto data_specifier = windows_1252_data_specifier / windows_1251_data_specifier_additions; static constexpr auto data_char_class = LEXY_CHAR_CLASS("DataSpecifier", data_specifier); + static constexpr auto utf_data_specifier = lexy::dsl::unicode::xid_continue / LEXY_ASCII_ONE_OF("+:@%&'-."); + + static constexpr auto utf_char_class = LEXY_CHAR_CLASS("DataSpecifier", utf_data_specifier); + static constexpr auto escaped_symbols = lexy::symbol_table<char> // .map<'"'>('"') .map<'\''>('\'') @@ -79,50 +106,121 @@ namespace ovdl::v2script::grammar { .map<'r'>('\r') .map<'t'>('\t'); - static constexpr auto id = lexy::dsl::identifier(data_char_class); + static constexpr auto id = lexy::dsl::identifier(ascii); template<ParseOptions Options> struct SimpleGrammar { struct StatementListBlock; - struct Identifier { - static constexpr auto rule = lexy::dsl::identifier(data_char_class); - static constexpr auto value = dsl::callback<ast::IdentifierValue*>( - [](ast::ParseState& state, auto lexeme) { - auto value = state.ast().intern(lexeme.data(), lexeme.size()); - return state.ast().create<ast::IdentifierValue>(ovdl::NodeLocation::make_from(lexeme.begin(), lexeme.end()), value); - }); + struct Identifier : lexy::scan_production<ast::IdentifierValue*>, + lexy::token_production { + + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + using char_type = typename encoding::char_type; + + std::basic_string<char_type> value_result; + + auto content_begin = scanner.position(); + do { + if constexpr (std::same_as<encoding, 
lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + if (lexy::scan_result<lexy::lexeme<Reader>> ascii_result; scanner.branch(ascii_result, lexy::dsl::identifier(ascii))) { + value_result.append(ascii_result.value().begin(), ascii_result.value().end()); + continue; + } + + char_type char_array[] { *scanner.position(), char_type {} }; + auto input = lexy::range_input(&char_array[0], &char_array[1]); + auto reader = input.reader(); + convert::map_value val = convert::try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + ConvertErrorHandler::on_invalid_character(state, reader); + continue; + } + + if (!val.is_pass()) { + // non-pass characters are not valid ascii and are mapped to utf8 values + value_result.append(val._value); + scanner.parse(data_char_class); + } else { + break; + } + } else { + auto lexeme_result = scanner.template parse<lexy::lexeme<Reader>>(lexy::dsl::identifier(utf_char_class)); + if (lexeme_result) { + value_result.append(lexeme_result.value().begin(), lexeme_result.value().size()); + break; + } + } + } while (scanner); + auto content_end = scanner.position(); + + if (value_result.empty()) { + return lexy::scan_failed; + } + + auto value = state.ast().intern(value_result); + return state.ast().template create<ast::IdentifierValue>(ovdl::NodeLocation::make_from(content_begin, content_end), value); + } + + static constexpr auto rule = dsl::peek(data_char_class, utf_char_class) >> lexy::dsl::scan; }; /* REQUIREMENTS: * DAT-633 * DAT-634 */ - struct StringExpression { - static constexpr auto rule = [] { - if constexpr (Options.NoStringEscape) { - auto c = dsl::make_range<0x20, 0xFF>() / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>; - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c); - } else { - // Arbitrary code points that aren't control characters. 
- auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control; - - // Escape sequences start with a backlash. - // They either map one of the symbols, - // or a Unicode code point of the form uXXXX. - auto escape = lexy::dsl::backslash_escape // - .symbol<escaped_symbols>(); - return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c, escape); - } - }(); - - static constexpr auto value = - lexy::as_string<std::string> >> - dsl::callback<ast::StringValue*>( - [](ast::ParseState& state, const char* begin, auto&& str, const char* end) { - auto value = state.ast().intern(str.data(), str.length()); - return state.ast().create<ast::StringValue>(ovdl::NodeLocation::make_from(begin, end), value); - }); + struct StringExpression : lexy::scan_production<ast::StringValue*>, + lexy::token_production { + + template<typename Context, typename Reader> + static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) { + using encoding = typename Reader::encoding; + + constexpr auto rule = [] { + if constexpr (Options.NoStringEscape) { + auto c = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + return dsl::lit_b_range<0x20, 0xFF> / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>; + } else { + return -lexy::dsl::unicode::control; + } + }(); + return lexy::dsl::quoted(c); + } else { + // Arbitrary code points that aren't control characters. + auto c = [] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control; + } else { + return -lexy::dsl::unicode::control; + } + }(); + + // Escape sequences start with a backlash. + // They either map one of the symbols, + // or a Unicode code point of the form uXXXX. 
+ auto escape = lexy::dsl::backslash_escape // + .symbol<escaped_symbols>(); + return lexy::dsl::quoted(c, escape); + } + }(); + + auto begin = scanner.position(); + lexy::scan_result<std::string> str_result; + scanner.parse(str_result, rule); + if (!scanner || !str_result) + return lexy::scan_failed; + auto end = scanner.position(); + auto str = str_result.value(); + auto value = state.ast().intern(str.data(), str.size()); + return state.ast().template create<ast::StringValue>(ovdl::NodeLocation::make_from(begin, end), value); + } + + static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open()) >> lexy::dsl::scan; + static constexpr auto value = convert_as_string<std::string> >> lexy::forward<ast::StringValue*>; }; /* REQUIREMENTS: DAT-638 */ @@ -132,59 +230,112 @@ namespace ovdl::v2script::grammar { }; struct SimpleAssignmentStatement { - static constexpr auto rule = - dsl::p<Identifier> >> - (lexy::dsl::equal_sign >> - (lexy::dsl::p<ValueExpression> | lexy::dsl::recurse_branch<StatementListBlock>)); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto value_expression = lexy::dsl::p<ValueExpression>; + auto statement_list_expression = lexy::dsl::recurse_branch<StatementListBlock>; + + auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace); + auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover); + + auto identifier = + dsl::p<Identifier> >> + (lexy::dsl::equal_sign >> rhs_try); + + auto recover = lexy::dsl::recover(identifier).limit(right_brace); + return lexy::dsl::try_(identifier, recover); + }(); static constexpr auto value = construct<ast::AssignStatement>; }; /* REQUIREMENTS: DAT-639 */ struct AssignmentStatement { - static constexpr auto rule = - dsl::p<Identifier> >> + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto value_expression = lexy::dsl::p<ValueExpression>; + auto 
statement_list_expression = lexy::dsl::recurse_branch<StatementListBlock>; + + auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace); + auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover); + + auto identifier = + dsl::p<Identifier> >> (lexy::dsl::equal_sign >> - (lexy::dsl::p<ValueExpression> | lexy::dsl::recurse_branch<StatementListBlock>) | - lexy::dsl::else_ >> lexy::dsl::return_) | - dsl::p<StringExpression> | - lexy::dsl::recurse_branch<StatementListBlock>; + rhs_try | + lexy::dsl::else_ >> lexy::dsl::return_); + + auto string_expression = dsl::p<StringExpression>; + auto statement_list = lexy::dsl::recurse_branch<StatementListBlock>; + + return identifier | string_expression | statement_list; + }(); static constexpr auto value = dsl::callback<ast::Statement*>( - [](ast::ParseState& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { - return state.ast().create<ast::AssignStatement>(pos, name, initializer); + [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); }, - [](ast::ParseState& state, const char* pos, ast::Value* left, lexy::nullopt = {}) { - return state.ast().create<ast::ValueStatement>(pos, left); + [](detail::IsParseState auto& state, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); }, - [](ast::ParseState& state, ast::Value* left) { - return state.ast().create<ast::ValueStatement>(state.ast().location_of(left), left); + [](detail::IsParseState auto& state, bool&, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) { + return state.ast().template create<ast::AssignStatement>(pos, name, initializer); + }, + [](detail::IsParseState auto& state, bool&, bool&, const 
char* pos, ast::Value* name) { + return state.ast().template create<ast::ValueStatement>(pos, name); + }, + [](detail::IsParseState auto& state, const char* pos, ast::Value* left, lexy::nullopt = {}) { + return state.ast().template create<ast::ValueStatement>(pos, left); + }, + [](detail::IsParseState auto& state, bool&, const char* pos, ast::Value* left, lexy::nullopt = {}) { + return state.ast().template create<ast::ValueStatement>(pos, left); + }, + [](detail::IsParseState auto& state, ast::Value* left) -> ast::ValueStatement* { + if (left == nullptr) return nullptr; + return state.ast().template create<ast::ValueStatement>(state.ast().location_of(left), left); + }, + [](detail::IsParseState auto& state, bool&, ast::Value* left) -> ast::ValueStatement* { + if (left == nullptr) return nullptr; + return state.ast().template create<ast::ValueStatement>(state.ast().location_of(left), left); }); }; /* REQUIREMENTS: DAT-640 */ struct StatementListBlock { - static constexpr auto rule = - dsl::curly_bracketed( - (lexy::dsl::opt(lexy::dsl::list(lexy::dsl::recurse_branch<AssignmentStatement>)) + - lexy::dsl::opt(lexy::dsl::semicolon))); + static constexpr auto rule = [] { + auto right_brace = lexy::dsl::lit_c<'}'>; + + auto assign_statement = lexy::dsl::recurse_branch<AssignmentStatement>; + + auto assign_try = lexy::dsl::try_(assign_statement); + auto assign_opt = lexy::dsl::opt(lexy::dsl::list(assign_try)); + + auto curly_bracket = dsl::curly_bracketed(assign_opt + lexy::dsl::opt(lexy::dsl::semicolon)); + + return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace)); + }(); static constexpr auto value = lexy::as_list<ast::StatementList> >> dsl::callback<ast::ListValue*>( - [](ast::ParseState& state, const char* begin, auto&& list, const char* end) { + [](detail::IsParseState auto& state, const char* begin, auto&& list, const char* end) { if constexpr (std::is_same_v<std::decay_t<decltype(list)>, lexy::nullopt>) { - return 
state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); } else { - return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); } }, - [](ast::ParseState& state, const char* begin, auto&& list, auto&& semicolon, const char* end) { + [](detail::IsParseState auto& state, const char* begin, auto&& list, auto&& semicolon, const char* end) { if constexpr (std::is_same_v<std::decay_t<decltype(list)>, lexy::nullopt>) { - return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end)); } else { - return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); + return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list)); } + }, + [](detail::IsParseState auto& state, lexy::nullopt fail = {}) { + return fail; }); }; }; @@ -198,22 +349,20 @@ namespace ovdl::v2script::grammar { template<ParseOptions Options> using SAssignStatement = typename SimpleGrammar<Options>::SimpleAssignmentStatement; - template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::ParseState, ast::IdentifierValue, Keyword>> + template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::IdentifierValue, Keyword>> using keyword_rule = dsl::keyword_rule< - ast::ParseState, id, ast::AssignStatement, Keyword, Production, Value>; - template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::ParseState, ast::IdentifierValue, Keyword>> + template<ovdl::detail::string_literal Keyword, auto Production, auto Value = 
dsl::default_kw_value<ast::IdentifierValue, Keyword>> using fkeyword_rule = dsl::fkeyword_rule< - ast::ParseState, id, ast::AssignStatement, Keyword, Production, Value>; template<ParseOptions Options> - struct File { + struct BasicFile { // Allow arbitrary spaces between individual tokens. static constexpr auto whitespace = whitespace_specifier | comment_specifier; @@ -223,4 +372,6 @@ namespace ovdl::v2script::grammar { static constexpr auto value = lexy::as_list<ast::StatementList> >> construct<ast::FileTree>; }; + + using File = BasicFile<NoStringEscapeOption>; } |