aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author George L. Albany <Megacake1234@gmail.com> 2024-06-18 19:43:20 +0200
committer GitHub <noreply@github.com> 2024-06-18 19:43:20 +0200
commit 8b623bf4087aa360842ad31145d4ab6946cee9aa (patch)
tree f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src
parent 7b521d6023113372cf6b02e562828273c4040f0e (diff)
parent b0c3ba3f91926b0c95625bdbf4aab69269130b13 (diff)
Merge pull request #46 from OpenVicProject/fix/char-detection
Add runtime encoding detection and conversion
Diffstat (limited to 'src')
-rw-r--r--src/headless/main.cpp24
-rw-r--r--src/openvic-dataloader/AbstractSyntaxTree.cpp2
-rw-r--r--src/openvic-dataloader/AbstractSyntaxTree.hpp89
-rw-r--r--src/openvic-dataloader/DiagnosticLogger.cpp5
-rw-r--r--src/openvic-dataloader/DiagnosticLogger.hpp492
-rw-r--r--src/openvic-dataloader/File.cpp12
-rw-r--r--src/openvic-dataloader/File.hpp139
-rw-r--r--src/openvic-dataloader/NodeLocation.cpp26
-rw-r--r--src/openvic-dataloader/ParseState.hpp105
-rw-r--r--src/openvic-dataloader/csv/CsvGrammar.hpp244
-rw-r--r--src/openvic-dataloader/csv/CsvParseState.hpp26
-rw-r--r--src/openvic-dataloader/csv/Parser.cpp182
-rw-r--r--src/openvic-dataloader/detail/Convert.hpp577
-rw-r--r--src/openvic-dataloader/detail/Detect.cpp351
-rw-r--r--src/openvic-dataloader/detail/Detect.hpp627
-rw-r--r--src/openvic-dataloader/detail/DetectUtf8.hpp53
-rw-r--r--src/openvic-dataloader/detail/Errors.hpp25
-rw-r--r--src/openvic-dataloader/detail/InternalConcepts.hpp127
-rw-r--r--src/openvic-dataloader/detail/ParseHandler.cpp347
-rw-r--r--src/openvic-dataloader/detail/ParseHandler.hpp199
-rw-r--r--src/openvic-dataloader/detail/Warnings.hpp9
-rw-r--r--src/openvic-dataloader/detail/dsl.hpp194
-rw-r--r--src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp53
-rw-r--r--src/openvic-dataloader/v2script/EventGrammar.hpp8
-rw-r--r--src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp133
-rw-r--r--src/openvic-dataloader/v2script/ModifierGrammar.hpp14
-rw-r--r--src/openvic-dataloader/v2script/ParseState.hpp19
-rw-r--r--src/openvic-dataloader/v2script/Parser.cpp195
-rw-r--r--src/openvic-dataloader/v2script/SimpleGrammar.hpp307
29 files changed, 3860 insertions, 724 deletions
diff --git a/src/headless/main.cpp b/src/headless/main.cpp
index 7279a6e..0ad6115 100644
--- a/src/headless/main.cpp
+++ b/src/headless/main.cpp
@@ -6,7 +6,7 @@
#include <iterator>
#include <string_view>
-#include <openvic-dataloader/ParseWarning.hpp>
+#include <openvic-dataloader/NodeLocation.hpp>
#include <openvic-dataloader/csv/LineObject.hpp>
#include <openvic-dataloader/csv/Parser.hpp>
#include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp>
@@ -41,9 +41,8 @@ bool insenitive_trim_eq(std::string_view lhs, std::string_view rhs) {
[](char a, char b) { return std::tolower(a) == std::tolower(b); });
}
-template<ovdl::csv::EncodingType Encoding>
int print_csv(const std::string_view path) {
- auto parser = ovdl::csv::Parser<Encoding>(std::cerr);
+ auto parser = ovdl::csv::Parser(std::cerr);
parser.load_from_file(path);
if (parser.has_error()) {
return 1;
@@ -73,12 +72,11 @@ int print_lua(const std::string_view path, VisualizationType visual_type) {
return 1;
}
- parser.lua_defines_parse();
- if (parser.has_error()) {
+ if (!parser.lua_defines_parse()) {
return 2;
}
- if (parser.has_warning()) {
+ if (parser.has_error() || parser.has_warning()) {
parser.print_errors_to(std::cerr);
}
@@ -97,12 +95,11 @@ int print_v2script_simple(const std::string_view path, VisualizationType visual_
return 1;
}
- parser.simple_parse();
- if (parser.has_error()) {
+ if (!parser.simple_parse()) {
return 2;
}
- if (parser.has_warning()) {
+ if (parser.has_error() || parser.has_warning()) {
parser.print_errors_to(std::cerr);
}
@@ -139,23 +136,18 @@ int main(int argc, char** argv) {
return print_lua(args[1], type);
}
return print_v2script_simple(args[1], type);
- case 4:
- if (insenitive_trim_eq(args[1], "csv") && insenitive_trim_eq(args[2], "utf"))
- return print_csv<ovdl::csv::EncodingType::Utf8>(args[3]);
- goto default_jump;
case 3:
if (insenitive_trim_eq(args[1], "csv"))
- return print_csv<ovdl::csv::EncodingType::Windows1252>(args[2]);
+ return print_csv(args[2]);
if (insenitive_trim_eq(args[1], "lua"))
return print_lua(args[2], type);
[[fallthrough]];
default:
- default_jump:
std::fprintf(stderr, "usage: %s <filename>\n", args[0].c_str());
std::fprintf(stderr, "usage: %s list <options> <filename>\n", args[0].c_str());
std::fprintf(stderr, "usage: %s native <options> <filename>\n", args[0].c_str());
std::fprintf(stderr, "usage: %s lua <filename>\n", args[0].c_str());
- std::fprintf(stderr, "usage: %s csv [utf] <filename>\n", args[0].c_str());
+ std::fprintf(stderr, "usage: %s csv <filename>\n", args[0].c_str());
return EXIT_FAILURE;
}
diff --git a/src/openvic-dataloader/AbstractSyntaxTree.cpp b/src/openvic-dataloader/AbstractSyntaxTree.cpp
index 11a90dc..d6f58f7 100644
--- a/src/openvic-dataloader/AbstractSyntaxTree.cpp
+++ b/src/openvic-dataloader/AbstractSyntaxTree.cpp
@@ -1,4 +1,4 @@
-#include <openvic-dataloader/AbstractSyntaxTree.hpp>
+#include "AbstractSyntaxTree.hpp"
using namespace ovdl;
diff --git a/src/openvic-dataloader/AbstractSyntaxTree.hpp b/src/openvic-dataloader/AbstractSyntaxTree.hpp
new file mode 100644
index 0000000..a5b8886
--- /dev/null
+++ b/src/openvic-dataloader/AbstractSyntaxTree.hpp
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <concepts>
+#include <cstdio>
+#include <string_view>
+#include <utility>
+
+#include <openvic-dataloader/NodeLocation.hpp>
+#include <openvic-dataloader/detail/SymbolIntern.hpp>
+#include <openvic-dataloader/detail/Utility.hpp>
+
+#include <dryad/node.hpp>
+#include <dryad/node_map.hpp>
+#include <dryad/symbol.hpp>
+#include <dryad/tree.hpp>
+
+#include <fmt/core.h>
+
+#include "detail/InternalConcepts.hpp"
+
+namespace ovdl {
+ struct AbstractSyntaxTree : SymbolIntern { // Non-template AST base: declares the symbol-interning interface and holds the interner; the member functions are defined outside this header.
+ symbol_type intern(const char* str, std::size_t length); // Presumably interns `length` characters starting at `str` and returns the symbol handle (definition not visible here).
+ symbol_type intern(std::string_view str); // string_view counterpart of the overload above.
+ const char* intern_cstr(const char* str, std::size_t length); // Like intern(), but returns a C-string pointer instead of a symbol handle.
+ const char* intern_cstr(std::string_view str); // string_view counterpart of the overload above.
+ symbol_interner_type& symbol_interner(); // Mutable access to the interner.
+ const symbol_interner_type& symbol_interner() const; // Read-only access to the interner.
+
+ protected:
+ symbol_interner_type _symbol_interner; // Interner storage shared by derived trees.
+ };
+
+ template<detail::IsFile FileT, std::derived_from<typename FileT::node_type> RootNodeT> // RootNodeT must derive from the file's node type.
+ struct BasicAbstractSyntaxTree : AbstractSyntaxTree { // AST that owns both a dryad tree of nodes and the file the nodes were parsed from.
+ using file_type = FileT;
+ using root_node_type = RootNodeT;
+ using node_type = typename file_type::node_type;
+
+ explicit BasicAbstractSyntaxTree(file_type&& file) : _file { std::move(file) } {} // Takes ownership of an already constructed file.
+
+ template<typename Encoding, typename MemoryResource = void>
+ explicit BasicAbstractSyntaxTree(lexy::buffer<Encoding, MemoryResource>&& buffer) : _file { std::move(buffer) } {} // Constructs the file directly from a lexy buffer.
+
+ void set_location(const node_type* n, NodeLocation loc) { // Records loc as the source location of node n (stored by the file).
+ _file.set_location(n, loc);
+ }
+
+ NodeLocation location_of(const node_type* n) const { // Looks up the location previously recorded for n (delegates to the file).
+ return _file.location_of(n);
+ }
+
+ root_node_type* root() { // Root node of the tree, as reported by dryad::tree::root().
+ return _tree.root();
+ }
+
+ const root_node_type* root() const { // Const overload of root().
+ return _tree.root();
+ }
+
+ file_type& file() { // Mutable access to the owned file.
+ return _file;
+ }
+
+ const file_type& file() const { // Read-only access to the owned file.
+ return _file;
+ }
+
+ template<typename T, typename... Args>
+ T* create(NodeLocation loc, Args&&... args) { // Creates a node of type T in the tree from args, then records loc as its location.
+ auto node = _tree.template create<T>(DRYAD_FWD(args)...);
+ set_location(node, loc);
+ return node;
+ }
+
+ template<typename T, typename... Args>
+ T* create(const char* begin, const char* end, Args&&... args) { // Convenience overload: the location is built with NodeLocation::make_from(begin, end).
+ return create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(args)...);
+ }
+
+ void set_root(root_node_type* node) { // Installs node as the tree's root.
+ _tree.set_root(node);
+ }
+
+ protected:
+ dryad::tree<root_node_type> _tree; // Node storage and root pointer.
+ file_type _file; // Source buffer plus the node-to-location map.
+ };
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/DiagnosticLogger.cpp b/src/openvic-dataloader/DiagnosticLogger.cpp
index aae3dcb..9fe5e93 100644
--- a/src/openvic-dataloader/DiagnosticLogger.cpp
+++ b/src/openvic-dataloader/DiagnosticLogger.cpp
@@ -1,4 +1,4 @@
-#include <openvic-dataloader/DiagnosticLogger.hpp>
+#include "DiagnosticLogger.hpp"
using namespace ovdl;
@@ -9,8 +9,7 @@ DiagnosticLogger::operator bool() const {
bool DiagnosticLogger::errored() const { return _errored; }
bool DiagnosticLogger::warned() const { return _warned; }
-
NodeLocation DiagnosticLogger::location_of(const error::Error* error) const {
auto result = _map.lookup(error);
- return result ? *result : NodeLocation{};
+ return result ? *result : NodeLocation {};
} \ No newline at end of file
diff --git a/src/openvic-dataloader/DiagnosticLogger.hpp b/src/openvic-dataloader/DiagnosticLogger.hpp
new file mode 100644
index 0000000..2a655a9
--- /dev/null
+++ b/src/openvic-dataloader/DiagnosticLogger.hpp
@@ -0,0 +1,492 @@
+#pragma once
+
+#include <concepts> // IWYU pragma: keep
+#include <cstdio>
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include <openvic-dataloader/Error.hpp>
+#include <openvic-dataloader/NodeLocation.hpp>
+#include <openvic-dataloader/detail/CallbackOStream.hpp>
+#include <openvic-dataloader/detail/ErrorRange.hpp>
+#include <openvic-dataloader/detail/OStreamOutputIterator.hpp>
+#include <openvic-dataloader/detail/SymbolIntern.hpp>
+#include <openvic-dataloader/detail/Utility.hpp>
+
+#include <lexy/error.hpp>
+#include <lexy/input/base.hpp>
+#include <lexy/input/buffer.hpp>
+#include <lexy/input_location.hpp>
+#include <lexy/visualize.hpp>
+
+#include <dryad/_detail/config.hpp>
+#include <dryad/abstract_node.hpp>
+#include <dryad/arena.hpp>
+#include <dryad/node.hpp>
+#include <dryad/node_map.hpp>
+#include <dryad/tree.hpp>
+
+#include <fmt/core.h>
+
+#include <lexy_ext/report_error.hpp>
+
+namespace ovdl {
+ template<typename ParseState>
+ struct BasicDiagnosticLogger;
+
+ struct DiagnosticLogger : SymbolIntern { // Logger base that does not depend on the file type: owns the error tree, the error-to-location map and a symbol interner.
+ using AnnotationKind = lexy_ext::annotation_kind;
+ using DiagnosticKind = lexy_ext::diagnostic_kind;
+
+ using error_range = detail::error_range<error::Root>;
+
+ explicit operator bool() const; // Defined in DiagnosticLogger.cpp.
+ bool errored() const; // Returns the _errored flag.
+ bool warned() const; // Returns the _warned flag.
+
+ NodeLocation location_of(const error::Error* error) const; // Location recorded for error, or a default NodeLocation when none is stored.
+
+ template<std::derived_from<DiagnosticLogger> Logger>
+ struct ErrorCallback { // lexy error callback: sink() returns a sink_t that converts each reported parse error into an error node on the logger.
+ ErrorCallback(Logger& logger) : _logger(&logger) {}
+
+ struct sink_t {
+ using return_type = std::size_t; // finish() returns how many errors this sink received.
+
+ template<typename Input, typename Tag>
+ void operator()(lexy::error_context<Input> const& context, lexy::error_for<Input, Tag> const& error) { // Called once per parse error; Tag selects the kind of error node created.
+ using Reader = lexy::input_reader<Input>;
+ using Encoding = typename Reader::encoding;
+ using char_type = typename Encoding::char_type;
+ error::Error* result;
+
+ std::string production_name = context.production();
+ auto left_strip = production_name.find_first_of('<'); // Remove the first "<...>" group from the production name, if there is one.
+ if (left_strip != std::string::npos) {
+ auto right_strip = production_name.find_first_of('>', left_strip);
+ if (right_strip != std::string::npos) {
+ production_name.erase(left_strip, right_strip - left_strip + 1);
+ }
+ }
+
+ auto context_location = lexy::get_input_location(context.input(), context.position()); // Where the failing production started.
+ auto location = lexy::get_input_location(context.input(), error.position(), context_location.anchor()); // Where the error itself is.
+
+ if constexpr (detail::is_instance_of_v<Logger, BasicDiagnosticLogger>) { // Rich path: render annotated diagnostics against the input.
+ lexy_ext::diagnostic_writer impl { context.input() };
+
+ BasicNodeLocation loc = [&] { // Span to annotate; its extent depends on the error tag.
+ if constexpr (std::is_same_v<Tag, lexy::expected_literal>) {
+ return BasicNodeLocation<char_type>::make_from(error.position(), error.position() + error.index() + 1);
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) {
+ return BasicNodeLocation<char_type>::make_from(error.position(), error.end());
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) {
+ return BasicNodeLocation<char_type>::make_from(error.position(), error.position() + 1);
+ } else {
+ return BasicNodeLocation<char_type>::make_from(error.position(), error.end());
+ }
+ }();
+
+ auto writer = _logger.template parse_error<Tag>(impl, loc, production_name.c_str());
+ if (location.line_nr() != context_location.line_nr()) // The error is on a different line than the production start, so mark the start as well.
+ writer.secondary(BasicNodeLocation { context.position(), lexy::_detail::next(context.position()) }, "beginning here").finish();
+
+ if constexpr (std::is_same_v<Tag, lexy::expected_literal>) {
+ auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length());
+ writer.primary(loc, "expected '{}'", string.data())
+ .finish();
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) {
+ auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length());
+ writer.primary(loc, "expected keyword '{}'", string.data())
+ .finish();
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) {
+ writer.primary(loc, "expected {}", error.name())
+ .finish();
+ } else {
+ writer.primary(loc, error.message())
+ .finish();
+ }
+ result = writer.error();
+ } else { // Plain path: create the error node directly, without rendered annotations.
+ auto production = _logger.intern_cstr(production_name);
+ if constexpr (std::is_same_v<Tag, lexy::expected_literal>) {
+ auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length());
+ NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1);
+ auto message = _logger.intern_cstr(fmt::format("expected '{}'", string.data()));
+ result = _logger.template create<error::ExpectedLiteral>(loc, message, production);
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) {
+ auto string = lexy::_detail::make_literal_lexeme<typename Reader::encoding>(error.string(), error.length());
+ NodeLocation loc = NodeLocation::make_from(context.position(), error.position() - 1);
+ auto message = _logger.intern_cstr(fmt::format("expected keyword '{}'", string.data()));
+ result = _logger.template create<error::ExpectedKeyword>(loc, message, production);
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) {
+ auto message = _logger.intern_cstr(fmt::format("expected {}", error.name()));
+ result = _logger.template create<error::ExpectedCharClass>(error.position(), message, production); // NOTE(review): passes a raw position where sibling branches pass a NodeLocation; create() deduces LocCharT from BasicNodeLocation<LocCharT> - confirm this branch compiles when instantiated.
+ } else {
+ NodeLocation loc = NodeLocation::make_from(error.begin(), error.end());
+ auto message = _logger.intern_cstr(error.message());
+ result = _logger.template create<error::GenericParseError>(loc, message, production);
+ }
+ }
+ _logger.insert(result); // Link the new error under the logger's root node.
+
+ _count++;
+ }
+
+ std::size_t finish() && { // Number of errors passed to operator().
+ return _count;
+ }
+
+ Logger& _logger;
+ std::size_t _count;
+ };
+
+ constexpr auto sink() const { // Fresh sink bound to the stored logger, starting at a count of zero.
+ return sink_t { *_logger, 0 };
+ }
+
+ mutable Logger* _logger;
+ };
+
+ template<typename T, typename LocCharT, typename... Args>
+ T* create(BasicNodeLocation<LocCharT> loc, Args&&... args) { // Creates a T in the error tree from args and records loc for it in the location map.
+ using node_creator = dryad::node_creator<decltype(DRYAD_DECLVAL(T).kind()), void>; // NOTE(review): this alias is not referenced in the function body.
+ T* result = _tree.create<T>(DRYAD_FWD(args)...);
+ _map.insert(result, loc);
+ return result;
+ }
+
+ template<typename T>
+ T* create() { // Creates a default-constructed T in the error tree; no location is recorded.
+ using node_creator = dryad::node_creator<decltype(DRYAD_DECLVAL(T).kind()), void>; // NOTE(review): this alias is not referenced in the function body.
+ T* result = _tree.create<T>();
+ return result;
+ }
+
+ error_range get_errors() const { // Errors held by the tree's root node; dereferences the root, which this base class does not set itself.
+ return _tree.root()->errors();
+ }
+
+ protected:
+ bool _errored = false;
+ bool _warned = false;
+ dryad::node_map<const error::Error, NodeLocation> _map; // Location recorded for each error node.
+ dryad::tree<error::Root> _tree; // Owns every error node.
+
+ symbol_interner_type _symbol_interner; // Backing store for interned message strings.
+
+ void insert(error::Error* root) { // Appends an error node to the end of the root's children.
+ _tree.root()->insert_back(root);
+ }
+
+ public:
+ symbol_type intern(const char* str, std::size_t length) { // Interns `length` characters starting at `str`.
+ return _symbol_interner.intern(str, length);
+ }
+ symbol_type intern(std::string_view str) { // string_view overload of intern().
+ return intern(str.data(), str.size());
+ }
+ const char* intern_cstr(const char* str, std::size_t length) { // Interns the text and returns the interner's C string for it.
+ return intern(str, length).c_str(_symbol_interner);
+ }
+ const char* intern_cstr(std::string_view str) { // string_view overload of intern_cstr().
+ return intern_cstr(str.data(), str.size());
+ }
+ symbol_interner_type& symbol_interner() { // Mutable access to the interner.
+ return _symbol_interner;
+ }
+ const symbol_interner_type& symbol_interner() const { // Read-only access to the interner.
+ return _symbol_interner;
+ }
+ };
+
+ template<typename ParseState>
+ struct BasicDiagnosticLogger : DiagnosticLogger { // Logger bound to a parse state's file type: renders messages with lexy_ext::diagnostic_writer and stores the text on error nodes.
+ using parse_state_type = ParseState;
+ using file_type = typename parse_state_type::file_type;
+
+ template<typename... Args>
+ using format_str = fmt::basic_format_string<char, fmt::type_identity_t<Args>...>;
+
+ explicit BasicDiagnosticLogger(const file_type& file) // Stores a pointer to file (no copy) and installs an empty error::Root as the tree root.
+ : _file(&file) {
+ _tree.set_root(_tree.create<error::Root>());
+ }
+
+ struct Writer;
+
+ template<typename... Args>
+ Writer error(format_str<Args...> fmt, Args&&... args) { // One formatted entry point per DiagnosticKind; each forwards to log().
+ return log(DiagnosticKind::error, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer warning(format_str<Args...> fmt, Args&&... args) {
+ return log(DiagnosticKind::warning, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer note(format_str<Args...> fmt, Args&&... args) { // NOTE(review): log() has no case for DiagnosticKind::note, so this call ends in its default branch - confirm intended.
+ return log(DiagnosticKind::note, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer info(format_str<Args...> fmt, Args&&... args) {
+ return log(DiagnosticKind::info, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer debug(format_str<Args...> fmt, Args&&... args) {
+ return log(DiagnosticKind::debug, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer fixit(format_str<Args...> fmt, Args&&... args) {
+ return log(DiagnosticKind::fixit, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer help(format_str<Args...> fmt, Args&&... args) {
+ return log(DiagnosticKind::help, fmt, std::forward<Args>(args)...);
+ }
+
+ Writer error(std::string_view sv) { // Argument-free variants: sv goes through fmt::runtime, so it is still treated as a format string.
+ return log(DiagnosticKind::error, fmt::runtime(sv));
+ }
+
+ Writer warning(std::string_view sv) {
+ return log(DiagnosticKind::warning, fmt::runtime(sv));
+ }
+
+ Writer note(std::string_view sv) {
+ return log(DiagnosticKind::note, fmt::runtime(sv));
+ }
+
+ Writer info(std::string_view sv) {
+ return log(DiagnosticKind::info, fmt::runtime(sv));
+ }
+
+ Writer debug(std::string_view sv) {
+ return log(DiagnosticKind::debug, fmt::runtime(sv));
+ }
+
+ Writer fixit(std::string_view sv) {
+ return log(DiagnosticKind::fixit, fmt::runtime(sv));
+ }
+
+ Writer help(std::string_view sv) {
+ return log(DiagnosticKind::help, fmt::runtime(sv));
+ }
+
+ auto error_callback() { // Callback object to hand to lexy so parse errors are reported to this logger.
+ return ErrorCallback(*this);
+ }
+
+ template<typename CharT>
+ static void _write_to_buffer(const CharT* s, std::streamsize n, void* output_str) { // Stream callback: appends n characters from s to the std::basic_string<CharT> that output_str points to.
+ auto* output = reinterpret_cast<std::basic_string<CharT>*>(output_str);
+ output->append(s, n);
+ }
+
+ template<typename CharT>
+ auto make_callback_stream(std::basic_string<CharT>& output) { // Stream whose writes are appended to output via _write_to_buffer.
+ return detail::make_callback_stream<CharT>(&_write_to_buffer<CharT>, reinterpret_cast<void*>(&output));
+ }
+
+ template<typename CharT>
+ detail::OStreamOutputIterator make_ostream_iterator(std::basic_ostream<CharT>& stream) { // Output iterator wrapping stream, for lexy's writer functions.
+ return detail::OStreamOutputIterator { stream };
+ }
+
+ struct Writer { // Handle returned by the logging calls; attaches primary/secondary annotations to one AnnotatedError.
+ template<typename LocCharT, typename... Args>
+ [[nodiscard]] Writer& primary(BasicNodeLocation<LocCharT> loc, format_str<Args...> fmt, Args&&... args) {
+ return annotation(AnnotationKind::primary, loc, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename LocCharT, typename... Args>
+ [[nodiscard]] Writer& secondary(BasicNodeLocation<LocCharT> loc, format_str<Args...> fmt, Args&&... args) {
+ return annotation(AnnotationKind::secondary, loc, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename LocCharT>
+ [[nodiscard]] Writer& primary(BasicNodeLocation<LocCharT> loc, const char* sv) {
+ return annotation(AnnotationKind::primary, loc, fmt::runtime(sv));
+ }
+
+ template<typename LocCharT>
+ [[nodiscard]] Writer& secondary(BasicNodeLocation<LocCharT> loc, const char* sv) {
+ return annotation(AnnotationKind::secondary, loc, fmt::runtime(sv));
+ }
+
+ void finish() {} // Does nothing; presumably exists to end a chain so the [[nodiscard]] result of primary()/secondary() is consumed.
+
+ template<typename LocCharT, typename... Args>
+ [[nodiscard]] Writer& annotation(AnnotationKind kind, BasicNodeLocation<LocCharT> loc, format_str<Args...> fmt, Args&&... args) { // Renders the annotation against the file's buffer, interns the text and appends an annotation node to the error.
+ std::basic_string<typename decltype(fmt.get())::value_type> output;
+
+ _file.visit_buffer([&](auto&& buffer) {
+ using char_type = typename std::decay_t<decltype(buffer)>::encoding::char_type;
+
+ BasicNodeLocation<char_type> converted_loc = loc; // Convert the location to the active buffer's character type.
+
+ auto begin_loc = lexy::get_input_location(buffer, converted_loc.begin());
+
+ auto stream = _logger.make_callback_stream(output);
+ auto iter = _logger.make_ostream_iterator(stream);
+
+ lexy_ext::diagnostic_writer _impl { buffer, { lexy::visualize_fancy } };
+ _impl.write_empty_annotation(iter);
+ _impl.write_annotation(iter, kind, begin_loc, converted_loc.end(),
+ [&](auto out, lexy::visualization_options) {
+ return lexy::_detail::write_str(out, fmt::format(fmt, std::forward<Args>(args)...).c_str());
+ });
+ });
+
+ error::Annotation* annotation;
+ auto message = _logger.intern_cstr(output);
+ switch (kind) {
+ case AnnotationKind::primary:
+ annotation = _logger.create<error::PrimaryAnnotation>(loc, message);
+ break;
+ case AnnotationKind::secondary:
+ annotation = _logger.create<error::SecondaryAnnotation>(loc, message);
+ break;
+ default: detail::unreachable();
+ }
+ _annotated->push_back(annotation);
+ return *this;
+ }
+
+ error::AnnotatedError* error() { // The error node this writer annotates.
+ return _annotated;
+ }
+
+ private:
+ Writer(BasicDiagnosticLogger& logger, const file_type& file, error::AnnotatedError* annotated) // Private: only the befriended BasicDiagnosticLogger constructs Writers.
+ : _file(file),
+ _logger(logger),
+ _annotated(annotated) {}
+
+ const file_type& _file;
+ BasicDiagnosticLogger& _logger;
+ error::AnnotatedError* _annotated;
+
+ friend BasicDiagnosticLogger;
+ };
+
+ template<std::derived_from<error::Error> T, typename Buffer, typename... Args>
+ void log_with_impl(lexy_ext::diagnostic_writer<Buffer>& impl, T* error, DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { // Renders the message and the file path through impl, stores the interned text on error, and links error into the tree if it is not linked yet.
+ std::basic_string<typename decltype(fmt.get())::value_type> output;
+ auto stream = make_callback_stream(output);
+ auto iter = make_ostream_iterator(stream);
+
+ impl.write_message(iter, kind,
+ [&](auto out, lexy::visualization_options) {
+ return lexy::_detail::write_str(out, fmt::format(fmt, std::forward<Args>(args)...).c_str());
+ });
+ impl.write_path(iter, file().path());
+
+ auto message = intern_cstr(output);
+ error->_set_message(message);
+ if (!error->is_linked_in_tree())
+ insert(error);
+ }
+
+ template<typename Tag, typename Buffer>
+ Writer parse_error(lexy_ext::diagnostic_writer<Buffer>& impl, NodeLocation loc, const char* production_name) { // Creates the error node for a parse failure with a "while parsing ..." header, sets _errored, and returns a Writer for its annotations.
+ std::basic_string<typename Buffer::encoding::char_type> output;
+ auto stream = make_callback_stream(output);
+ auto iter = make_ostream_iterator(stream);
+
+ impl.write_message(iter, DiagnosticKind::error,
+ [&](auto out, lexy::visualization_options) {
+ return lexy::_detail::write_str(out, fmt::format("while parsing {}", production_name).c_str());
+ });
+ impl.write_path(iter, file().path());
+
+ auto production = intern_cstr(production_name);
+ auto message = intern_cstr(output);
+ auto* error = [&] { // Node type is chosen from the lexy error tag.
+ if constexpr (std::is_same_v<Tag, lexy::expected_literal>) {
+ return create<error::ExpectedLiteral>(loc, message, production);
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_keyword>) {
+ return create<error::ExpectedKeyword>(loc, message, production);
+ } else if constexpr (std::is_same_v<Tag, lexy::expected_char_class>) {
+ return create<error::ExpectedCharClass>(loc, message, production);
+ } else {
+ return create<error::GenericParseError>(loc, message, production);
+ }
+ }();
+
+ Writer result(*this, file(), error); // The node is not linked into the tree here; sink_t::operator() calls insert() afterwards.
+ _errored = true;
+
+ return result;
+ }
+
+ template<std::derived_from<error::Error> T, typename... Args>
+ void log_with_error(T* error, DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { // Builds a diagnostic_writer for the file's active buffer and forwards to log_with_impl().
+ file().visit_buffer(
+ [&](auto&& buffer) {
+ lexy_ext::diagnostic_writer impl { buffer };
+ log_with_impl(impl, error, kind, fmt, std::forward<Args>(args)...);
+ });
+ }
+
+ template<std::derived_from<error::Error> T, typename... Args>
+ void create_log(DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { // Creates a new T and logs the formatted message into it.
+ log_with_error(create<T>(), kind, fmt, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Writer log(DiagnosticKind kind, format_str<Args...> fmt, Args&&... args) { // Creates the Semantic* node matching kind, renders the message into it, updates the errored/warned flags and returns a Writer.
+ error::Semantic* semantic;
+
+ switch (kind) {
+ case DiagnosticKind::error:
+ semantic = create<error::SemanticError>();
+ break;
+ case DiagnosticKind::warning:
+ semantic = create<error::SemanticWarning>();
+ break;
+ case DiagnosticKind::info:
+ semantic = create<error::SemanticInfo>();
+ break;
+ case DiagnosticKind::debug:
+ semantic = create<error::SemanticDebug>();
+ break;
+ case DiagnosticKind::fixit:
+ semantic = create<error::SemanticFixit>();
+ break;
+ case DiagnosticKind::help:
+ semantic = create<error::SemanticHelp>();
+ break;
+ default: detail::unreachable(); // NOTE(review): DiagnosticKind::note is not handled above although note() passes it - confirm.
+ }
+
+ Writer result(*this, file(), semantic);
+
+ file().visit_buffer([&](auto&& buffer) {
+ lexy_ext::diagnostic_writer impl { buffer };
+ log_with_impl(impl, semantic, kind, fmt, std::forward<Args>(args)...);
+ });
+
+ if (kind == DiagnosticKind::error)
+ _errored = true;
+ if (kind == DiagnosticKind::warning)
+ _warned = true;
+
+ return result;
+ }
+
+ const auto& file() const { // The file this logger renders diagnostics against.
+ return *_file;
+ }
+
+ private:
+ const file_type* _file; // Non-owning pointer set by the constructor; dereferenced by file().
+ };
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/File.cpp b/src/openvic-dataloader/File.cpp
index 9b27bf0..e4d3773 100644
--- a/src/openvic-dataloader/File.cpp
+++ b/src/openvic-dataloader/File.cpp
@@ -1,4 +1,10 @@
-#include <openvic-dataloader/File.hpp>
+#include "File.hpp"
+
+#include <cstring>
+
+#include <openvic-dataloader/detail/Utility.hpp>
+
+#include <lexy/encoding.hpp>
using namespace ovdl;
@@ -6,4 +12,8 @@ File::File(const char* path) : _path(path) {}
const char* File::path() const noexcept {
return _path;
+}
+
+bool File::is_valid() const noexcept { // True only when the variant holds a buffer (index 0 is presumably the prepended std::monostate), is not valueless, and the buffer's data() is non-null.
+ return _buffer.index() != 0 && !_buffer.valueless_by_exception() && visit_buffer([](auto&& buffer) { return buffer.data() != nullptr; });
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/File.hpp b/src/openvic-dataloader/File.hpp
new file mode 100644
index 0000000..90fcb11
--- /dev/null
+++ b/src/openvic-dataloader/File.hpp
@@ -0,0 +1,139 @@
+#pragma once
+
+#include <cassert>
+#include <concepts> // IWYU pragma: keep
+#include <type_traits>
+#include <variant>
+
+#include <openvic-dataloader/NodeLocation.hpp>
+#include <openvic-dataloader/detail/Utility.hpp>
+
+#include <lexy/encoding.hpp>
+#include <lexy/input/buffer.hpp>
+
+#include <dryad/node_map.hpp>
+
+namespace ovdl {
+ struct File { // Holds a file path and one lexy buffer, stored in a variant over the supported encodings.
+ using buffer_ids = detail::TypeRegister< // Buffer types the variant may hold, in registration order.
+ lexy::buffer<lexy::default_encoding, void>,
+ lexy::buffer<lexy::utf8_char_encoding, void>,
+ lexy::buffer<lexy::utf8_encoding, void>,
+ lexy::buffer<lexy::utf16_encoding, void>,
+ lexy::buffer<lexy::utf32_encoding, void>,
+ lexy::buffer<lexy::byte_encoding, void>>;
+
+ explicit File(const char* path); // Stores the path pointer; no buffer is set here.
+
+ const char* path() const noexcept; // The pointer given at construction.
+
+ bool is_valid() const noexcept; // Defined in File.cpp.
+
+ template<typename Encoding, typename MemoryResource = void>
+ constexpr bool is_buffer() const { // True when the variant currently holds lexy::buffer<Encoding, MemoryResource>; the + 1 accounts for the std::monostate prepended to _buffer.
+ return buffer_ids::type_id<lexy::buffer<Encoding, MemoryResource>>() + 1 == _buffer.index();
+ }
+
+ template<typename Encoding, typename MemoryResource = void>
+ lexy::buffer<Encoding, MemoryResource>* try_get_buffer_as() { // Pointer to the buffer if it has the requested type, otherwise nullptr (std::get_if).
+ return std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer);
+ }
+
+ template<typename Encoding, typename MemoryResource = void>
+ const lexy::buffer<Encoding, MemoryResource>* try_get_buffer_as() const { // Const overload of try_get_buffer_as().
+ return std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer);
+ }
+
+ template<typename Encoding, typename MemoryResource = void>
+ lexy::buffer<Encoding, MemoryResource>& get_buffer_as() { // Asserts the type matches, then dereferences; with asserts disabled a mismatch dereferences nullptr.
+ assert((is_buffer<Encoding, MemoryResource>()));
+ return *std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer);
+ }
+
+ template<typename Encoding, typename MemoryResource = void>
+ const lexy::buffer<Encoding, MemoryResource>& get_buffer_as() const { // Const overload of get_buffer_as().
+ assert((is_buffer<Encoding, MemoryResource>()));
+ return *std::get_if<lexy::buffer<Encoding, MemoryResource>>(&_buffer);
+ }
+
+#define SWITCH_LIST \
+ X(1) \
+ X(2) \
+ X(3) \
+ X(4) \
+ X(5) \
+ X(6)
+
+#define X(NUM) \
+ case NUM: \
+ return visitor(std::get<NUM>(_buffer));
+
+ template<typename Visitor>
+ decltype(auto) visit_buffer(Visitor&& visitor) { // Calls visitor with the active buffer; only indices 1 to 6 are handled, any other index reaches unreachable().
+ switch (_buffer.index()) {
+ SWITCH_LIST
+ default: ovdl::detail::unreachable();
+ }
+ }
+
+ template<typename Return, typename Visitor>
+ Return visit_buffer(Visitor&& visitor) { // Same dispatch with the return type fixed to Return.
+ switch (_buffer.index()) {
+ SWITCH_LIST
+ default: ovdl::detail::unreachable();
+ }
+ }
+
+ template<typename Visitor>
+ decltype(auto) visit_buffer(Visitor&& visitor) const { // Const overload: visitor receives a const buffer.
+ switch (_buffer.index()) {
+ SWITCH_LIST
+ default: ovdl::detail::unreachable();
+ }
+ }
+
+ template<typename Return, typename Visitor>
+ Return visit_buffer(Visitor&& visitor) const { // Const overload with the return type fixed to Return.
+ switch (_buffer.index()) {
+ SWITCH_LIST
+ default: ovdl::detail::unreachable();
+ }
+ }
+#undef X
+#undef SWITCH_LIST
+
+ protected:
+ const char* _path; // Raw pointer as passed to the constructor; the string is not copied.
+ detail::type_prepend_t<buffer_ids::variant_type, std::monostate> _buffer; // Variant of the registered buffer types with std::monostate prepended.
+ };
+
+ template<typename NodeT>
+ struct BasicFile : File { // File that also records a source location for each node of type NodeT.
+ using node_type = NodeT;
+
+ template<typename Encoding, typename MemoryResource = void>
+ explicit BasicFile(const char* path, lexy::buffer<Encoding, MemoryResource>&& buffer) // Stores path and takes ownership of buffer.
+ : File(path) {
+ _buffer = static_cast<std::remove_reference_t<decltype(buffer)>&&>(buffer); // Cast to rvalue so the buffer is moved into the variant (same effect as std::move).
+ }
+
+ template<typename Encoding, typename MemoryResource = void>
+ explicit BasicFile(lexy::buffer<Encoding, MemoryResource>&& buffer) // No path supplied: the path is the empty string.
+ : File("") {
+ _buffer = static_cast<std::remove_reference_t<decltype(buffer)>&&>(buffer);
+ }
+
+ void set_location(const node_type* n, NodeLocation loc) { // Records loc for node n in the location map.
+ _map.insert(n, loc);
+ }
+
+ NodeLocation location_of(const node_type* n) const { // Location recorded for n; DRYAD_ASSERT checks that an entry exists before it is dereferenced.
+ auto result = _map.lookup(n);
+ DRYAD_ASSERT(result != nullptr, "every Node should have a NodeLocation");
+ return *result;
+ }
+
+ protected:
+ dryad::node_map<const node_type, NodeLocation> _map; // Node-to-location map.
+ };
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/NodeLocation.cpp b/src/openvic-dataloader/NodeLocation.cpp
deleted file mode 100644
index 9e4f669..0000000
--- a/src/openvic-dataloader/NodeLocation.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <openvic-dataloader/NodeLocation.hpp>
-
-using namespace ovdl;
-
-NodeLocation::NodeLocation() = default;
-NodeLocation::NodeLocation(const char* pos) : _begin(pos),
- _end(pos) {}
-NodeLocation::NodeLocation(const char* begin, const char* end) : _begin(begin),
- _end(end) {}
-
-NodeLocation::NodeLocation(const NodeLocation&) noexcept = default;
-NodeLocation& NodeLocation::operator=(const NodeLocation&) = default;
-
-NodeLocation::NodeLocation(NodeLocation&&) = default;
-NodeLocation& NodeLocation::operator=(NodeLocation&&) = default;
-
-const char* NodeLocation::begin() const { return _begin; }
-const char* NodeLocation::end() const { return _end; }
-
-bool NodeLocation::is_synthesized() const { return _begin == nullptr && _end == nullptr; }
-
-NodeLocation NodeLocation::make_from(const char* begin, const char* end) {
- end++;
- if (begin >= end) return NodeLocation(begin);
- return NodeLocation(begin, end);
-}
diff --git a/src/openvic-dataloader/ParseState.hpp b/src/openvic-dataloader/ParseState.hpp
new file mode 100644
index 0000000..806829c
--- /dev/null
+++ b/src/openvic-dataloader/ParseState.hpp
@@ -0,0 +1,105 @@
+#pragma once
+
+#include <utility>
+
+#include <openvic-dataloader/detail/Encoding.hpp>
+
+#include <lexy/encoding.hpp>
+#include <lexy/input/buffer.hpp>
+
+#include <dryad/tree.hpp>
+
+#include "DiagnosticLogger.hpp"
+#include "detail/InternalConcepts.hpp"
+
+namespace ovdl {
+ /// @brief Non-template base of the parse states; stores the encoding given at construction.
+ struct BasicParseState {
+ /// @param encoding Encoding to store; defaults to detail::Encoding::Unknown.
+ explicit BasicParseState(detail::Encoding encoding = detail::Encoding::Unknown) : _encoding(encoding) {}
+
+ /// @brief Returns the encoding passed to the constructor.
+ detail::Encoding encoding() const {
+ return _encoding;
+ }
+
+ protected:
+ detail::Encoding _encoding;
+ };
+
+ /// @brief Parse state that owns an AST and the diagnostic logger attached to the AST's file.
+ /// @tparam AstT AST type; must satisfy detail::IsAst and provide `file_type` and `file()`.
+ template<detail::IsAst AstT>
+ struct ParseState : BasicParseState {
+ using ast_type = AstT;
+ using file_type = typename ast_type::file_type;
+ using diagnostic_logger_type = BasicDiagnosticLogger<ParseState>;
+
+ /// @brief Moves @p file into the AST and records @p encoding.
+ /// @note Initialisers are listed in the order they actually run: base class first, then
+ /// members in declaration order. `_logger` is built from the file held by `_ast`, so
+ /// `_ast` must stay declared before `_logger`.
+ ParseState(typename ast_type::file_type&& file, detail::Encoding encoding)
+ : BasicParseState(encoding),
+ _ast { std::move(file) },
+ _logger { this->ast().file() } {}
+
+ /// @brief Builds the file from @p buffer alone and delegates to the file constructor.
+ template<typename Encoding, typename MemoryResource = void>
+ ParseState(lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding)
+ : ParseState(typename ast_type::file_type { std::move(buffer) }, encoding) {}
+
+ /// @brief Builds the file from @p path and @p buffer and delegates to the file constructor.
+ template<typename Encoding, typename MemoryResource = void>
+ ParseState(const char* path, lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding)
+ : ParseState(typename ast_type::file_type { path, std::move(buffer) }, encoding) {}
+
+ /// @brief Returns the owned AST.
+ ast_type& ast() {
+ return _ast;
+ }
+
+ /// @brief Returns the owned AST (const overload).
+ const ast_type& ast() const {
+ return _ast;
+ }
+
+ /// @brief Returns the diagnostic logger.
+ diagnostic_logger_type& logger() {
+ return _logger;
+ }
+
+ /// @brief Returns the diagnostic logger (const overload).
+ const diagnostic_logger_type& logger() const {
+ return _logger;
+ }
+
+ private:
+ // Declaration order matters: _logger is constructed from _ast's file.
+ ast_type _ast;
+ diagnostic_logger_type _logger;
+ };
+
+ /// @brief Parse state that owns a file directly (no AST) and the diagnostic logger attached to it.
+ /// @tparam FileT File type; must satisfy detail::IsFile.
+ template<detail::IsFile FileT>
+ struct FileParseState : BasicParseState {
+ using file_type = FileT;
+ using diagnostic_logger_type = BasicDiagnosticLogger<FileParseState>;
+
+ /// @brief Moves @p file into the state and records @p encoding.
+ /// @note Initialisers are listed in the order they actually run: base class first, then
+ /// members in declaration order. `_logger` is built from `_file`, so `_file` must stay
+ /// declared before `_logger`.
+ FileParseState(file_type&& file, detail::Encoding encoding)
+ : BasicParseState(encoding),
+ _file { std::move(file) },
+ _logger { this->file() } {}
+
+ /// @brief Builds the file from @p buffer alone and delegates to the file constructor.
+ template<typename Encoding, typename MemoryResource = void>
+ FileParseState(lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding)
+ : FileParseState(file_type { std::move(buffer) }, encoding) {}
+
+ /// @brief Builds the file from @p path and @p buffer and delegates to the file constructor.
+ template<typename Encoding, typename MemoryResource = void>
+ FileParseState(const char* path, lexy::buffer<Encoding, MemoryResource>&& buffer, detail::Encoding encoding)
+ : FileParseState(file_type { path, std::move(buffer) }, encoding) {}
+
+ /// @brief Returns the owned file.
+ file_type& file() {
+ return _file;
+ }
+
+ /// @brief Returns the owned file (const overload).
+ const file_type& file() const {
+ return _file;
+ }
+
+ /// @brief Returns the diagnostic logger.
+ diagnostic_logger_type& logger() {
+ return _logger;
+ }
+
+ /// @brief Returns the diagnostic logger (const overload).
+ const diagnostic_logger_type& logger() const {
+ return _logger;
+ }
+
+ private:
+ // Declaration order matters: _logger is constructed from _file.
+ file_type _file;
+ diagnostic_logger_type _logger;
+ };
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp
index 5451f26..19aee54 100644
--- a/src/openvic-dataloader/csv/CsvGrammar.hpp
+++ b/src/openvic-dataloader/csv/CsvGrammar.hpp
@@ -9,22 +9,20 @@
#include <openvic-dataloader/csv/LineObject.hpp>
#include <openvic-dataloader/csv/Parser.hpp>
+#include <lexy/_detail/config.hpp>
#include <lexy/callback.hpp>
+#include <lexy/callback/string.hpp>
#include <lexy/dsl.hpp>
+#include <lexy/dsl/ascii.hpp>
+#include <lexy/dsl/option.hpp>
+#include <lexy/encoding.hpp>
+#include "detail/Convert.hpp"
+#include "detail/InternalConcepts.hpp"
#include "detail/dsl.hpp"
// Grammar Definitions //
namespace ovdl::csv::grammar {
- using EncodingType = ovdl::csv::EncodingType;
-
- template<typename T>
- concept ParseChars = requires() {
- { T::character };
- { T::control };
- };
-
- template<ParseChars T>
struct ParseOptions {
/// @brief Seperator character
char SepChar;
@@ -33,12 +31,34 @@ namespace ovdl::csv::grammar {
/// @brief Paradox-style localization escape characters
/// @note Is ignored if SupportStrings is true
char EscapeChar;
+ };
- static constexpr auto parse_chars = T {};
- static constexpr auto character = parse_chars.character;
- static constexpr auto control = parse_chars.control;
+ /// @brief Error handler passed to convert::convert_as_string by this grammar.
+ struct ConvertErrorHandler {
+ /// @brief Logs a warning with the integer value of the character at @p reader and
+ /// attaches a primary annotation ("here") at the reader's position.
+ static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) {
+ state.logger().warning("invalid character value '{}' found", static_cast<int>(reader.peek())) //
+ .primary(BasicNodeLocation { reader.position() }, "here")
+ .finish();
+ }
};
+ /// @brief Returns true when the (decayed) type of @p encoding is lexy::utf8_char_encoding.
+ /// Only the type of the argument is inspected, never its value.
+ constexpr bool IsUtf8(auto encoding) {
+ return std::same_as<std::decay_t<decltype(encoding)>, lexy::utf8_char_encoding>;
+ }
+
+ /// @brief Shorthand for convert::convert_as_string<String, ConvertErrorHandler>.
+ /// @note `Options` is accepted as a template argument but not used in the initialiser.
+ template<ParseOptions Options, typename String>
+ constexpr auto convert_as_string = convert::convert_as_string<
+ String,
+ ConvertErrorHandler>;
+
+ // Character classes for single-byte input: any ASCII character or any byte in 0x80-0xFF.
+ constexpr auto ansi_character = lexy::dsl::ascii::character / dsl::lit_b_range<0x80, 0xFF>;
+ // ASCII control characters plus bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D
+ // (presumably the code points Windows-1252 leaves unassigned - TODO confirm).
+ constexpr auto ansi_control =
+ lexy::dsl::ascii::control /
+ lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
+ lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>;
+
+ // Character classes for Unicode input, taken directly from lexy.
+ constexpr auto utf_character = lexy::dsl::unicode::character;
+ constexpr auto utf_control = lexy::dsl::unicode::control;
+
constexpr auto escaped_symbols = lexy::symbol_table<char> //
.map<'"'>('"')
.map<'\''>('\'')
@@ -55,38 +75,95 @@ namespace ovdl::csv::grammar {
template<ParseOptions Options>
struct CsvGrammar {
- struct StringValue {
- static constexpr auto rule = [] {
- // Arbitrary code points
- auto c = Options.character - Options.control;
+ /// @brief Double-quoted CSV value, parsed by a manual scan so that the character
+ /// class can be chosen from the reader's encoding.
+ struct StringValue : lexy::scan_production<std::string>,
+ lexy::token_production {
+
+ /// @brief Parses a `"`-delimited string; returns lexy::scan_failed if the scanner
+ /// or the parse result is in a failed state.
+ /// @note `state` is not used in the body.
+ template<typename Context, typename Reader>
+ static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsFileParseState auto& state) {
+ using encoding = typename Reader::encoding;
+
+ constexpr auto rule = [] {
+ // Arbitrary code points
+ // default_encoding / byte_encoding use the single-byte classes, everything else the Unicode ones.
+ auto c = [] {
+ if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+ return ansi_character - ansi_control;
+ } else {
+ return utf_character - utf_control;
+ }
+ }();
- auto back_escape = lexy::dsl::backslash_escape //
- .symbol<escaped_symbols>();
+ auto back_escape = lexy::dsl::backslash_escape //
+ .symbol<escaped_symbols>();
- auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) //
- .template symbol<escaped_quote>();
+ auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) //
+ .template symbol<escaped_quote>();
- return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape);
- }();
+ return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape);
+ }();
+
+ lexy::scan_result<std::string> str_result = scanner.template parse<std::string>(rule);
+ if (!scanner || !str_result)
+ return lexy::scan_failed;
+ return str_result.value();
+ }
- static constexpr auto value = lexy::as_string<std::string>;
+ // Only enter the scan when the next character is a double quote.
+ static constexpr auto rule = lexy::dsl::peek(lexy::dsl::lit_c<'"'>) >> lexy::dsl::scan;
+
+ static constexpr auto value = convert_as_string<Options, std::string> >> lexy::forward<std::string>;
};
- struct PlainValue {
- static constexpr auto rule = [] {
+ /// @brief Unquoted CSV value, parsed by a manual scan so that the character class
+ /// can be chosen from the reader's encoding.
+ struct PlainValue : lexy::scan_production<std::string>,
+ lexy::token_production {
+
+ /// @brief @p character minus the separator byte and ASCII newlines.
+ template<auto character>
+ static constexpr auto _escape_check = character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline);
+
+ /// @brief Parses a plain value; returns lexy::scan_failed if the scanner or the
+ /// parse result is in a failed state.
+ /// @note `state` is not used in the body.
+ template<typename Context, typename Reader>
+ static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsFileParseState auto& state) {
+ using encoding = typename Reader::encoding;
+
+ constexpr auto rule = [] {
+ // default_encoding / byte_encoding use the single-byte class, everything else the Unicode one.
+ constexpr auto character = [] {
+ if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+ return ansi_character;
+ } else {
+ return utf_character;
+ }
+ }();
+
+ if constexpr (Options.SupportStrings) {
+ return lexy::dsl::identifier(character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline));
+ } else {
+ // Without string support: a list of plain segments and backslash escapes
+ // looked up in escaped_symbols.
+ auto escape_check_char = _escape_check<character>;
+ auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>;
+ auto id_segment = lexy::dsl::identifier(id_check_char);
+ auto escape_segement = lexy::dsl::token(escape_check_char);
+ auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement);
+ auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym;
+ return lexy::dsl::list(id_segment | escape_rule);
+ }
+ }();
+
if constexpr (Options.SupportStrings) {
- return lexy::dsl::identifier(Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline));
+ // The identifier rule yields a lexeme; copy its range into a std::string.
+ auto lexeme_result = scanner.template parse<lexy::lexeme<Reader>>(rule);
+ if (!scanner || !lexeme_result)
+ return lexy::scan_failed;
+ return std::string { lexeme_result.value().begin(), lexeme_result.value().end() };
} else {
- auto escape_check_char = Options.character - (lexy::dsl::lit_b<Options.SepChar> / lexy::dsl::ascii::newline);
- auto id_check_char = escape_check_char - lexy::dsl::lit_b<'\\'>;
- auto id_segment = lexy::dsl::identifier(id_check_char);
- auto escape_segement = lexy::dsl::token(escape_check_char);
- auto escape_sym = lexy::dsl::symbol<escaped_symbols>(escape_segement);
- auto escape_rule = lexy::dsl::lit_b<'\\'> >> escape_sym;
- return lexy::dsl::list(id_segment | escape_rule);
+ lexy::scan_result<std::string> str_result = scanner.template parse<std::string>(rule);
+ if (!scanner || !str_result)
+ return lexy::scan_failed;
+ return str_result.value();
}
- }();
- static constexpr auto value = lexy::as_string<std::string>;
+ }
+
+ // Branch condition built from both the single-byte and the Unicode character class.
+ static constexpr auto rule =
+ dsl::peek(
+ _escape_check<ansi_character>,
+ _escape_check<utf_character>) >>
+ lexy::dsl::scan;
+
+ static constexpr auto value = convert_as_string<Options, std::string> >> lexy::forward<std::string>;
};
struct Value {
@@ -114,17 +191,17 @@ namespace ovdl::csv::grammar {
static constexpr auto rule = lexy::dsl::list(lexy::dsl::p<Value>, lexy::dsl::trailing_sep(lexy::dsl::p<Seperator>));
+ // Folds the parsed items into a LineObject: a std::size_t item (separator count)
+ // appends a new empty entry, a std::string item fills in the current entry.
static constexpr auto value = lexy::fold_inplace<ovdl::csv::LineObject>(
std::initializer_list<ovdl::csv::LineObject::value_type> {},
- [](ovdl::csv::LineObject& result, auto&& arg) {
- if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, std::size_t>) {
- // Count seperators, adds to previous value, making it a position
- using position_type = ovdl::csv::LineObject::position_type;
- result.emplace_back(static_cast<position_type>(arg + result.back().first), "");
+ [](ovdl::csv::LineObject& result, std::size_t&& arg) {
+ // Count separators, adds to previous value, making it a position
+ // NOTE(review): result.back() assumes result is not empty here - confirm the
+ // grammar guarantees a value is folded before the first separator.
+ using position_type = ovdl::csv::LineObject::position_type;
+ result.emplace_back(static_cast<position_type>(arg + result.back().first), "");
+ },
+ [](ovdl::csv::LineObject& result, std::string&& arg) {
+ if (result.empty()) {
+ result.emplace_back(0u, LEXY_MOV(arg));
} else {
- if (result.empty()) result.emplace_back(0u, LEXY_MOV(arg));
- else {
- auto& [pos, value] = result.back();
- value = arg;
- }
+ auto& [pos, value] = result.back();
+ value = LEXY_MOV(arg);
}
});
};
@@ -169,74 +246,17 @@ namespace ovdl::csv::grammar {
static constexpr auto value = lexy::as_list<std::vector<ovdl::csv::LineObject>>;
};
- template<ParseChars T>
- using CommaFile = File<ParseOptions<T> { ',', false, '$' }>;
- template<ParseChars T>
- using ColonFile = File<ParseOptions<T> { ':', false, '$' }>;
- template<ParseChars T>
- using SemiColonFile = File<ParseOptions<T> { ';', false, '$' }>;
- template<ParseChars T>
- using TabFile = File<ParseOptions<T> { '\t', false, '$' }>;
- template<ParseChars T>
- using BarFile = File<ParseOptions<T> { '|', false, '$' }>;
-
- namespace strings {
- template<ParseChars T>
- using CommaFile = File<ParseOptions<T> { ',', true, '$' }>;
- template<ParseChars T>
- using ColonFile = File<ParseOptions<T> { ':', true, '$' }>;
- template<ParseChars T>
- using SemiColonFile = File<ParseOptions<T> { ';', true, '$' }>;
- template<ParseChars T>
- using TabFile = File<ParseOptions<T> { '\t', true, '$' }>;
- template<ParseChars T>
- using BarFile = File<ParseOptions<T> { '|', true, '$' }>;
- }
-}
-
-namespace ovdl::csv::grammar::windows1252 {
- struct windows1252_t {
- static constexpr auto character = dsl::make_range<0x01, 0xFF>();
- static constexpr auto control =
- lexy::dsl::ascii::control /
- lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
- lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>;
- };
-
- using CommaFile = CommaFile<windows1252_t>;
- using ColonFile = ColonFile<windows1252_t>;
- using SemiColonFile = SemiColonFile<windows1252_t>;
- using TabFile = TabFile<windows1252_t>;
- using BarFile = BarFile<windows1252_t>;
-
- namespace strings {
- using CommaFile = grammar::strings::CommaFile<windows1252_t>;
- using ColonFile = grammar::strings::ColonFile<windows1252_t>;
- using SemiColonFile = grammar::strings::SemiColonFile<windows1252_t>;
- using TabFile = grammar::strings::TabFile<windows1252_t>;
- using BarFile = grammar::strings::BarFile<windows1252_t>;
-
- }
-}
-
-namespace ovdl::csv::grammar::utf8 {
- struct unicode_t {
- static constexpr auto character = lexy::dsl::unicode::character;
- static constexpr auto control = lexy::dsl::unicode::control;
- };
-
- using CommaFile = CommaFile<unicode_t>;
- using ColonFile = ColonFile<unicode_t>;
- using SemiColonFile = SemiColonFile<unicode_t>;
- using TabFile = TabFile<unicode_t>;
- using BarFile = BarFile<unicode_t>;
+ using CommaFile = File<ParseOptions { ',', false, '$' }>;
+ using ColonFile = File<ParseOptions { ':', false, '$' }>;
+ using SemiColonFile = File<ParseOptions { ';', false, '$' }>;
+ using TabFile = File<ParseOptions { '\t', false, '$' }>;
+ using BarFile = File<ParseOptions { '|', false, '$' }>;
namespace strings {
- using CommaFile = grammar::strings::CommaFile<unicode_t>;
- using ColonFile = grammar::strings::ColonFile<unicode_t>;
- using SemiColonFile = grammar::strings::SemiColonFile<unicode_t>;
- using TabFile = grammar::strings::TabFile<unicode_t>;
- using BarFile = grammar::strings::BarFile<unicode_t>;
-
+ using CommaFile = File<ParseOptions { ',', true, '$' }>;
+ using ColonFile = File<ParseOptions { ':', true, '$' }>;
+ using SemiColonFile = File<ParseOptions { ';', true, '$' }>;
+ using TabFile = File<ParseOptions { '\t', true, '$' }>;
+ using BarFile = File<ParseOptions { '|', true, '$' }>;
}
} \ No newline at end of file
diff --git a/src/openvic-dataloader/csv/CsvParseState.hpp b/src/openvic-dataloader/csv/CsvParseState.hpp
index 2390453..ee60c34 100644
--- a/src/openvic-dataloader/csv/CsvParseState.hpp
+++ b/src/openvic-dataloader/csv/CsvParseState.hpp
@@ -1,28 +1,16 @@
#pragma once
-#include <openvic-dataloader/File.hpp>
-#include <openvic-dataloader/ParseState.hpp>
#include <openvic-dataloader/csv/LineObject.hpp>
#include <openvic-dataloader/csv/Parser.hpp>
#include <lexy/encoding.hpp>
-template<ovdl::csv::EncodingType>
-struct LexyEncodingFrom {
-};
+#include "File.hpp"
+#include "ParseState.hpp"
+#include "detail/InternalConcepts.hpp"
-template<>
-struct LexyEncodingFrom<ovdl::csv::EncodingType::Windows1252> {
- using encoding = lexy::default_encoding;
-};
+namespace ovdl::csv {
+ /// @brief Parse state used by the CSV parser: a FileParseState over a BasicFile whose
+ /// template argument is the vector of parsed lines.
+ using CsvParseState = ovdl::FileParseState<ovdl::BasicFile<std::vector<ovdl::csv::LineObject>>>;
-template<>
-struct LexyEncodingFrom<ovdl::csv::EncodingType::Utf8> {
- using encoding = lexy::utf8_char_encoding;
-};
-
-template<ovdl::csv::EncodingType Encoding>
-using CsvFile = ovdl::BasicFile<typename LexyEncodingFrom<Encoding>::encoding, std::vector<ovdl::csv::LineObject>>;
-
-template<ovdl::csv::EncodingType Encoding>
-using CsvParseState = ovdl::FileParseState<CsvFile<Encoding>>; \ No newline at end of file
+ // Compile-time check that the alias satisfies the concept the grammar's scan functions require.
+ static_assert(detail::IsFileParseState<CsvParseState>, "CsvParseState failed IsFileParseState concept");
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp
index 361f6ad..5dbee32 100644
--- a/src/openvic-dataloader/csv/Parser.cpp
+++ b/src/openvic-dataloader/csv/Parser.cpp
@@ -1,11 +1,14 @@
+#include <iostream>
+#include <optional>
+#include <type_traits>
#include <vector>
-#include <openvic-dataloader/File.hpp>
+#include <openvic-dataloader/NodeLocation.hpp>
#include <openvic-dataloader/csv/LineObject.hpp>
#include <openvic-dataloader/csv/Parser.hpp>
-#include <openvic-dataloader/detail/LexyReportError.hpp>
+#include <openvic-dataloader/detail/Encoding.hpp>
#include <openvic-dataloader/detail/OStreamOutputIterator.hpp>
-#include <openvic-dataloader/detail/utility/Utility.hpp>
+#include <openvic-dataloader/detail/Utility.hpp>
#include <lexy/action/parse.hpp>
#include <lexy/encoding.hpp>
@@ -22,15 +25,27 @@ using namespace ovdl::csv;
/// ParseHandler ///
-template<EncodingType Encoding>
-struct Parser<Encoding>::ParseHandler final : detail::BasicFileParseHandler<CsvParseState<Encoding>> {
+struct Parser::ParseHandler final : detail::BasicFileParseHandler<CsvParseState> {
+ /// @brief Parses the loaded buffer with grammar @p Node, choosing the lexy encoding
+ /// from the encoding stored in the parse state.
+ /// @return The logger's errors if parsing failed, std::nullopt on success.
template<typename Node>
std::optional<DiagnosticLogger::error_range> parse() {
- auto result = lexy::parse<Node>(this->buffer(), *this->_parse_state, this->_parse_state->logger().error_callback());
+ auto result = [&] {
+ switch (parse_state().encoding()) {
+ using enum detail::Encoding;
+ // Ascii and Utf8 are read through the UTF-8 buffer view.
+ case Ascii:
+ case Utf8:
+ return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback());
+ // Unknown and the Windows code pages are read through the default-encoding view.
+ case Unknown:
+ case Windows1251:
+ case Windows1252:
+ return lexy::parse<Node>(buffer<lexy::default_encoding>(), parse_state(), parse_state().logger().error_callback());
+ default:
+ ovdl::detail::unreachable();
+ }
+ }();
if (!result) {
- return this->_parse_state->logger().get_errors();
+ return this->parse_state().logger().get_errors();
}
- _lines = std::move(result.value());
+ _lines = LEXY_MOV(result).value();
return std::nullopt;
}
@@ -42,55 +57,45 @@ private:
std::vector<csv::LineObject> _lines;
};
-/// BufferHandler ///
+/// ParserHandler ///
-template<EncodingType Encoding>
-Parser<Encoding>::Parser()
+/// @brief Creates a parser with a fresh ParseHandler and the error log set to null.
+Parser::Parser()
: _parse_handler(std::make_unique<ParseHandler>()) {
set_error_log_to_null();
}
-template<EncodingType Encoding>
-Parser<Encoding>::Parser(std::basic_ostream<char>& error_stream)
+/// @brief Creates a parser with a fresh ParseHandler that logs errors to @p error_stream.
+Parser::Parser(std::basic_ostream<char>& error_stream)
: _parse_handler(std::make_unique<ParseHandler>()) {
set_error_log_to(error_stream);
}
-template<EncodingType Encoding>
-Parser<Encoding>::Parser(Parser&&) = default;
-template<EncodingType Encoding>
-Parser<Encoding>& Parser<Encoding>::operator=(Parser&&) = default;
-template<EncodingType Encoding>
-Parser<Encoding>::~Parser() = default;
+// Move operations and the destructor are defaulted out of line, presumably because
+// ParseHandler is only a complete type in this file - TODO confirm against Parser.hpp.
+Parser::Parser(Parser&&) = default;
+Parser& Parser::operator=(Parser&&) = default;
+Parser::~Parser() = default;
-template<EncodingType Encoding>
-Parser<Encoding> Parser<Encoding>::from_buffer(const char* data, std::size_t size) {
+/// @brief Creates a parser and loads @p size bytes starting at @p data into it.
+/// @param encoding_fallback Forwarded unchanged to load_from_buffer.
+Parser Parser::from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_buffer(data, size));
+ // load_from_buffer returns a reference to result, so the move is required here.
+ return std::move(result.load_from_buffer(data, size, encoding_fallback));
}
-template<EncodingType Encoding>
-Parser<Encoding> Parser<Encoding>::from_buffer(const char* start, const char* end) {
+/// @brief Creates a parser and loads the byte range [@p start, @p end) into it.
+/// @param encoding_fallback Forwarded unchanged to load_from_buffer.
+Parser Parser::from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_buffer(start, end));
+ // load_from_buffer returns a reference to result, so the move is required here.
+ return std::move(result.load_from_buffer(start, end, encoding_fallback));
}
-template<EncodingType Encoding>
-Parser<Encoding> Parser<Encoding>::from_string(const std::string_view string) {
+/// @brief Creates a parser and loads the contents of @p string into it.
+/// @param encoding_fallback Forwarded unchanged to load_from_string.
+Parser Parser::from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_string(string));
+ // load_from_string returns a reference to result, so the move is required here.
+ return std::move(result.load_from_string(string, encoding_fallback));
}
-template<EncodingType Encoding>
-Parser<Encoding> Parser<Encoding>::from_file(const char* path) {
+/// @brief Creates a parser and loads the file at @p path into it.
+/// @param encoding_fallback Forwarded unchanged to load_from_file.
+Parser Parser::from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_file(path));
+ // load_from_file returns a reference to result, so the move is required here.
+ return std::move(result.load_from_file(path, encoding_fallback));
}
-template<EncodingType Encoding>
-Parser<Encoding> Parser<Encoding>::from_file(const std::filesystem::path& path) {
+/// @brief Creates a parser and loads the file at @p path into it.
+/// @param encoding_fallback Forwarded unchanged to load_from_file.
+Parser Parser::from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_file(path));
+ // load_from_file returns a reference to result, so the move is required here.
+ return std::move(result.load_from_file(path, encoding_fallback));
}
///
@@ -106,9 +111,8 @@ Parser<Encoding> Parser<Encoding>::from_file(const std::filesystem::path& path)
/// @param func
/// @param args
///
-template<EncodingType Encoding>
template<typename... Args>
-constexpr void Parser<Encoding>::_run_load_func(detail::LoadCallback<ParseHandler, Args...> auto func, Args... args) {
+constexpr void Parser::_run_load_func(detail::LoadCallback<ParseHandler, Args...> auto func, Args... args) {
_has_fatal_error = false;
auto error = func(_parse_handler.get(), std::forward<Args>(args)...);
auto error_message = _parse_handler->make_error_from(error);
@@ -122,82 +126,66 @@ constexpr void Parser<Encoding>::_run_load_func(detail::LoadCallback<ParseHandle
}
}
-template<EncodingType Encoding>
-constexpr Parser<Encoding>& Parser<Encoding>::load_from_buffer(const char* data, std::size_t size) {
+/// @brief Loads @p size bytes starting at @p data through ParseHandler::load_buffer_size.
+/// @return *this, to allow chaining.
+constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) {
// Type can't be deduced?
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size);
+ _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback);
return *this;
}
-template<EncodingType Encoding>
-constexpr Parser<Encoding>& Parser<Encoding>::load_from_buffer(const char* start, const char* end) {
+/// @brief Loads the byte range [@p start, @p end) through ParseHandler::load_buffer.
+/// @return *this, to allow chaining.
+constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) {
// Type can't be deduced?
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end);
+ _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback);
return *this;
}
-template<EncodingType Encoding>
-constexpr Parser<Encoding>& Parser<Encoding>::load_from_string(const std::string_view string) {
- return load_from_buffer(string.data(), string.size());
+/// @brief Loads the contents of @p string by forwarding its data pointer and size to load_from_buffer.
+constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) {
+ return load_from_buffer(string.data(), string.size(), encoding_fallback);
}
-template<EncodingType Encoding>
-Parser<Encoding>& Parser<Encoding>::load_from_file(const char* path) {
+/// @brief Stores @p path as the file path, then loads that file through ParseHandler::load_file.
+/// @return *this, to allow chaining.
+Parser& Parser::load_from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) {
set_file_path(path);
// Type can be deduced??
- _run_load_func(std::mem_fn(&ParseHandler::load_file), path);
+ // The stored path is passed on, not the caller's pointer.
+ // NOTE(review): assumes get_file_path().data() is null-terminated - confirm.
+ _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback);
return *this;
}
-template<EncodingType Encoding>
-Parser<Encoding>& Parser<Encoding>::load_from_file(const std::filesystem::path& path) {
- return load_from_file(path.string().c_str());
+/// @brief Converts @p path to a string and forwards to the `const char*` overload.
+Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) {
+ // The temporary string lives until the end of this full expression, i.e. for the whole call.
+ return load_from_file(path.string().c_str(), encoding_fallback);
}
-template<EncodingType Encoding>
-bool Parser<Encoding>::parse_csv(bool handle_strings) {
+/// @brief Parses the loaded buffer as semicolon-separated CSV.
+/// @param handle_strings Selects the `strings::SemiColonFile` grammar instead of `SemiColonFile`.
+/// @return false if the parse handler is not valid or the parse reported errors, true otherwise.
+bool Parser::parse_csv(bool handle_strings) {
if (!_parse_handler->is_valid()) {
return false;
}
- std::optional<Parser<Encoding>::error_range> errors;
- // auto report_error = ovdl::detail::ReporError.path(_file_path).to(detail::OStreamOutputIterator { _error_stream });
- if constexpr (Encoding == EncodingType::Windows1252) {
+ std::optional<Parser::error_range> errors = [&] {
if (handle_strings)
- errors = _parse_handler->template parse<csv::grammar::windows1252::strings::SemiColonFile>();
+ return _parse_handler->template parse<csv::grammar::strings::SemiColonFile>();
else
- errors = _parse_handler->template parse<csv::grammar::windows1252::SemiColonFile>();
- } else {
- if (handle_strings)
- errors = _parse_handler->template parse<csv::grammar::utf8::strings::SemiColonFile>();
- else
- errors = _parse_handler->template parse<csv::grammar::utf8::SemiColonFile>();
- }
+ return _parse_handler->template parse<csv::grammar::SemiColonFile>();
+ }();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
- if (!errors->empty()) {
+ // ParseHandler::parse() returns std::nullopt on success, so the optional must be
+ // checked before it is dereferenced.
+ if (errors && !errors->empty()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
}
return false;
}
- _lines = std::move(_parse_handler->get_lines());
return true;
}
-template<EncodingType Encoding>
-const std::vector<csv::LineObject>& Parser<Encoding>::get_lines() const {
- return _lines;
+/// @brief Returns the parsed lines held by the parse handler.
+const std::vector<csv::LineObject>& Parser::get_lines() const {
+ return _parse_handler->get_lines();
}
-template<EncodingType Encoding>
-typename Parser<Encoding>::error_range Parser<Encoding>::get_errors() const {
+/// @brief Returns the errors collected by the parse state's logger.
+typename Parser::error_range Parser::get_errors() const {
return _parse_handler->parse_state().logger().get_errors();
}
-template<EncodingType Encoding>
-const FilePosition Parser<Encoding>::get_error_position(const error::Error* error) const {
+const FilePosition Parser::get_error_position(const error::Error* error) const {
if (!error || !error->is_linked_in_tree()) {
return {};
}
@@ -206,18 +194,27 @@ const FilePosition Parser<Encoding>::get_error_position(const error::Error* erro
return {};
}
- auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin());
- FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
- if (err_location.begin() < err_location.end()) {
- auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor());
- result.end_line = loc_end.line_nr();
- result.end_column = loc_end.column_nr();
- }
- return result;
+// TODO: Remove reinterpret_cast
+// WARNING: This almost certainly breaks on utf16 and utf32 encodings, luckily we don't parse in that format
+// This is purely to silence the node_location errors because char8_t is useless
+#define REINTERPRET_IT(IT) reinterpret_cast<const std::decay_t<decltype(buffer)>::encoding::char_type*>((IT))
+
+ return _parse_handler->parse_state().file().visit_buffer(
+ [&](auto&& buffer) -> FilePosition {
+ auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin()));
+ FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
+ if (err_location.begin() < err_location.end()) {
+ auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor());
+ result.end_line = loc_end.line_nr();
+ result.end_column = loc_end.column_nr();
+ }
+ return result;
+ });
+
+#undef REINTERPRET_IT
}
-template<EncodingType Encoding>
-void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const {
+void Parser::print_errors_to(std::basic_ostream<char>& stream) const {
auto errors = get_errors();
if (errors.empty()) return;
for (const auto error : errors) {
@@ -226,19 +223,9 @@ void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const {
[&](const error::BufferError* buffer_error) {
stream << "buffer error: " << buffer_error->message() << '\n';
},
- [&](const error::ParseError* parse_error) {
- auto position = get_error_position(parse_error);
- std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column);
- stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n';
- },
- [&](dryad::child_visitor<error::ErrorKind> visitor, const error::Semantic* semantic) {
- auto position = get_error_position(semantic);
- std::string pos_str = ": ";
- if (!position.is_empty()) {
- pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column);
- }
- stream << _file_path << pos_str << semantic->message() << '\n';
- auto annotations = semantic->annotations();
+ [&](dryad::child_visitor<error::ErrorKind> visitor, const error::AnnotatedError* annotated_error) {
+ stream << annotated_error->message() << '\n';
+ auto annotations = annotated_error->annotations();
for (auto annotation : annotations) {
visitor(annotation);
}
@@ -250,7 +237,4 @@ void Parser<Encoding>::print_errors_to(std::basic_ostream<char>& stream) const {
stream << secondary->message() << '\n';
});
}
-}
-
-template class ovdl::csv::Parser<EncodingType::Windows1252>;
-template class ovdl::csv::Parser<EncodingType::Utf8>; \ No newline at end of file
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp
new file mode 100644
index 0000000..5d9fca0
--- /dev/null
+++ b/src/openvic-dataloader/detail/Convert.hpp
@@ -0,0 +1,577 @@
+#pragma once
+
+#include <cstddef>
+#include <string_view>
+#include <type_traits>
+
+#include <lexy/_detail/config.hpp>
+#include <lexy/callback/string.hpp>
+#include <lexy/code_point.hpp>
+#include <lexy/dsl/option.hpp>
+#include <lexy/dsl/symbol.hpp>
+#include <lexy/encoding.hpp>
+#include <lexy/input/base.hpp>
+#include <lexy/input/file.hpp>
+#include <lexy/input/string_input.hpp>
+#include <lexy/lexeme.hpp>
+
+#include "openvic-dataloader/detail/Encoding.hpp"
+
+#include "ParseState.hpp" // IWYU pragma: keep
+#include "detail/InternalConcepts.hpp"
+#include "detail/dsl.hpp"
+#include "v2script/ParseState.hpp"
+
+namespace ovdl::convert {
+ /// A single `char` marker paired with a UTF-8 string view.
+ /// A `value` of 0 is reported as invalid and a `value` of 1 as pass.
+ struct MappedChar {
+     char value;
+     std::string_view utf8;
+
+     /// True when `value` holds the 0 marker.
+     constexpr bool is_invalid() const {
+         return value == '\0';
+     }
+
+     /// True when `value` holds the 1 marker.
+     constexpr bool is_pass() const {
+         return value == '\x01';
+     }
+ };
+ /// Entry for which `is_invalid()` holds.
+ constexpr MappedChar invalid_map { '\0', "" };
+ /// Entry for which `is_pass()` holds.
+ constexpr MappedChar pass_map { '\x01', "" };
+
+ /// Outcome of looking one input unit up in a conversion table.
+ ///
+ /// Three states are encoded in `_value`:
+ ///  - pass:    `_value` is empty (default constructed),
+ ///  - invalid: `_value` starts with a NUL character (constructed from `nullptr`),
+ ///  - valid:   `_value` is non-empty and does not start with NUL; it is the replacement text.
+ struct map_value {
+     std::string_view _value;
+
+     /// Creates a "pass" value.
+     constexpr map_value() noexcept : _value("") {}
+     /// Creates an "invalid" value.
+     /// The length is given explicitly: `std::string_view("\0")` would measure the literal
+     /// up to its first NUL and produce an empty view, which is the "pass" state.
+     constexpr map_value(std::nullptr_t) noexcept : _value("\0", 1) {}
+     /// Creates a value holding the replacement text `val`.
+     constexpr explicit map_value(std::string_view val) noexcept : _value(val) {}
+
+     /// True when the value is the NUL-prefixed "invalid" marker.
+     constexpr bool is_invalid() const {
+         return !_value.empty() && _value[0] == '\0';
+     }
+
+     /// True when no replacement text is stored.
+     constexpr bool is_pass() const {
+         return _value.empty();
+     }
+
+     /// True when replacement text is stored (non-empty, not NUL-prefixed).
+     constexpr bool is_valid() const noexcept {
+         return !_value.empty() && _value[0] != '\0';
+     }
+
+     /// Same as `is_valid()`.
+     constexpr explicit operator bool() const noexcept {
+         return is_valid();
+     }
+ };
+
+ /// Satisfied by a type `T` whose static `try_parse`, called with a
+ /// `lexy::_pr<lexy::deduce_encoding<char>>&`, returns exactly `map_value`.
+ template<typename T>
+ concept IsConverter = requires(lexy::_pr<lexy::deduce_encoding<char>>& input_reader) {
+     { T::try_parse(input_reader) } -> std::same_as<map_value>;
+ };
+
+ /// Converter with an empty symbol table: every lookup yields a "pass" `map_value`,
+ /// so input is left unchanged.
+ struct Utf8 {
+     static constexpr auto map = lexy::symbol_table<std::string_view>;
+
+     /// Always returns a default-constructed (pass) `map_value`; the reader is not touched.
+     template<typename Reader>
+     static constexpr map_value try_parse([[maybe_unused]] Reader& reader) {
+         return map_value();
+     }
+ };
+ static_assert(IsConverter<Utf8>);
+
+ struct Windows1252 {
+ static constexpr auto map = lexy::symbol_table<std::string_view> //
+ .map<'\x80'>("€")
+ .map<'\x82'>("‚")
+ .map<'\x83'>("ƒ")
+ .map<'\x84'>("„")
+ .map<'\x85'>("…")
+ .map<'\x86'>("†")
+ .map<'\x87'>("‡")
+ .map<'\x88'>("ˆ")
+ .map<'\x89'>("‰")
+ .map<'\x8A'>("Š")
+ .map<'\x8B'>("‹")
+ .map<'\x8C'>("Œ")
+ .map<'\x8E'>("Ž")
+
+ .map<'\x91'>("‘")
+ .map<'\x92'>("’")
+ .map<'\x93'>("“")
+ .map<'\x94'>("”")
+ .map<'\x95'>("•")
+ .map<'\x96'>("–")
+ .map<'\x97'>("—")
+ .map<'\x98'>("˜")
+ .map<'\x99'>("™")
+ .map<'\x9A'>("š")
+ .map<'\x9B'>("›")
+ .map<'\x9C'>("œ")
+ .map<'\x9E'>("ž")
+ .map<'\x9F'>("Ÿ")
+
+ .map<'\xA0'>(" ")
+ .map<'\xA1'>("¡")
+ .map<'\xA2'>("¢")
+ .map<'\xA3'>("£")
+ .map<'\xA4'>("¤")
+ .map<'\xA5'>("¥")
+ .map<'\xA6'>("¦")
+ .map<'\xA7'>("§")
+ .map<'\xA8'>("¨")
+ .map<'\xA9'>("©")
+ .map<'\xAA'>("ª")
+ .map<'\xAB'>("«")
+ .map<'\xAC'>("¬")
+ .map<'\xAD'>("­") // Soft Hyphen
+ .map<'\xAE'>("®")
+ .map<'\xAF'>("¯")
+
+ .map<'\xB0'>("°")
+ .map<'\xB1'>("±")
+ .map<'\xB2'>("²")
+ .map<'\xB3'>("³")
+ .map<'\xB4'>("´")
+ .map<'\xB5'>("µ")
+ .map<'\xB6'>("¶")
+ .map<'\xB7'>("·")
+ .map<'\xB8'>("¸")
+ .map<'\xB9'>("¹")
+ .map<'\xBA'>("º")
+ .map<'\xBB'>("»")
+ .map<'\xBC'>("¼")
+ .map<'\xBD'>("½")
+ .map<'\xBE'>("¾")
+ .map<'\xBF'>("¿")
+
+ .map<'\xC0'>("À")
+ .map<'\xC1'>("Á")
+ .map<'\xC2'>("Â")
+ .map<'\xC3'>("Ã")
+ .map<'\xC4'>("Ä")
+ .map<'\xC5'>("Å")
+ .map<'\xC6'>("Æ")
+ .map<'\xC7'>("Ç")
+ .map<'\xC8'>("È")
+ .map<'\xC9'>("É")
+ .map<'\xCA'>("Ê")
+ .map<'\xCB'>("Ë")
+ .map<'\xCC'>("Ì")
+ .map<'\xCD'>("Í")
+ .map<'\xCE'>("Î")
+ .map<'\xCF'>("Ï")
+
+ .map<'\xD0'>("Ð")
+ .map<'\xD1'>("Ñ")
+ .map<'\xD2'>("Ò")
+ .map<'\xD3'>("Ó")
+ .map<'\xD4'>("Ô")
+ .map<'\xD5'>("Õ")
+ .map<'\xD6'>("Ö")
+ .map<'\xD7'>("×")
+ .map<'\xD8'>("Ø")
+ .map<'\xD9'>("Ù")
+ .map<'\xDA'>("Ú")
+ .map<'\xDB'>("Û")
+ .map<'\xDC'>("Ü")
+ .map<'\xDD'>("Ý")
+ .map<'\xDE'>("Þ")
+ .map<'\xDF'>("ß")
+
+ .map<'\xE0'>("à")
+ .map<'\xE1'>("á")
+ .map<'\xE2'>("â")
+ .map<'\xE3'>("ã")
+ .map<'\xE4'>("ä")
+ .map<'\xE5'>("å")
+ .map<'\xE6'>("æ")
+ .map<'\xE7'>("ç")
+ .map<'\xE8'>("è")
+ .map<'\xE9'>("é")
+ .map<'\xEA'>("ê")
+ .map<'\xEB'>("ë")
+ .map<'\xEC'>("ì")
+ .map<'\xED'>("í")
+ .map<'\xEE'>("î")
+ .map<'\xEF'>("ï")
+
+ .map<'\xF0'>("ð")
+ .map<'\xF1'>("ñ")
+ .map<'\xF2'>("ò")
+ .map<'\xF3'>("ó")
+ .map<'\xF4'>("ô")
+ .map<'\xF5'>("õ")
+ .map<'\xF6'>("ö")
+ .map<'\xF7'>("÷")
+ .map<'\xF8'>("ø")
+ .map<'\xF9'>("ù")
+ .map<'\xFA'>("ú")
+ .map<'\xFB'>("û")
+ .map<'\xFC'>("ü")
+ .map<'\xFD'>("ý")
+ .map<'\xFE'>("þ")
+ .map<'\xFF'>("ÿ");
+
+ /// Looks the reader's next input up in `map`.
+ /// Returns the mapped UTF-8 text on a match, otherwise a default (pass) `map_value`.
+ template<typename Reader>
+ static constexpr map_value try_parse(Reader& reader) {
+     if (auto matched = map.try_parse(reader); matched) {
+         return map_value(map[matched]);
+     }
+     return map_value();
+ }
+ };
+ static_assert(IsConverter<Windows1252>);
+
+ struct Windows1251 {
+ static constexpr auto map = lexy::symbol_table<std::string_view> //
+ .map<'\x80'>("Ђ")
+ .map<'\x81'>("Ѓ")
+ .map<'\x82'>("‚")
+ .map<'\x83'>("ѓ")
+ .map<'\x84'>("„")
+ .map<'\x85'>("…")
+ .map<'\x86'>("†")
+ .map<'\x87'>("‡")
+ .map<'\x88'>("€")
+ .map<'\x89'>("‰")
+ .map<'\x8A'>("Љ")
+ .map<'\x8B'>("‹")
+ .map<'\x8C'>("Њ")
+ .map<'\x8D'>("Ќ")
+ .map<'\x8E'>("Ћ")
+ .map<'\x8F'>("Џ")
+
+ .map<'\x90'>("ђ")
+ .map<'\x91'>("‘")
+ .map<'\x92'>("’")
+ .map<'\x93'>("“")
+ .map<'\x94'>("”")
+ .map<'\x95'>("•")
+ .map<'\x96'>("–")
+ .map<'\x97'>("—")
+ .map<'\x99'>("™")
+ .map<'\x9A'>("љ")
+ .map<'\x9B'>("›")
+ .map<'\x9C'>("њ")
+ .map<'\x9D'>("ќ")
+ .map<'\x9E'>("ћ")
+ .map<'\x9F'>("џ")
+
+ .map<'\xA0'>(" ")
+ .map<'\xA1'>("Ў")
+ .map<'\xA2'>("ў")
+ .map<'\xA3'>("Ј")
+ .map<'\xA4'>("¤")
+ .map<'\xA5'>("Ґ")
+ .map<'\xA6'>("¦")
+ .map<'\xA7'>("§")
+ .map<'\xA8'>("Ё")
+ .map<'\xA9'>("©")
+ .map<'\xAA'>("Є")
+ .map<'\xAB'>("«")
+ .map<'\xAC'>("¬")
+ .map<'\xAD'>("­") // Soft Hyphen
+ .map<'\xAE'>("®")
+ .map<'\xAF'>("Ї")
+
+ .map<'\xB0'>("°")
+ .map<'\xB1'>("±")
+ .map<'\xB2'>("І")
+ .map<'\xB3'>("і")
+ .map<'\xB4'>("ґ")
+ .map<'\xB5'>("µ")
+ .map<'\xB6'>("¶")
+ .map<'\xB7'>("·")
+ .map<'\xB8'>("ё")
+ .map<'\xB9'>("№")
+ .map<'\xBA'>("є")
+ .map<'\xBB'>("»")
+ .map<'\xBC'>("ј")
+ .map<'\xBD'>("Ѕ")
+ .map<'\xBE'>("ѕ")
+ .map<'\xBF'>("ї")
+
+ .map<'\xC0'>("А")
+ .map<'\xC1'>("Б")
+ .map<'\xC2'>("В")
+ .map<'\xC3'>("Г")
+ .map<'\xC4'>("Д")
+ .map<'\xC5'>("Е")
+ .map<'\xC6'>("Ж")
+ .map<'\xC7'>("З")
+ .map<'\xC8'>("И")
+ .map<'\xC9'>("Й")
+ .map<'\xCA'>("К")
+ .map<'\xCB'>("Л")
+ .map<'\xCC'>("М")
+ .map<'\xCD'>("Н")
+ .map<'\xCE'>("О")
+ .map<'\xCF'>("П")
+
+ .map<'\xD0'>("Р")
+ .map<'\xD1'>("С")
+ .map<'\xD2'>("Т")
+ .map<'\xD3'>("У")
+ .map<'\xD4'>("Ф")
+ .map<'\xD5'>("Х")
+ .map<'\xD6'>("Ц")
+ .map<'\xD7'>("Ч")
+ .map<'\xD8'>("Ш")
+ .map<'\xD9'>("Щ")
+ .map<'\xDA'>("Ъ")
+ .map<'\xDB'>("Ы")
+ .map<'\xDC'>("Ь")
+ .map<'\xDD'>("Э")
+ .map<'\xDE'>("Ю")
+ .map<'\xDF'>("Я")
+
+ .map<'\xE0'>("а")
+ .map<'\xE1'>("б")
+ .map<'\xE2'>("в")
+ .map<'\xE3'>("г")
+ .map<'\xE4'>("д")
+ .map<'\xE5'>("е")
+ .map<'\xE6'>("ж")
+ .map<'\xE7'>("з")
+ .map<'\xE8'>("и")
+ .map<'\xE9'>("й")
+ .map<'\xEA'>("к")
+ .map<'\xEB'>("л")
+ .map<'\xEC'>("м")
+ .map<'\xED'>("н")
+ .map<'\xEE'>("о")
+ .map<'\xEF'>("п")
+
+ .map<'\xF0'>("р")
+ .map<'\xF1'>("с")
+ .map<'\xF2'>("т")
+ .map<'\xF3'>("у")
+ .map<'\xF4'>("ф")
+ .map<'\xF5'>("х")
+ .map<'\xF6'>("ц")
+ .map<'\xF7'>("ч")
+ .map<'\xF8'>("ш")
+ .map<'\xF9'>("щ")
+ .map<'\xFA'>("ъ")
+ .map<'\xFB'>("ы")
+ .map<'\xFC'>("ь")
+ .map<'\xFD'>("э")
+ .map<'\xFE'>("ю")
+ .map<'\xFF'>("я");
+
+ /// Looks the reader's next input up in `map`.
+ /// Returns the mapped UTF-8 text on a match, otherwise a default (pass) `map_value`.
+ template<typename Reader>
+ static constexpr map_value try_parse(Reader& reader) {
+     if (auto matched = map.try_parse(reader); matched) {
+         return map_value(map[matched]);
+     }
+     return map_value();
+ }
+ };
+ static_assert(IsConverter<Windows1251>);
+
+ /// Runs the converter that belongs to `encoding` against `reader`.
+ ///
+ /// `encoding` is taken by value: it is a small enum, and a by-value parameter accepts
+ /// both lvalue and rvalue arguments (an rvalue-reference parameter rejects lvalues).
+ ///
+ /// @param encoding Encoding selecting the converter.
+ /// @param reader   Reader positioned at the input to convert.
+ /// @return The selected converter's `try_parse` result.
+ template<typename Reader>
+ constexpr map_value try_parse_map(detail::Encoding encoding, Reader& reader) {
+     switch (encoding) {
+         // Unknown, Ascii and Utf8 all use the Utf8 converter, which always returns "pass".
+         case detail::Encoding::Unknown:
+         case detail::Encoding::Ascii:
+         case detail::Encoding::Utf8: return Utf8::try_parse(reader);
+         case detail::Encoding::Windows1251: return Windows1251::try_parse(reader);
+         case detail::Encoding::Windows1252: return Windows1252::try_parse(reader);
+     }
+     // Reached only when `encoding` holds a value outside the enumerators handled above.
+     ovdl::detail::unreachable();
+ }
+
+ template<typename String> // type of indexing a String with [0], passed through LEXY_DECAY_DECLTYPE (presumably decayed — confirm against lexy)
+ using _string_char_type = LEXY_DECAY_DECLTYPE(LEXY_DECLVAL(String)[0]);
+
+ /// Satisfied when `CharT` is convertible to `char` and `T` provides a static
+ /// `on_invalid_character(state, reader)` callable with a v2script `ParseState&`
+ /// and a `lexy::_pr<lexy::deduce_encoding<CharT>>`.
+ template<typename T, typename CharT>
+ concept IsErrorHandler =
+     std::is_convertible_v<CharT, char> //
+     && requires(ovdl::v2script::ast::ParseState& parse_state, lexy::_pr<lexy::deduce_encoding<CharT>> input_reader) {
+            T::on_invalid_character(parse_state, input_reader);
+        };
+
+ /// Error handler whose `on_invalid_character` does nothing.
+ struct EmptyHandler {
+     static constexpr void on_invalid_character(
+         [[maybe_unused]] detail::IsStateType auto& state,
+         [[maybe_unused]] auto reader) {
+         // Intentionally empty.
+     }
+ };
+
+ /// Sink that appends characters, strings, iterator ranges and lexemes to a `String`,
+ /// passing the input through `try_parse_map(state.encoding(), ...)`.
+ ///
+ /// For each input unit the lookup result decides what is appended:
+ ///  - mapped:  the replacement text `val._value`,
+ ///  - pass:    the original unit, unchanged,
+ ///  - invalid: nothing; `Error::on_invalid_character` is called and the unit is dropped.
+ ///
+ /// The character, string and iterator overloads skip the lookup entirely for the
+ /// Ascii and Utf8 encodings and append the input as-is.
+ template<typename String,
+     IsErrorHandler<_string_char_type<String>> Error = EmptyHandler>
+ constexpr auto convert_as_string =
+     dsl::sink<String>(
+         lexy::fold_inplace<String>(
+             std::initializer_list<_string_char_type<String>> {}, //
+             // Overload 1: a single character.
+             []<typename CharT, typename = decltype(LEXY_DECLVAL(String).push_back(CharT()))>(String& result, detail::IsStateType auto& state, CharT c) {
+                 if constexpr (std::is_convertible_v<CharT, char>) {
+                     switch (state.encoding()) {
+                         using enum ovdl::detail::Encoding;
+                         case Ascii:
+                         case Utf8:
+                             break;
+                         // Skip Ascii and Utf8 encoding
+                         default: {
+                             // Wrap the character in a one-element range so it can be read through a lexy reader.
+                             CharT char_array[] { c, CharT() };
+                             auto input = lexy::range_input(&char_array[0], &char_array[1]);
+                             auto reader = input.reader();
+
+                             // Prefer preserving characters for unknown conversion maps: at least things
+                             // will work, they'll just probably display wrong.
+                             const map_value val = try_parse_map(state.encoding(), reader);
+
+                             // Invalid characters are dropped
+                             if (val.is_invalid()) {
+                                 Error::on_invalid_character(state, reader);
+                                 return;
+                             }
+
+                             // non-pass characters are not valid ascii and are mapped to utf8 values
+                             if (!val.is_pass()) {
+                                 result.append(val._value);
+                                 return;
+                             }
+
+                             break;
+                         }
+                     }
+                 }
+
+                 result.push_back(c); //
+             }, //
+             // Overload 2: an rvalue string.
+             [](String& result, detail::IsStateType auto& state, String&& str) {
+                 if constexpr (std::is_convertible_v<typename String::value_type, char>) {
+                     switch (state.encoding()) {
+                         using enum ovdl::detail::Encoding;
+                         case Ascii:
+                         case Utf8:
+                             break;
+                         // Skip Ascii and Utf8 encoding
+                         default: {
+                             auto input = lexy::string_input(str);
+                             auto reader = input.reader();
+                             using encoding = decltype(reader)::encoding;
+                             constexpr auto eof = encoding::eof();
+
+                             if constexpr (requires { result.reserve(str.size()); }) {
+                                 result.reserve(str.size());
+                             }
+
+                             const auto start = reader.position();
+                             // Start of the input that has not been appended (or dropped) yet.
+                             auto last_it = start;
+                             while (reader.peek() != eof) {
+                                 map_value val = try_parse_map(state.encoding(), reader);
+
+                                 if (val.is_invalid()) {
+                                     Error::on_invalid_character(state, reader);
+                                     reader.bump();
+                                     // Move past the invalid unit so the next append does not copy it.
+                                     last_it = reader.position();
+                                     continue;
+                                 } else if (!val.is_pass()) {
+                                     result.append(val._value);
+                                     last_it = reader.position();
+                                     continue;
+                                 }
+
+                                 reader.bump();
+                                 result.append(last_it, reader.position());
+                                 last_it = reader.position();
+                             }
+                             // Anything consumed means the result is already complete.
+                             if (last_it != start) {
+                                 result.append(last_it, reader.position());
+                                 return;
+                             }
+                             break;
+                         }
+                     }
+                 }
+
+                 result.append(LEXY_MOV(str)); //
+             }, //
+             // Overload 3: an iterator range.
+             []<typename Str = String, typename Iterator>(String& result, detail::IsStateType auto& state, Iterator begin, Iterator end) //
+                 -> decltype(void(LEXY_DECLVAL(Str).append(begin, end))) {
+                 if constexpr (std::is_convertible_v<typename String::value_type, char>) {
+                     switch (state.encoding()) {
+                         using enum ovdl::detail::Encoding;
+                         case Ascii:
+                         case Utf8:
+                             break;
+                         // Skip Ascii and Utf8 encoding
+                         default: {
+                             auto input = lexy::range_input(begin, end);
+                             auto reader = input.reader();
+                             using encoding = decltype(reader)::encoding;
+                             constexpr auto eof = encoding::eof();
+
+                             if constexpr (requires { result.reserve(end - begin); }) {
+                                 result.reserve(end - begin);
+                             }
+
+                             // Named `start` so it does not shadow the `begin` parameter.
+                             const auto start = reader.position();
+                             auto last_it = start;
+                             while (reader.peek() != eof) {
+                                 map_value val = try_parse_map(state.encoding(), reader);
+
+                                 if (val.is_invalid()) {
+                                     Error::on_invalid_character(state, reader);
+                                     reader.bump();
+                                     // Move past the invalid unit so the next append does not copy it.
+                                     last_it = reader.position();
+                                     continue;
+                                 } else if (!val.is_pass()) {
+                                     result.append(val._value);
+                                     last_it = reader.position();
+                                     continue;
+                                 }
+
+                                 reader.bump();
+                                 result.append(last_it, reader.position());
+                                 last_it = reader.position();
+                             }
+                             if (last_it != start) {
+                                 result.append(last_it, reader.position());
+                                 return;
+                             }
+                             break;
+                         }
+                     }
+                 }
+
+                 result.append(begin, end); //
+             }, //
+             // Overload 4: a lexeme. Conversion runs for default/byte encoded readers regardless of
+             // state.encoding(); try_parse_map itself returns "pass" for Ascii and Utf8.
+             []<typename Reader>(String& result, detail::IsStateType auto& state, lexy::lexeme<Reader> lex) {
+                 using encoding = typename Reader::encoding;
+                 using _char_type = _string_char_type<String>;
+                 static_assert(lexy::char_type_compatible_with_reader<Reader, _char_type>,
+                     "cannot convert lexeme to this string type");
+
+                 if constexpr ((std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) &&
+                               std::convertible_to<typename String::value_type, char>) {
+                     auto input = lexy::range_input(lex.begin(), lex.end());
+                     auto reader = input.reader();
+                     using encoding = decltype(reader)::encoding;
+                     constexpr auto eof = encoding::eof();
+
+                     if constexpr (requires { result.reserve(lex.end() - lex.begin()); }) {
+                         result.reserve(lex.end() - lex.begin());
+                     }
+
+                     const auto start = reader.position();
+                     auto last_it = start;
+                     while (reader.peek() != eof) {
+                         map_value val = try_parse_map(state.encoding(), reader);
+
+                         if (val.is_invalid()) {
+                             Error::on_invalid_character(state, reader);
+                             reader.bump();
+                             // Move past the invalid unit so the next append does not copy it.
+                             last_it = reader.position();
+                             continue;
+                         } else if (!val.is_pass()) {
+                             result.append(val._value);
+                             last_it = reader.position();
+                             continue;
+                         }
+
+                         reader.bump();
+                         result.append(last_it, reader.position());
+                         last_it = reader.position();
+                     }
+                     if (last_it != start) {
+                         result.append(last_it, reader.position());
+                         return;
+                     }
+                 }
+
+                 result.append(lex.begin(), lex.end()); //
+             }));
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp
new file mode 100644
index 0000000..1516fc7
--- /dev/null
+++ b/src/openvic-dataloader/detail/Detect.cpp
@@ -0,0 +1,351 @@
+#include "detail/Detect.hpp"
+
+using namespace ovdl;
+using namespace ovdl::encoding_detect;
+
+static constexpr int64_t INVALID_CLASS = 255; // classify() result for which a candidate's read() returns std::nullopt
+
+/// Returns 0 when `is_utf8` accepts the bytes of `buffer`, std::nullopt otherwise.
+std::optional<int64_t> Utf8Canidate::read(const std::span<const cbyte>& buffer) {
+    auto raw_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size());
+    if (!is_utf8(raw_buffer)) {
+        return std::nullopt;
+    }
+    return 0;
+}
+
+/// Returns 0 when `is_ascii` accepts the bytes of `buffer`, std::nullopt otherwise.
+std::optional<int64_t> AsciiCanidate::read(const std::span<const cbyte>& buffer) {
+    auto raw_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size());
+    if (!is_ascii(raw_buffer)) {
+        return std::nullopt;
+    }
+    return 0;
+}
+
+/// Scores `buffer` against this candidate's non-Latin cased single-byte encoding.
+///
+/// Returns std::nullopt as soon as a byte classifies as INVALID_CLASS; otherwise returns
+/// the score accumulated over `buffer`. The tracking state (prev, prev_ascii, prev_was_a0,
+/// case_state, current_word_len, longest_word) is not reset here.
+std::optional<int64_t> NonLatinCasedCanidate::read(const std::span<const cbyte>& buffer) {
+    static constexpr cbyte LATIN_LETTER = 1;
+    static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20;
+    static constexpr int64_t NON_LATIN_ALL_CAPS_PENALTY = -40;
+    static constexpr int64_t NON_LATIN_CAPITALIZATION_BONUS = 40;
+    static constexpr int64_t LATIN_ADJACENCY_PENALTY = -50;
+
+    int64_t score = 0;
+    for (const ubyte& b : buffer) {
+        const ubyte byte_class = score_data.classify(b);
+        if (byte_class == INVALID_CLASS) {
+            return std::nullopt;
+        }
+
+        // The top bit of the class is tested below as the upper-case flag; the low 7 bits are the class.
+        const ubyte caseless_class = byte_class & 0x7F;
+        const bool ascii = b < 0x80;
+        // A pair is ASCII only when both the previous and the current byte are ASCII.
+        const bool ascii_pair = prev_ascii && ascii;
+        const bool non_ascii_alphabetic = score_data.is_non_latin_alphabetic(caseless_class);
+
+        // Word-level capitalisation state machine.
+        if (caseless_class == LATIN_LETTER) {
+            case_state = CaseState::Mix;
+        } else if (!non_ascii_alphabetic) {
+            // End of a word: settle the score for the word that just ended.
+            switch (case_state) {
+                default: break;
+                case CaseState::UpperLower:
+                    score += NON_LATIN_CAPITALIZATION_BONUS;
+                    break;
+                case CaseState::AllCaps:
+                    // pass
+                    break;
+                case CaseState::Mix:
+                    score += NON_LATIN_MIXED_CASE_PENALTY * current_word_len;
+                    break;
+            }
+            case_state = CaseState::Space;
+        } else if (byte_class >> 7 == 0) {
+            // Lower-case non-Latin letter.
+            switch (case_state) {
+                default: break;
+                case CaseState::Space:
+                    case_state = CaseState::Lower;
+                    break;
+                case CaseState::Upper:
+                    case_state = CaseState::UpperLower;
+                    break;
+                case CaseState::AllCaps:
+                    case_state = CaseState::Mix;
+                    break;
+            }
+        } else {
+            // Upper-case non-Latin letter.
+            switch (case_state) {
+                default: break;
+                case CaseState::Space:
+                    case_state = CaseState::Upper;
+                    break;
+                case CaseState::Upper:
+                    case_state = CaseState::AllCaps;
+                    break;
+                case CaseState::Lower:
+                case CaseState::UpperLower:
+                    case_state = CaseState::Mix;
+                    break;
+            }
+        }
+
+        // Track the length of the current run of non-Latin letters.
+        if (non_ascii_alphabetic) {
+            current_word_len += 1;
+        } else {
+            if (current_word_len > longest_word) {
+                longest_word = current_word_len;
+            }
+            current_word_len = 0;
+        }
+
+        const bool is_a0 = b == 0xA0;
+
+        if (!ascii_pair) {
+            // 0xA0 is no-break space in many other encodings, so avoid
+            // assigning score to IBM866 when 0xA0 occurs next to itself
+            // or a space-like byte.
+            if (!(ibm866 && ((is_a0 && (prev_was_a0 || prev == 0)) || (caseless_class == 0 && prev_was_a0)))) {
+                score += score_data.score(caseless_class, prev);
+            }
+
+            // A Latin letter directly next to a non-Latin letter is penalised in either order.
+            if (prev == LATIN_LETTER &&
+                non_ascii_alphabetic) {
+                score += LATIN_ADJACENCY_PENALTY;
+            } else if (caseless_class == LATIN_LETTER && score_data.is_non_latin_alphabetic(prev)) {
+                score += LATIN_ADJACENCY_PENALTY;
+            }
+        }
+
+        prev_ascii = ascii;
+        prev = caseless_class;
+        prev_was_a0 = is_a0;
+    }
+    return score;
+}
+
+/// Scores `buffer` against this candidate's Latin single-byte encoding.
+///
+/// Returns std::nullopt as soon as a byte classifies as INVALID_CLASS; otherwise returns
+/// the score accumulated over `buffer`. The tracking state (prev, prev_non_ascii,
+/// case_state, ordinal_state) is not reset here.
+std::optional<int64_t> LatinCanidate::read(const std::span<const cbyte>& buffer) {
+    static constexpr int64_t IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY = -180;
+    static constexpr int64_t ORDINAL_BONUS = 300;
+    static constexpr int64_t COPYRIGHT_BONUS = 222;
+    static constexpr int64_t IMPLAUSIBILITY_PENALTY = -220;
+
+    int64_t score = 0;
+    for (const ubyte& b : buffer) {
+        const ubyte byte_class = score_data.classify(b);
+        if (byte_class == INVALID_CLASS) {
+            return std::nullopt;
+        }
+
+        // The top bit of the class is tested below as the upper-case flag; the low 7 bits are the class.
+        const ubyte caseless_class = byte_class & 0x7F;
+        const bool ascii = b < 0x80;
+        const bool ascii_pair = prev_non_ascii == 0 && ascii;
+
+        // Penalty grows with the number of consecutive non-ASCII bytes seen so far.
+        int16_t non_ascii_penalty = -200;
+        switch (prev_non_ascii) {
+            case 0:
+            case 1:
+            case 2:
+                non_ascii_penalty = 0;
+                break;
+            case 3:
+                non_ascii_penalty = -5;
+                break;
+            case 4:
+                // Negative like its neighbours: a positive value here would reward a long non-ASCII run.
+                non_ascii_penalty = -20;
+                break;
+        }
+        score += non_ascii_penalty;
+
+        // Capitalisation state machine.
+        if (!score_data.is_latin_alphabetic(caseless_class)) {
+            case_state = CaseState::Space;
+        } else if (byte_class >> 7 == 0) {
+            // Lower case directly after an all-caps run.
+            if (case_state == CaseState::AllCaps && !ascii_pair) {
+                score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
+            }
+            case_state = CaseState::Lower;
+        } else {
+            switch (case_state) {
+                case CaseState::Lower:
+                    // Upper case directly after lower case.
+                    if (!ascii_pair) {
+                        score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
+                    }
+                    [[fallthrough]];
+                case CaseState::Space:
+                    case_state = CaseState::Upper;
+                    break;
+                case CaseState::Upper:
+                case CaseState::AllCaps:
+                    case_state = CaseState::AllCaps;
+                    break;
+            }
+        }
+
+        bool ascii_ish_pair = ascii_pair || (ascii && prev == 0) || (caseless_class == 0 && prev_non_ascii == 0);
+
+        if (!ascii_ish_pair) {
+            score += score_data.score(caseless_class, prev);
+        }
+
+        // Ordinal indicator (0xAA / 0xBA) and copyright sign (0xA9) state machine; only runs when `windows1252` is set.
+        if (windows1252) {
+            switch (ordinal_state) {
+                case OrdinalState::Other:
+                    if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    }
+                    break;
+                case OrdinalState::Space:
+                    if (caseless_class == 0) {
+                        // pass
+                    } else if (b == 0xAA || b == 0xBA) {
+                        ordinal_state = OrdinalState::OrdinalExpectingSpace;
+                    } else if (b == 'M' || b == 'D' || b == 'S') {
+                        ordinal_state = OrdinalState::FeminineAbbreviationStartLetter;
+                    } else if (b == 'N') {
+                        // numero or Nuestra
+                        ordinal_state = OrdinalState::UpperN;
+                    } else if (b == 'n') {
+                        // numero
+                        ordinal_state = OrdinalState::LowerN;
+                    } else if (caseless_class == ASCII_DIGIT) {
+                        ordinal_state = OrdinalState::Digit;
+                    } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 /* X */) {
+                        ordinal_state = OrdinalState::Roman;
+                    } else if (b == 0xA9) {
+                        ordinal_state = OrdinalState::Copyright;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::OrdinalExpectingSpace:
+                    if (caseless_class == 0) {
+                        score += ORDINAL_BONUS;
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    // Must not fall into the next case, which would add a second bonus.
+                    break;
+                case OrdinalState::OrdinalExpectingSpaceUndoImplausibility:
+                    if (caseless_class == 0) {
+                        score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::OrdinalExpectingSpaceOrDigit:
+                    if (caseless_class == 0) {
+                        score += ORDINAL_BONUS;
+                        ordinal_state = OrdinalState::Space;
+                    } else if (caseless_class == ASCII_DIGIT) {
+                        score += ORDINAL_BONUS;
+                        // Deliberately set to `Other`
+                        ordinal_state = OrdinalState::Other;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily:
+                    if (caseless_class == 0) {
+                        score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
+                        ordinal_state = OrdinalState::Space;
+                    } else if (caseless_class == ASCII_DIGIT) {
+                        score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
+                        // Deliberately set to `Other`
+                        ordinal_state = OrdinalState::Other;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::UpperN:
+                    if (b == 0xAA) {
+                        ordinal_state =
+                            OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
+                    } else if (b == 0xBA) {
+                        ordinal_state =
+                            OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
+                    } else if (b == '.') {
+                        ordinal_state = OrdinalState::PeriodAfterN;
+                    } else if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::LowerN:
+                    if (b == 0xBA) {
+                        ordinal_state =
+                            OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
+                    } else if (b == '.') {
+                        ordinal_state = OrdinalState::PeriodAfterN;
+                    } else if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::FeminineAbbreviationStartLetter:
+                    if (b == 0xAA) {
+                        ordinal_state =
+                            OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
+                    } else if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::Digit:
+                    if (b == 0xAA || b == 0xBA) {
+                        ordinal_state = OrdinalState::OrdinalExpectingSpace;
+                    } else if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    } else if (caseless_class == ASCII_DIGIT) {
+                        // pass
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::Roman:
+                    if (b == 0xAA || b == 0xBA) {
+                        ordinal_state =
+                            OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
+                    } else if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 /* X */) {
+                        // pass
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::PeriodAfterN:
+                    if (b == 0xBA) {
+                        ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit;
+                    } else if (caseless_class == 0) {
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+                case OrdinalState::Copyright:
+                    if (caseless_class == 0) {
+                        score += COPYRIGHT_BONUS;
+                        ordinal_state = OrdinalState::Space;
+                    } else {
+                        ordinal_state = OrdinalState::Other;
+                    }
+                    break;
+            }
+        }
+
+        // Count consecutive non-ASCII bytes; any ASCII byte resets the run.
+        if (ascii) {
+            prev_non_ascii = 0;
+        } else {
+            prev_non_ascii += 1;
+        }
+        prev = caseless_class;
+    }
+    return score;
+}
+
+template struct ovdl::encoding_detect::DetectUtf8<true>;
+template struct ovdl::encoding_detect::DetectUtf8<false>;
diff --git a/src/openvic-dataloader/detail/Detect.hpp b/src/openvic-dataloader/detail/Detect.hpp
new file mode 100644
index 0000000..ad36d04
--- /dev/null
+++ b/src/openvic-dataloader/detail/Detect.hpp
@@ -0,0 +1,627 @@
+/// Based heavily on https://github.com/hsivonen/chardetng/tree/143dadde20e283a46ef33ba960b517a3283a3d22
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <optional>
+#include <span>
+#include <type_traits>
+#include <variant>
+#include <vector>
+
+#include <openvic-dataloader/detail/Encoding.hpp>
+
+#include <lexy/action/match.hpp>
+#include <lexy/callback/constant.hpp>
+#include <lexy/dsl.hpp>
+#include <lexy/dsl/ascii.hpp>
+#include <lexy/dsl/newline.hpp>
+#include <lexy/encoding.hpp>
+#include <lexy/input/buffer.hpp>
+
+#include "detail/dsl.hpp"
+
+namespace ovdl::encoding_detect {
+ // Byte type used for raw input buffers; note that the signedness of plain `char` is implementation-defined.
+ using cbyte = char;
+ // Unsigned byte type used by the classification and score tables.
+ using ubyte = unsigned char;
+
+ // Shorthand for the library-wide encoding enumeration.
+ using Encoding = detail::Encoding;
+
+ /// lexy production used by is_ascii(): consumes ASCII characters for as long as possible and then
+ /// requires end-of-line/end-of-file at the position where consumption stopped.
+ struct DetectAscii {
+ // & 0b10000000 == 0b00000000
+ // NOTE(review): newline characters are ASCII themselves, so the trailing `eol` can presumably only
+ // succeed at end of input — confirm against the lexy documentation.
+ static constexpr auto rule = lexy::dsl::while_(lexy::dsl::ascii::character) + lexy::dsl::eol;
+ // The production always yields `true` when the rule matches.
+ static constexpr auto value = lexy::constant(true);
+ };
+
+ /// lexy production that checks whether the whole input is made of ASCII characters and
+ /// structurally well-formed UTF-8 multi-byte sequences (lead byte + expected count of continuation bytes).
+ ///
+ /// @tparam IncludeAscii Initial value of the "saw a multi-byte sequence" flag. When `true`, input made
+ ///         only of ASCII is accepted; when `false`, at least one multi-byte sequence is required.
+ template<bool IncludeAscii>
+ struct DetectUtf8 {
+ // Error tag raised when the flag is still unset after the scan.
+ struct not_utf8 {
+ static constexpr auto name = "not utf8";
+ };
+
+ static constexpr auto rule = [] {
+ // Boolean context variable, set once any multi-byte sequence has been matched.
+ constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>;
+
+ // NOTE(review): dsl::lit_b_range comes from detail/dsl.hpp and presumably matches one byte
+ // within the given inclusive range — confirm there.
+ // & 0b10000000 == 0b00000000
+ constexpr auto ascii_values = lexy::dsl::ascii::character;
+ // & 0b11100000 == 0b11000000
+ constexpr auto two_byte = dsl::lit_b_range<0b11000000, 0b11011111>;
+ // & 0b11110000 == 0b11100000
+ constexpr auto three_byte = dsl::lit_b_range<0b11100000, 0b11101111>;
+ // & 0b11111000 == 0b11110000
+ constexpr auto four_byte = dsl::lit_b_range<0b11110000, 0b11110111>;
+ // & 0b11000000 == 0b10000000
+ constexpr auto check_bytes = dsl::lit_b_range<0b10000000, 0b10111111>;
+
+ // A lead byte selects how many continuation bytes must follow; a successful match sets the flag.
+ constexpr auto utf8_check =
+ ((four_byte >> lexy::dsl::times<3>(check_bytes)) |
+ (three_byte >> lexy::dsl::times<2>(check_bytes)) |
+ (two_byte >> lexy::dsl::times<1>(check_bytes))) >>
+ is_not_ascii_flag.set();
+
+ // Create the flag, scan sequences/ASCII repeatedly, require the flag, then require end of input.
+ return is_not_ascii_flag.template create<IncludeAscii>() +
+ lexy::dsl::while_(utf8_check | ascii_values) +
+ lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8> + lexy::dsl::eof;
+ }();
+
+ // The production always yields `true` when the rule matches.
+ static constexpr auto value = lexy::constant(true);
+ };
+
+ // Explicit instantiation declarations: including translation units do not instantiate these two
+ // specializations implicitly; the definitions are provided in Detect.cpp.
+ extern template struct DetectUtf8<true>;
+ extern template struct DetectUtf8<false>;
+
+ /// Returns true when `input` matches the DetectAscii production.
+ /// @param input Any lexy input accepted by lexy::match.
+ template<typename Input>
+ constexpr bool is_ascii(const Input& input) {
+ return lexy::match<DetectAscii>(input);
+ }
+
+ /// Returns true when `input` matches DetectUtf8<false>, i.e. it passes the UTF-8 scan and
+ /// contains at least one multi-byte sequence (pure ASCII input is rejected).
+ /// @param input Any lexy input accepted by lexy::match.
+ template<typename Input>
+ constexpr bool is_utf8_no_ascii(const Input& input) {
+ return lexy::match<DetectUtf8<false>>(input);
+ }
+
+ /// Returns true when `input` matches DetectUtf8<true>, i.e. it passes the UTF-8 scan;
+ /// pure ASCII input is accepted as well.
+ /// @param input Any lexy input accepted by lexy::match.
+ template<typename Input>
+ constexpr bool is_utf8(const Input& input) {
+ return lexy::match<DetectUtf8<true>>(input);
+ }
+
+ struct DetectorData {
+ // Character classes for the ASCII half (bytes 0x00-0x7F), 16 entries per row, used as the
+ // `lower` table of the Windows-1252 ByteScore. Digits map to 100 (ASCII_DIGIT), 'a'-'z' map to
+ // 1-26, 'A'-'Z' map to the same class with the 0x80 bit set (129-154), everything else is 0.
+ static constexpr std::array latin_ascii = std::to_array<ubyte>({
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, //
+ 0, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 0, 0, 0, 0, 0, //
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, //
+ });
+
+ // Character classes for the ASCII half (bytes 0x00-0x7F), 16 entries per row, used as the
+ // `lower` table of the Windows-1251 ByteScore. Digits map to 100 (ASCII_DIGIT), every lowercase
+ // ASCII letter maps to class 1, every uppercase ASCII letter to 129 (class 1 with the 0x80 bit
+ // set), everything else is 0.
+ static constexpr std::array non_latin_ascii = std::to_array<ubyte>({
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, //
+ 0, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, //
+ 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 0, 0, 0, 0, 0, //
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, //
+ });
+
+ // Character classes for the high half (bytes 0x80-0xFF) of Windows-1251, 16 entries per row,
+ // used as the `upper` table of the Windows-1251 ByteScore (indexed with `byte & 0x7F`).
+ // NOTE(review): 255 presumably marks a byte that is unassigned in the encoding — confirm in Detect.cpp.
+ static constexpr std::array windows_1251 = std::to_array<ubyte>({
+ 131, 130, 0, 2, 0, 0, 0, 0, 0, 0, 132, 0, 133, 130, 134, 135, //
+ 3, 0, 0, 0, 0, 0, 0, 0, 255, 0, 4, 0, 5, 2, 6, 7, //
+ 0, 136, 8, 140, 47, 130, 46, 47, 138, 49, 139, 49, 50, 46, 48, 141, //
+ 49, 50, 137, 9, 2, 49, 48, 46, 10, 47, 11, 48, 12, 130, 2, 13, //
+ 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, //
+ 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, //
+ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, //
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, //
+ });
+
+ // Character classes for the high half (bytes 0x80-0xFF) of Windows-1252, 16 entries per row,
+ // used as the `upper` table of the Windows-1252 ByteScore (indexed with `byte & 0x7F`).
+ // NOTE(review): 255 presumably marks a byte that is unassigned in the encoding — confirm in Detect.cpp.
+ static constexpr std::array windows_1252 = std::to_array<ubyte>({
+ 0, 255, 0, 60, 0, 0, 0, 0, 0, 0, 156, 0, 157, 255, 185, 255, //
+ 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 29, 255, 57, 186, //
+ 0, 62, 60, 60, 60, 60, 59, 60, 60, 62, 60, 59, 63, 59, 61, 60, //
+ 62, 63, 61, 61, 60, 62, 61, 59, 60, 61, 60, 59, 62, 62, 62, 62, //
+ 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, //
+ 188, 174, 175, 176, 177, 178, 179, 63, 180, 181, 182, 183, 184, 188, 188, 27, //
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, //
+ 60, 46, 47, 48, 49, 50, 51, 63, 52, 53, 54, 55, 56, 60, 60, 58, //
+ });
+
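+ // Pair-score table used as the `probabilities` of the Windows-1251 ByteScore and addressed through
+ // ByteScore::compute_index: each row is the current character class, each column the previous one
+ // (see the labels in the row/column comments). The first 2 rows (ASCII classes) only hold columns
+ // for the 44 non-ASCII classes; the remaining rows hold all 46 columns. An entry of 255 is scored
+ // as implausible by ByteScore::score.
+ // clang-format off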
+ static constexpr std::array cyrillic = std::to_array<ubyte>({
+ 0, 0, 0, 0, 1, 0, 16, 38, 0, 2, 5, 10,121, 4, 20, 25, 26, 53, 9, 5, 61, 23, 20, 26, 15, 95, 60, 2, 26, 15, 25, 29, 0, 14, 6, 6, 25, 1, 0, 27, 25, 8, 5, 39, // ,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a,
+ 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, // ѓ,
+ 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ђ,
+ 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 4, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // љ,
+ 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255, 5, 0, 0, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // њ,
+ 0, 0,255, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ћ,
+ 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // џ,
+ 7, 0, 0,255,255,255,255,255, 0, 1, 0,255,255,255, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 1, 0, 0, 0, 1, // ў,
+ 12, 0, 0,255,255, 0,255,255, 0, 2, 0, 0, 0, 0, 2, 3, 15, 5, 5, 0, 0, 4, 0, 0, 21, 15, 10, 17, 0, 6, 14, 4, 6, 0, 3, 1, 8, 1, 0, 0, 0, 2, 0, 0, 0, 0, // і,
+ 0, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ё,
+ 6, 0, 0,255,255,255,255,255, 0, 0,255, 5,255, 0, 1, 7, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // є,
+ 12, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 1, 0, 0, 0, 2, 0, 0, 20,255, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, // ј,
+ 9, 0, 0,255,255,255,255,255,255, 5,255, 0, 0, 13, 3, 3, 0, 4, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 1, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ї,
+ 32, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 28, 0, 23, 22, 26, 22, 19, 0, 3, 12, 5, 0, 44, 38, 18, 58, 1, 21, 44, 17, 54, 1, 2, 28, 5, 8, 3, 1, 9, 0, 12, 0, 0, 0, // а,
+ 40, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7, 0, 0, 0, 1, 7, 0, 1, 1, 0, 0, 7, 4, 1, 9, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // б,
+ 31, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 3, 0, 0, 19, 0, 0, 1, 1, 6, 0, 2, 6, 0, 1, 0, 1, 0, 32, 0, 2, 2, 23, 9, 0, 0, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, // в,
+ 23, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 1, 20, 0, 0, 1, 0, 9, 0, 0, 9, 7, 0, 5, 2, 18, 11, 0, 8, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 13, 0, 3, // г,
+ 26, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 2, 0, 2, 19, 0, 1, 5, 0, 13, 2, 2, 3, 2, 0, 6, 1, 12, 30, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, // д,
+ 12, 0, 0, 1, 4, 5, 0, 0, 0, 0, 0, 0, 24, 1, 5, 7, 11, 3, 12, 1, 6, 6, 11, 0, 3, 15, 14, 14, 4, 8, 25, 14, 29, 0, 1, 1, 4, 8, 8, 2, 0, 3, 1, 0, 0, 0, // е,
+ 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 3, 2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ж,
+ 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 6, 0, 0, 0, 11, 8, 0, 0, 8, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, // з,
+ 24, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 1, 0, 1, 10, 16, 21, 22, 0, 6, 5, 6, 1, 15, 15, 8, 38, 2, 4, 27, 9, 15, 0, 3, 8, 12, 7, 6, 1, 0, 0, 0, 0, 0, 0, // и,
+ 6, 0, 0, 0,255,255,255,255, 0, 7, 0, 0,255, 4, 21, 0, 0, 0, 0, 5, 0, 0, 39, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0, // й,
+ 54, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 10, 0, 1, 0, 1, 11, 0, 0, 12, 0, 1, 2, 0, 4, 8, 0, 2, 23, 2, 4, 0, 2, 3, 3, 8, 0, 0, 3, 16, 1, 4, 3, // к,
+ 12, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 4, 29, 12, 4, 5, 2, 18, 0, 0, 17, 4, 5, 11, 0, 0, 21, 2, 3, 4, 1, 15, 1, 0, 0, 0, 0, 0, 4, 3, 2, 12, 0, 2, // л,
+ 23, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 17, 1, 0, 0, 0, 7, 0, 1, 13, 2, 0, 0, 0, 0, 13, 0, 2, 4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 4, 2, 4, 1, 1, // м,
+ 42, 0, 0, 0, 0, 0, 0, 0, 4, 12, 6, 7, 1, 7, 76, 0, 22, 1, 4, 27, 1, 3, 34, 30, 0, 7, 1, 13, 24, 1, 3, 5, 3, 4, 0, 1, 0, 4, 1, 0, 2, 18, 7, 16, 0, 4, // н,
+ 37, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 10, 27, 22, 15, 1, 2, 3, 7, 5, 32, 11, 7, 38, 8, 21, 24, 11, 23, 0, 2, 10, 2, 2, 3, 2, 0, 0, 1, 0, 0, 0, // о,
+ 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 2, 0, 1, 2, 4, 0, 0, 2, 0, 6, 0, 0, 5, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // п,
+ 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 8, 0, 5, 47, 4, 6, 6, 5, 23, 0, 0, 5, 2, 6, 0, 0, 0, 23, 22, 0, 1, 14, 9, 1, 0, 1, 0, 0, 0, 7, 2, 8, 16, 0, 3, // р,
+ 53, 0, 0, 0, 0, 0, 0, 0, 4, 9, 2, 0, 1, 2, 21, 1, 4, 1, 2, 11, 0, 0, 12, 2, 4, 7, 1, 13, 15, 1, 4, 6, 3, 6, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 0, 1, // с,
+ 28, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 1, 0, 1, 32, 0, 1, 3, 0, 12, 0, 1, 22, 1, 4, 7, 1, 6, 23, 0, 14, 41, 14, 3, 0, 1, 1, 1, 21, 0, 2, 2, 6, 2, 1, 4, // т,
+ 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 2, 4, 2, 4, 6, 3, 0, 2, 0, 0, 6, 5, 6, 3, 0, 3, 7, 4, 7, 18, 1, 6, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, // у,
+ 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ф,
+ 41, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 30, 0, 2, 0, 0, 11, 0, 0, 5, 1, 14, 3, 0, 3, 6, 0, 7, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 4, 3, 5, 0, 0, // х,
+ 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 7, 1, 0, 1, 0, 2, 1, 0, 0, 9, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, // ц,
+ 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 1, 5, 0, 2, 0, 0, 6, 0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 1, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, // ч,
+ 12, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 17, 0, 0, 1, 0, 2, 0, 0, 26, 0, 0, 0, 0, 0, 22, 2, 6, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, // ш,
+ 2, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, // щ,
+ 0, 0,255,255,255,255, 0,255, 0, 0, 0,255,255,255, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 11, 0, 1, 0, 0, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ъ,
+ 1, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 3, 11, 0, 4, 0, 2, 1, 0, 0, 0, 3, 1, 16, 0, 0, 22, 2, 10, 0, 0, 0, 8, 6, 3, 0, 0, 0, 0, 0, 0, 0, // ы,
+ 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 7, 3, 0, 1, 13, 7, 7, 0, 35, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, // ь,
+ 10, 0, 0,255,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 1, 1, 10, 11, 0, 2, 2, 0, 0, 0, 9, 3, 9, 0, 0, 7, 6, 9, 0, 0, 8, 3, 2, 1, 0, 0, 0, 0, 17, 0, 0, // э,
+ 14, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ю,
+ 5, 0, 0,255,255,255,255,255, 0, 9, 0, 0,255, 0, 11, 0, 3, 0, 0, 0, 0, 2, 24, 0, 0, 5, 2, 14, 1, 0, 2, 3, 1, 0, 0, 1, 3, 0, 0, 0, 0, 16, 1, 0, 0, 0, // я,
+ // , a, ѓ, ђ, љ, њ, ћ, џ, ў, і, ё, є, ј, ї, а, б, в, г, д, е, ж, з, и, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ъ, ы, ь, э, ю, я,
+ });
+ // clang-format on
+
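+ // Pair-score table used as the `probabilities` of the Windows-1252 ByteScore and addressed through
+ // ByteScore::compute_index: each row is the current character class, each column the previous one
+ // (see the labels in the row/column comments). The first 27 rows (ASCII classes) only hold columns
+ // for the 32 non-ASCII classes; the remaining rows hold all 59 columns. An entry of 255 is scored
+ // as implausible by ByteScore::score.
+ // clang-format off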
+ static constexpr std::array western = std::to_array<ubyte>({
+ 18, 3, 0,254, 74, 0, 5,254,254, 2, 25,254,149, 4,254, 66,148,254, 0,254,122,238, 8, 1, 20, 13,254, 35, 20, 3, 1, 0, // ,
+ 0, 3, 0, 0, 0, 0, 0, 5, 2, 0, 86, 9, 76, 0, 0, 0,241, 0, 0, 49, 0, 0, 0, 0, 11, 2, 0, 34, 0, 1, 2, 0, // a,
+ 19, 0, 0, 5, 5, 0, 0, 8, 13, 5, 0, 34, 22, 0, 0, 0, 4, 0, 0, 0, 6, 1, 3, 3, 42, 37, 8, 8, 0, 67, 0, 0, // b,
+ 0, 0, 0, 9, 6, 1, 0, 22, 10, 1, 0, 19, 54, 1, 0, 1, 18, 3, 1, 2, 40, 7, 0, 0, 6, 0, 3, 5, 1, 34, 0, 0, // c,
+ 0, 0, 0, 5, 5, 0, 0, 12, 45, 16, 1, 6, 42, 0, 13, 3, 10, 0, 2, 0, 66, 11, 5, 8, 33,104, 3, 4, 0, 19, 0, 0, // d,
+ 63, 5, 0, 0, 0, 0, 2, 33, 15, 1, 3, 0, 87, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 49, 1, 11, 0, 3, 0, 9, 1, 0, // e,
+ 0, 0, 0, 8, 8, 0, 0, 10, 2, 7, 0,162, 23, 0, 13, 0, 4, 0, 0, 0, 1, 3, 0, 0, 15, 4, 0, 0, 0, 4, 0, 0, // f,
+ 1, 0, 0, 14, 16, 24, 0, 29, 11, 41, 0, 13, 86, 0, 14, 9, 3, 0, 0, 0, 20, 8, 7, 7, 13, 37, 14, 0, 0, 12, 0, 0, // g,
+ 1, 0, 0, 0, 0, 0, 0, 47, 2, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 29, 20, 0, 0, 0, 0, 45, 0, 0, // h,
+ 5, 4, 0,166,120, 0, 0,144, 0, 2, 3, 88,254, 0, 0, 0, 0, 0, 0, 3, 28,107, 0,112, 8, 2, 44, 32, 0, 3, 3, 0, // i,
+ 0, 0, 0, 0, 0, 0, 0, 39, 9, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 16, 18, 44, 0, 0, 0, 0, 0,255, // j,
+ 0, 2, 0, 0, 1, 0, 0, 48, 31, 32, 1, 60, 1, 0, 4, 0, 1, 0, 0, 0, 1, 3, 0, 2, 20, 47, 0, 0, 0, 20, 0, 0, // k,
+ 4, 0, 0, 12, 16, 0, 0, 54, 40, 48, 0, 64, 36, 0, 39, 6, 12, 3, 0, 0, 27, 9, 3, 24, 42, 33, 2, 9, 7, 77, 0, 0, // l,
+ 0, 0, 0, 14, 5, 4, 0, 60, 11, 4, 3, 48, 30, 7, 28, 1, 10, 1, 0, 0, 24, 41, 3, 3, 19, 24, 1, 8, 2, 36, 0, 0, // m,
+ 1, 1, 0, 24, 91, 16, 0,132, 62, 73, 1, 56, 71, 33, 78, 7, 35, 2, 3, 0, 94,254, 10, 21, 33, 38, 24, 21, 1, 61, 0, 0, // n,
+ 0, 1, 0, 0, 0, 0,254, 6, 0, 1, 27, 0, 13, 0, 0, 84,127, 0, 0, 62, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, // o,
+ 0, 0, 0, 5, 2, 0, 0, 9, 15, 0, 0, 4, 34, 0, 6, 0, 6, 0, 0, 0, 20, 12, 9, 28, 10, 22, 0, 3, 0, 7, 0, 0, // p,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 33, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // q,
+ 0, 0, 0, 83, 62, 1, 0,198,139,125, 0,229, 94, 54,190, 38, 18, 1, 0, 0,176, 24, 16, 29,193,181, 13, 13, 2,131, 0, 0, // r,
+ 1, 0, 0, 41, 34, 0, 0, 41, 24, 42, 0, 68,113, 15,159, 6, 43, 19, 4, 58, 14, 18, 1, 4, 48, 42, 4, 12, 9, 20, 0, 0, // s,
+ 7, 1, 0, 14, 20, 8, 0, 56, 37, 31, 0,104, 67, 14,113, 3, 50, 9, 5, 0, 89, 7, 19, 22, 13, 14, 40, 12, 15, 18, 0, 0, // t,
+ 0, 1, 5, 1, 2, 0, 0, 30, 0, 0, 1, 15, 2, 0, 1, 0, 1, 0, 0, 2, 4, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, // u,
+ 0, 2, 0, 1, 6, 0, 0, 29, 33, 13, 0, 19, 46, 0, 15, 0, 7, 0, 1, 31, 2, 2, 3, 1, 32, 27, 0, 0, 1, 1, 0, 0, // v,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,255, // w,
+ 0, 0, 0, 1, 16, 0, 0, 23, 0, 0, 0, 3, 14, 0, 0, 0, 2, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, // x,
+ 0, 0, 0, 0, 0, 0, 0, 58, 8, 0, 0, 1, 1, 62, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 82, 0, 0, 0, 0, 0,255, // y,
+ 0, 0, 0, 0, 2, 0, 0, 0, 14, 0, 0, 7, 3, 0, 6, 0, 3, 5, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, // z,
+ 0, 29, 0, 0, 0, 15, 0, 0, 0, 11, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 4, 0, 0,255,255, 0,255, 0,255, 0, 0,255,255,255, 0, 0, 0, 8, 0,255, 0, 0, 2, 0, 0, // ß,
+ 6, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 1, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // š,
+ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255, // œ,
+ 107, 0, 22, 16, 18, 14, 6, 24, 46, 15, 2, 0, 42, 18, 17, 0, 36, 0, 34, 4,254, 1, 2, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, // à,
+ 41, 0, 10, 8, 21, 34, 5, 5, 60, 18, 5, 1, 29, 42, 26, 2, 16, 0, 27, 9, 43, 28, 7, 0, 0, 1, 4, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0,255, // á,
+ 24, 0, 1, 2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 5, 0, 1, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 2, 0,255, 0,255, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255, 0,255, // â,
+ 0, 0, 0, 1, 2, 3, 0, 1, 2, 12, 0, 0, 1, 7, 29, 4, 1,255, 11, 66, 11, 0, 1, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255,127,255,255,255,255,255, 0, 0,255, 0, 0,255,255, 0,255,255,255,255,255,255,255,255, // ã,
+ 134, 1, 11, 0, 25, 6, 15, 11, 61, 24,123, 95,114, 68, 53, 1, 49, 0, 60, 98,198, 0, 88, 29, 0, 6, 12, 0, 0,255, 0,255, 0, 0,118, 0,255, 0,255, 0,255, 0,255, 0,255,255, 0,255,255, 0,255, 2,255,255,255, 0, 0, 0,255, // ä,
+ 156, 0, 12, 14, 19, 3, 12, 47, 17, 3, 12, 5, 30, 47, 22, 0,205, 0,184, 70, 19, 0, 22, 8, 0, 6, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // å,
+ 26, 0, 7, 0, 4, 0, 23, 8, 15, 0, 18, 19, 56, 23, 24, 0, 9, 0, 82, 37, 24, 0, 71, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255, 0,255, // æ,
+ 17,112, 0, 2, 0, 15, 0, 0, 0, 35, 0, 0, 2, 0, 59, 9, 1, 0, 36, 0, 0, 8, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // ç,
+ 254, 0, 9, 14, 20, 0, 15, 6, 70,144, 14, 45, 47, 92, 16, 3,123, 0, 38, 23,115, 52, 22, 42, 2, 80, 19,255, 0,255, 0, 0,255,255, 0,255,255, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 1,255,255, // è,
+ 152, 2, 19, 24, 85, 0, 29, 23, 26, 25, 2, 9, 43, 60, 62, 1, 32, 0,122, 45,169, 15, 13, 30, 7, 4, 8, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, // é,
+ 5, 0, 0, 3, 7, 0, 0, 10, 2, 3, 0, 26, 6, 6, 20, 1, 2, 0, 20, 1, 11, 5, 5, 2, 0, 0, 1,255, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0,255, // ê,
+ 36, 2, 23, 15, 36,143, 5, 23, 52, 52, 66, 48, 92, 57,216, 10,125, 35, 89, 58,254, 9, 24, 14, 0, 0, 8,255, 0,255, 0,255,255,255, 0, 0,255, 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0, 0,255, // ë,
+ 12, 0, 1, 4, 6, 0, 3, 21, 10, 0, 0, 0, 18, 8, 4, 0, 1, 0, 65, 35, 8, 3, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0,255, // ì,
+ 40, 72, 7, 10, 16, 2, 23, 10, 34, 0, 0, 1, 34, 15, 21, 1, 3, 0,203, 28, 58, 23, 11, 0, 10, 0, 2, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0,255,255, 0,255, 2, 0,255, // í,
+ 6, 5, 1, 9, 5, 0, 0, 0, 22, 0, 9, 8, 8, 6, 9, 1, 10, 0, 20, 6,182, 0, 13, 0, 0, 24, 1,255, 0,255,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255,255,255, // î,
+ 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, // ï,
+ 0,254, 0, 0, 0, 26, 0, 0, 0, 61, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ñ,
+ 20, 0, 56, 43, 8,162, 14, 3, 23, 19, 2,118, 31, 26, 46, 0, 20, 0, 23, 6, 24, 19, 6, 21, 5, 27, 63,255, 0,255, 0, 0,255,255,255,255,255, 3, 0,255,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, // ò,
+ 67, 0, 12, 15, 9, 7, 8, 66, 13,254, 3, 23, 14, 16, 16, 0, 8, 0, 29, 11, 26, 0, 5, 5, 1, 10, 13,255, 0,255,255, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0, 0,255, 0, 1, 0, 0, 0, 0,255,255,255, 0,255,255, 0,255, // ó,
+ 18, 3, 3, 12, 1, 0, 2, 0, 7, 0, 1, 0, 2, 2, 8, 0, 6, 0, 6, 7, 4, 0, 2, 0, 0, 0, 1,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255,255,255, // ô,
+ 29, 2, 0, 0, 0, 0, 0, 0, 5, 2, 22, 30, 25, 38, 19, 0, 33,255, 4, 39, 24, 0, 88, 0, 0, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 36,255,255,255,255,255, 0,255,255, 0,255, 0, 0, 6, 0,255,255,255, 0, 0, 0,255, // õ,
+ 44, 0, 33, 0, 25, 0,142, 5, 46, 10, 25, 32, 26, 13, 6, 0, 3, 0, 30, 8, 35, 0, 25, 5, 0, 44, 7, 0, 0,255,255, 0,255,255, 73, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, 39, 0,255,255,255, 0, 0, 0, // ö,
+ 52, 0, 21, 0, 57, 0,119, 12, 47, 3, 59, 33, 45, 15, 12, 0, 3, 0, 52, 82, 49, 1, 11, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ø,
+ 25, 0, 4, 3, 53, 0, 0, 2, 12, 72, 0, 0, 30, 0, 0,254, 0, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, // ù,
+ 19, 2, 1, 7, 9, 1, 12, 5, 9, 41, 1, 0, 10, 7, 9, 0, 8, 0, 12, 28, 8, 0, 0, 0, 0, 1, 0,255, 0,255,255, 0,255,255,255,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255, // ú,
+ 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 45, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0,255, 0, 0,255, 0, // û,
+ 95, 2, 19, 0, 6, 2,121, 9, 15, 1, 5, 44, 18, 26, 7, 0, 11, 2, 68, 49, 20, 0, 2, 17, 0, 0, 6, 0, 0,255, 0,255,255,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255, 0, 0, 0, 31, 0, 0, // ü,
+ 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // ž,
+ 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, 0,255, 0,255,255,255, 0, 0, 0, // ÿ,
+ // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, œ, à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ž, ÿ,
+
+ });
+ // clang-format on
+ };
+
+ // Number of character classes per script, split into ASCII and non-ASCII classes. These feed the
+ // `ascii` / `non_ascii` members of the ByteScore entries and therefore fix the row/column layout
+ // of the DetectorData::cyrillic and DetectorData::western tables.
+ namespace class_size {
+ constexpr std::size_t cyrillic_ascii = 2;
+ constexpr std::size_t cyrillic_non_ascii = 44;
+ constexpr std::size_t western_ascii = 27;
+ constexpr std::size_t western_non_ascii = 32;
+ }
+
+ // Class value assigned to the ASCII digits '0'-'9' in the `latin_ascii` / `non_latin_ascii` tables.
+ constexpr std::size_t ASCII_DIGIT = 100;
+
+ /// Classification and pair-scoring data for one single-byte encoding.
+ ///
+ /// `lower` / `upper` map the low (0x00-0x7F) and high (0x80-0xFF) byte halves to character classes,
+ /// `probabilities` holds the pair scores addressed by compute_index(), and `ascii` / `non_ascii`
+ /// are the numbers of ASCII and non-ASCII classes stored in that table.
+ struct ByteScore {
+     const Encoding encoding;
+     const std::array<ubyte, 128>& lower;
+     const std::array<ubyte, 128>& upper;
+     const std::span<const ubyte> probabilities;
+     const std::size_t ascii;
+     const std::size_t non_ascii;
+
+     /// Maps a class pair to its offset inside the flattened pair-score table.
+     ///
+     /// @param x Column class (score() passes the previous class).
+     /// @param y Row class (score() passes the current class).
+     /// @param ascii_classes Number of ASCII classes.
+     /// @param non_ascii_classes Number of non-ASCII classes.
+     /// @return The table offset, or std::nullopt when both classes are ASCII classes (such pairs are not stored).
+     static inline constexpr std::optional<std::size_t> compute_index(std::size_t x, std::size_t y, std::size_t ascii_classes, std::size_t non_ascii_classes) {
+         if (x == 0 && y == 0) {
+             return std::nullopt;
+         }
+
+         if (x < ascii_classes && y < ascii_classes) {
+             return std::nullopt;
+         }
+
+         if (y >= ascii_classes) {
+             // Rows for non-ASCII classes are full width and follow the narrower ASCII rows.
+             return (ascii_classes * non_ascii_classes) + (ascii_classes + non_ascii_classes) * (y - ascii_classes) + x;
+         }
+
+         // Rows for ASCII classes only store the non-ASCII columns.
+         return y * non_ascii_classes + x - ascii_classes;
+     }
+
+     /// Looks up the character class of `byte`: `lower` for 0x00-0x7F, `upper` for 0x80-0xFF.
+     inline constexpr cbyte classify(cbyte byte) const {
+         // Work on the unsigned value so the result does not depend on the signedness of `char`.
+         const ubyte value = static_cast<ubyte>(byte);
+         const ubyte low = static_cast<ubyte>(value & 0x7F);
+         return static_cast<cbyte>(value < 0x80 ? lower[low] : upper[low]);
+     }
+
+     /// True when `caseless_class` is a non-zero class stored in the pair table.
+     inline constexpr bool is_latin_alphabetic(cbyte caseless_class) const {
+         const std::size_t value = static_cast<ubyte>(caseless_class);
+         return value > 0 && value < (ascii + non_ascii);
+     }
+
+     /// True when `caseless_class` is greater than 1 and stored in the pair table.
+     inline constexpr bool is_non_latin_alphabetic(cbyte caseless_class) const {
+         const std::size_t value = static_cast<ubyte>(caseless_class);
+         return value > 1 && value < (ascii + non_ascii);
+     }
+
+     /// Scores the transition from `previous_class` to `current_class`.
+     ///
+     /// Pairs of stored classes are looked up in `probabilities` (255 means implausible). Classes at or
+     /// above `ascii + non_ascii` are not stored; their offset from that boundary selects one of the
+     /// plausibility rules below.
+     ///
+     /// @return A non-negative table score, 0 for neutral pairs, or the (negative) implausibility penalty.
+     inline constexpr int64_t score(cbyte current_class, cbyte previous_class) const {
+         // Signed on purpose: the penalty is negative and is returned as int64_t.
+         constexpr int64_t IMPLAUSIBILITY_PENALTY = -220;
+
+         constexpr std::size_t PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 0;
+         constexpr std::size_t IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 1;
+         constexpr std::size_t IMPLAUSIBLE_BEFORE_ALPHABETIC = 2;
+         constexpr std::size_t IMPLAUSIBLE_AFTER_ALPHABETIC = 3;
+         constexpr std::size_t PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE = 4;
+         constexpr std::size_t PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE = 5;
+
+         // Widen through ubyte so a signed `char` never turns into a huge size_t by sign extension.
+         const std::size_t current = static_cast<ubyte>(current_class);
+         const std::size_t previous = static_cast<ubyte>(previous_class);
+         const std::size_t stored_boundary = ascii + non_ascii;
+
+         if (current < stored_boundary) {
+             if (previous < stored_boundary) {
+                 // Both classes are stored: use the pair table.
+                 if (const auto index = compute_index(previous, current, ascii, non_ascii)) {
+                     const ubyte b = probabilities[*index];
+                     return b == 255 ? IMPLAUSIBILITY_PENALTY : int64_t { b };
+                 }
+                 return 0;
+             }
+
+             if (current == 0 || current == ASCII_DIGIT) {
+                 return 0;
+             }
+
+             // Only the previous class is unstored.
+             switch (previous - stored_boundary) {
+                 case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE:
+                 case IMPLAUSIBLE_AFTER_ALPHABETIC:
+                     return 0;
+                 case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE:
+                 case IMPLAUSIBLE_BEFORE_ALPHABETIC:
+                     return IMPLAUSIBILITY_PENALTY;
+                 case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE:
+                     return current < ascii ? IMPLAUSIBILITY_PENALTY : 0;
+                 case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE:
+                     return current < ascii ? 0 : IMPLAUSIBILITY_PENALTY;
+                 default:
+                     assert(previous == ASCII_DIGIT);
+                     return 0;
+             }
+         }
+
+         if (previous < stored_boundary) {
+             if (previous == 0 || previous == ASCII_DIGIT) {
+                 return 0;
+             }
+
+             // Only the current class is unstored.
+             switch (current - stored_boundary) {
+                 case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE:
+                 case IMPLAUSIBLE_BEFORE_ALPHABETIC:
+                     return 0;
+                 case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE:
+                 case IMPLAUSIBLE_AFTER_ALPHABETIC:
+                     return IMPLAUSIBILITY_PENALTY;
+                 case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE:
+                     return previous < ascii ? IMPLAUSIBILITY_PENALTY : 0;
+                 case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE:
+                     return previous < ascii ? 0 : IMPLAUSIBILITY_PENALTY;
+                 default:
+                     assert(current == ASCII_DIGIT);
+                     return 0;
+             }
+         }
+
+         // Both classes are unstored: neutral only when a digit is involved.
+         if (current == ASCII_DIGIT || previous == ASCII_DIGIT) {
+             return 0;
+         }
+
+         return IMPLAUSIBILITY_PENALTY;
+     }
+ };
+
+ // Index into `byte_scores`; the enumerator order must match the order of the entries in that array.
+ enum class ScoreIndex {
+ Windows1251,
+ Windows1252
+ };
+
+ // Scoring data for every supported single-byte encoding, ordered like ScoreIndex.
+ // Declared `inline` (not `static`) so that all translation units including this header share one
+ // object; get_byte_score() hands out references to it.
+ inline constexpr std::array byte_scores {
+     ByteScore {
+         .encoding = Encoding::Windows1251,
+         .lower = DetectorData::non_latin_ascii,
+         .upper = DetectorData::windows_1251,
+         .probabilities = DetectorData::cyrillic,
+         .ascii = class_size::cyrillic_ascii,
+         .non_ascii = class_size::cyrillic_non_ascii },
+     ByteScore {
+         .encoding = Encoding::Windows1252,
+         .lower = DetectorData::latin_ascii,
+         .upper = DetectorData::windows_1252,
+         .probabilities = DetectorData::western,
+         .ascii = class_size::western_ascii,
+         .non_ascii = class_size::western_non_ascii }
+ };
+
+ /// Returns the ByteScore entry selected by `index`.
+ /// The enumerator's underlying value is used directly as the array index (no bounds check).
+ constexpr const ByteScore& get_byte_score(ScoreIndex index) {
+ return byte_scores[static_cast<std::underlying_type_t<ScoreIndex>>(index)];
+ }
+
+ /// Stateless candidate for UTF-8 input.
+ struct Utf8Canidate {
+ /// Scores `buffer`; defined out of line.
+ /// NOTE(review): presumably returns std::nullopt when the buffer is not UTF-8 — confirm in Detect.cpp.
+ std::optional<int64_t> read(const std::span<const cbyte>& buffer);
+ };
+
+ /// Stateless candidate for pure ASCII input.
+ struct AsciiCanidate {
+ /// Scores `buffer`; defined out of line.
+ /// NOTE(review): presumably returns std::nullopt when the buffer is not ASCII — confirm in Detect.cpp.
+ std::optional<int64_t> read(const std::span<const cbyte>& buffer);
+ };
+
+ /// Candidate for a cased non-Latin single-byte encoding (instantiated for Windows-1251 by Detector).
+ /// Holds the scanner state that read() carries between calls.
+ struct NonLatinCasedCanidate {
+ // Capitalisation pattern of the word currently being scanned.
+ enum class CaseState {
+ Space,
+ Upper,
+ Lower,
+ UpperLower,
+ AllCaps,
+ Mix,
+ };
+
+ // Classification and pair-score tables of the encoding under test.
+ const ByteScore& score_data;
+ // NOTE(review): presumably the class of the previously read byte — confirm in Detect.cpp.
+ cbyte prev {};
+ CaseState case_state = CaseState::Space;
+ bool prev_ascii = true;
+ uint64_t current_word_len {};
+ // Canidate::score() reports no score while this is below 2.
+ uint64_t longest_word {};
+ bool ibm866 = false;
+ bool prev_was_a0 = false;
+
+ /// Scores `buffer`; defined out of line.
+ std::optional<int64_t> read(const std::span<const cbyte>& buffer);
+ };
+
+ /// Candidate for a Latin-script single-byte encoding (instantiated for Windows-1252 by Detector).
+ /// Holds the scanner state that read() carries between calls.
+ struct LatinCanidate {
+     /// Capitalisation pattern of the word currently being scanned.
+     enum class CaseState {
+         Space,
+         Upper,
+         Lower,
+         AllCaps,
+     };
+
+     /// State machine for recognising ordinal indicators (bytes 0xAA / 0xBA) and the copyright sign.
+     enum class OrdinalState {
+         Other,
+         Space,
+         PeriodAfterN,
+         OrdinalExpectingSpace,
+         OrdinalExpectingSpaceUndoImplausibility,
+         OrdinalExpectingSpaceOrDigit,
+         OrdinalExpectingSpaceOrDigitUndoImplausibily,
+         UpperN,
+         LowerN,
+         FeminineAbbreviationStartLetter,
+         Digit,
+         Roman,
+         Copyright,
+     };
+
+     const ByteScore& score_data;
+     cbyte prev {};
+     CaseState case_state = CaseState::Space;
+     uint32_t prev_non_ascii {};
+     OrdinalState ordinal_state = OrdinalState::Space; // Used only when `windows1252 == true`
+     bool windows1252;
+
+     /// Binds the candidate to `data` and records whether that data describes Windows-1252.
+     constexpr LatinCanidate(const ByteScore& data)
+         : score_data(data),
+           windows1252(data.encoding == Encoding::Windows1252) {}
+
+     /// Scores `buffer`; defined out of line.
+     std::optional<int64_t> read(const std::span<const cbyte>& buffer);
+ };
+
+ // Closed set of candidate implementations held by a Canidate.
+ using InnerCanidate = std::variant<NonLatinCasedCanidate, LatinCanidate, Utf8Canidate, AsciiCanidate>;
+
+ // Helper that merges several callables into one overload set, for use with std::visit.
+ template<class... Ts>
+ struct overloaded : Ts... {
+ using Ts::operator()...;
+ };
+
+ // Deduction guide so `overloaded { f, g, ... }` deduces its template arguments.
+ template<class... Ts>
+ overloaded(Ts...) -> overloaded<Ts...>;
+
+ /// One encoding hypothesis: a concrete candidate implementation plus its accumulated score.
+ /// `score_value` becomes empty once the candidate has rejected some input.
+ struct Canidate {
+     InnerCanidate inner;
+     std::optional<int64_t> score_value;
+
+     /// Builds a candidate around a default-constructed `CanidateT`, starting with a score of 0.
+     template<typename CanidateT>
+     static constexpr Canidate create_canidate() {
+         return Canidate { .inner = CanidateT {}, .score_value = 0 };
+     }
+
+     /// Builds a candidate around a `CanidateT` bound to `score`, starting with a score of 0.
+     template<typename CanidateT>
+     static constexpr Canidate create_canidate(const ByteScore& score) {
+         return Canidate { .inner = CanidateT { score }, .score_value = 0 };
+     }
+
+     static constexpr Canidate new_utf8() {
+         return create_canidate<Utf8Canidate>();
+     }
+
+     static constexpr Canidate new_ascii() {
+         return create_canidate<AsciiCanidate>();
+     }
+
+     static constexpr Canidate new_latin(ScoreIndex index) {
+         return create_canidate<LatinCanidate>(get_byte_score(index));
+     }
+
+     static constexpr Canidate new_non_latin_cased(ScoreIndex index) {
+         return create_canidate<NonLatinCasedCanidate>(get_byte_score(index));
+     }
+
+     /// Feeds `buffer` to the candidate and returns the accumulated score.
+     ///
+     /// @param buffer Bytes to score.
+     /// @param encoding Currently unused.
+     /// @param expectation_is_valid Currently unused.
+     /// @return The running score, or std::nullopt when the candidate has been rejected or when a
+     ///         non-Latin cased candidate has not yet seen a word of at least two characters.
+     constexpr std::optional<int64_t> score(const std::span<const cbyte>& buffer, [[maybe_unused]] std::size_t encoding, [[maybe_unused]] bool expectation_is_valid) {
+         // A rejected candidate stays rejected and is not fed any further.
+         if (score_value.has_value()) {
+             const std::optional<int64_t> delta = std::visit(
+                 [&buffer](auto& candidate) {
+                     return candidate.read(buffer);
+                 },
+                 inner);
+
+             if (delta.has_value()) {
+                 score_value = *score_value + *delta;
+             } else {
+                 score_value.reset();
+             }
+         }
+
+         const auto* non_latin = std::get_if<NonLatinCasedCanidate>(&inner);
+         if (non_latin != nullptr && non_latin->longest_word < 2) {
+             return std::nullopt;
+         }
+         return score_value;
+     }
+
+     /// Returns the encoding this candidate stands for.
+     constexpr Encoding encoding() const {
+         return std::visit(
+             [](const auto& candidate) -> Encoding {
+                 using CandidateType = std::decay_t<decltype(candidate)>;
+                 if constexpr (std::is_same_v<CandidateType, Utf8Canidate>) {
+                     return Encoding::Utf8;
+                 } else if constexpr (std::is_same_v<CandidateType, AsciiCanidate>) {
+                     return Encoding::Ascii;
+                 } else {
+                     // Table-driven candidates carry their encoding in the score data.
+                     return candidate.score_data.encoding;
+                 }
+             },
+             inner);
+     }
+ };
+
+ /// Runs every candidate over a buffer and picks the most plausible encoding.
+ /// Candidates keep their state, so repeated calls on the same Detector accumulate scores.
+ struct Detector {
+     std::vector<Canidate> canidates {
+         Canidate::new_ascii(),
+         Canidate::new_utf8(),
+         Canidate::new_latin(ScoreIndex::Windows1252),
+         Canidate::new_non_latin_cased(ScoreIndex::Windows1251),
+     };
+
+     // Encoding reported when no candidate produces a positive score.
+     Encoding default_fallback = Encoding::Unknown;
+
+     /// Scores `buffer` with every candidate.
+     ///
+     /// @param buffer Bytes to examine.
+     /// @param allow_utf8 When false, the UTF-8 candidate is skipped.
+     /// @return The detected encoding (or `default_fallback`) and a confidence flag. ASCII and UTF-8
+     ///         win immediately as soon as their candidate yields a score.
+     constexpr std::pair<Encoding, bool> detect_assess(std::span<const cbyte> buffer, bool allow_utf8 = true) {
+         int64_t max = 0;
+         Encoding encoding = default_fallback; // Presumes fallback, defaults to Unknown encoding if unknown (which skips conversion)
+
+         // Index loop so the value handed to score() is always the candidate's position in the vector.
+         for (std::size_t i = 0; i < canidates.size(); ++i) {
+             Canidate& canidate = canidates[i];
+             const Encoding candidate_encoding = canidate.encoding();
+
+             if (!allow_utf8 && candidate_encoding == Encoding::Utf8) {
+                 continue;
+             }
+
+             const std::optional<int64_t> score = canidate.score(buffer, i, false);
+             if (!score) {
+                 continue;
+             }
+
+             if (candidate_encoding == Encoding::Ascii || candidate_encoding == Encoding::Utf8) {
+                 return { candidate_encoding, true };
+             }
+
+             if (*score > max) {
+                 max = *score;
+                 encoding = candidate_encoding;
+             }
+         }
+
+         // NOTE(review): `max` starts at 0 and only grows, so this flag is always true.
+         return { encoding, max >= 0 };
+     }
+
+     /// Convenience wrapper returning only the encoding from detect_assess().
+     constexpr Encoding detect(std::span<const cbyte> buffer, bool allow_utf8 = true) {
+         return detect_assess(buffer, allow_utf8).first;
+     }
+
+     /// Overload for lexy buffers: views the buffer's bytes and forwards to the span overload.
+     template<typename BufferEncoding>
+     std::pair<Encoding, bool> detect_assess(const lexy::buffer<BufferEncoding, void>& buffer, bool allow_utf8 = true) {
+         auto span = std::span<const cbyte>(buffer.data(), buffer.size());
+         // Forward allow_utf8; dropping it would silently re-enable the UTF-8 candidate.
+         return detect_assess(span, allow_utf8);
+     }
+
+     /// Convenience wrapper returning only the encoding for a lexy buffer.
+     template<typename BufferEncoding>
+     constexpr Encoding detect(const lexy::buffer<BufferEncoding, void>& buffer, bool allow_utf8 = true) {
+         return detect_assess(buffer, allow_utf8).first;
+     }
+ };
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp
deleted file mode 100644
index e9d0350..0000000
--- a/src/openvic-dataloader/detail/DetectUtf8.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include <lexy/action/match.hpp>
-#include <lexy/dsl.hpp>
-
-#include "detail/dsl.hpp"
-
-namespace ovdl::detail {
- namespace detect_utf8 {
-
- template<bool INCLUDE_ASCII>
- struct DetectUtf8 {
- struct not_utf8 {
- static constexpr auto name = "not utf8";
- };
-
- static constexpr auto rule = [] {
- constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>;
-
- // & 0b10000000 == 0b00000000
- constexpr auto ascii_values = dsl::make_range<0b00000000, 0b01111111>();
- // & 0b11100000 == 0b11000000
- constexpr auto two_byte = dsl::make_range<0b11000000, 0b11011111>();
- // & 0b11110000 == 0b11100000
- constexpr auto three_byte = dsl::make_range<0b11100000, 0b11101111>();
- // & 0b11111000 == 0b11110000
- constexpr auto four_byte = dsl::make_range<0b11110000, 0b11110111>();
- // & 0b11000000 == 0b10000000
- constexpr auto check_bytes = dsl::make_range<0b10000000, 0b10111111>();
-
- constexpr auto utf8_check =
- ((four_byte >> lexy::dsl::times<3>(check_bytes)) |
- (three_byte >> lexy::dsl::times<2>(check_bytes)) |
- (two_byte >> lexy::dsl::times<1>(check_bytes))) >>
- is_not_ascii_flag.set();
-
- return is_not_ascii_flag.template create<INCLUDE_ASCII>() +
- lexy::dsl::while_(utf8_check | ascii_values) +
- lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>;
- }();
- };
- }
-
- template<typename Input>
- constexpr bool is_utf8_no_ascii(const Input& input) {
- return lexy::match<detect_utf8::DetectUtf8<false>>(input);
- }
-
- template<typename Input>
- constexpr bool is_utf8(const Input& input) {
- return lexy::match<detect_utf8::DetectUtf8<true>>(input);
- }
-} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/Errors.hpp b/src/openvic-dataloader/detail/Errors.hpp
deleted file mode 100644
index fbebcc5..0000000
--- a/src/openvic-dataloader/detail/Errors.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include <string_view>
-
-#include <openvic-dataloader/ParseError.hpp>
-
-namespace ovdl::errors {
- inline const ParseError make_no_file_error(std::string_view file_path) {
- std::string message;
- if (file_path.empty()) {
- message = "File path not specified.";
- } else {
- message = "File '" + std::string(file_path) + "' was not found.";
- }
-
- return ParseError { ParseError::Type::Fatal, message, 1 };
- }
-}
-
-namespace ovdl::v2script::errors {
-
-}
-
-namespace ovdl::ovscript::errors {
-} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/InternalConcepts.hpp b/src/openvic-dataloader/detail/InternalConcepts.hpp
new file mode 100644
index 0000000..0c7913d
--- /dev/null
+++ b/src/openvic-dataloader/detail/InternalConcepts.hpp
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <concepts>
+#include <utility>
+
+#include <openvic-dataloader/NodeLocation.hpp>
+#include <openvic-dataloader/detail/Encoding.hpp>
+#include <openvic-dataloader/detail/SymbolIntern.hpp>
+
+#include <lexy/encoding.hpp>
+#include <lexy/input/buffer.hpp>
+
+#include <fmt/core.h>
+
+#include <lexy_ext/report_error.hpp>
+
+namespace ovdl::detail {
+ // Satisfied by types that name a `node_type` and can both record and look up
+ // a NodeLocation for a pointer to a const node of that type.
+ template<typename T>
+ concept IsFile =
+ requires(T t, const typename T::node_type* node, NodeLocation location) {
+ typename T::node_type;
+ { t.set_location(node, location) } -> std::same_as<void>;
+ { t.location_of(node) } -> std::same_as<NodeLocation>;
+ };
+
+ // Satisfied by syntax-tree types whose `file_type` models IsFile, whose
+ // `root_node_type` derives from `node_type`, and which expose node-location
+ // storage plus const and non-const access to the root node and the file.
+ template<typename T>
+ concept IsAst =
+ requires(
+ T t,
+ const T ct,
+ const typename T::node_type* node,
+ NodeLocation loc //
+ ) {
+ requires IsFile<typename T::file_type>;
+ typename T::root_node_type;
+ typename T::node_type;
+ requires std::derived_from<typename T::root_node_type, typename T::node_type>;
+ { t.set_location(node, loc) } -> std::same_as<void>;
+ { t.location_of(node) } -> std::same_as<NodeLocation>;
+ { t.root() } -> std::same_as<typename T::root_node_type*>;
+ { ct.root() } -> std::same_as<const typename T::root_node_type*>;
+ { t.file() } -> std::same_as<typename T::file_type&>;
+ { ct.file() } -> std::same_as<const typename T::file_type&>;
+ };
+
+ // Satisfied by logger types that provide:
+ //  - nested `error_range` and `Writer` types;
+ //  - explicit bool conversion, `errored()`, `warned()` and `get_errors()` on const objects;
+ //  - symbol interning (`intern`, `intern_cstr`, `symbol_interner`);
+ //  - one logging entry point per severity (error, warning, note, info, debug,
+ //    fixit, help), each callable with either the type's `format_str<>` or a
+ //    std::string_view and each returning `Writer`;
+ //  - an `error_callback()` whose sink finishes to a std::size_t;
+ //  - a generic `log` taking a lexy_ext::diagnostic_kind.
+ template<typename T>
+ concept IsDiagnosticLogger = requires(
+ T t,
+ const T ct,
+ const char* str,
+ std::size_t length,
+ std::string_view sv,
+ lexy_ext::diagnostic_kind diag_kind //
+ ) {
+ typename T::error_range;
+ typename T::Writer;
+ { static_cast<bool>(ct) } -> std::same_as<bool>;
+ { ct.errored() } -> std::same_as<bool>;
+ { ct.warned() } -> std::same_as<bool>;
+ { ct.get_errors() } -> std::same_as<typename T::error_range>;
+ { t.intern(str, length) } -> std::same_as<ovdl::SymbolIntern::symbol_type>;
+ { t.intern(sv) } -> std::same_as<ovdl::SymbolIntern::symbol_type>;
+ { t.intern_cstr(str, length) } -> std::same_as<const char*>;
+ { t.intern_cstr(sv) } -> std::same_as<const char*>;
+ { t.symbol_interner() } -> std::same_as<SymbolIntern::symbol_interner_type&>;
+ { ct.symbol_interner() } -> std::same_as<const SymbolIntern::symbol_interner_type&>;
+ { t.error(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.warning(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.note(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.info(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.debug(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.fixit(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.help(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ { t.error(sv) } -> std::same_as<typename T::Writer>;
+ { t.warning(sv) } -> std::same_as<typename T::Writer>;
+ { t.note(sv) } -> std::same_as<typename T::Writer>;
+ { t.info(sv) } -> std::same_as<typename T::Writer>;
+ { t.debug(sv) } -> std::same_as<typename T::Writer>;
+ { t.fixit(sv) } -> std::same_as<typename T::Writer>;
+ { t.help(sv) } -> std::same_as<typename T::Writer>;
+ { std::move(t.error_callback().sink()).finish() } -> std::same_as<std::size_t>;
+ { t.log(diag_kind, std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+ };
+
+ // Satisfied by parse-state types built around an AST: `ast_type` must model
+ // IsAst and `diagnostic_logger_type` must model IsDiagnosticLogger. The type
+ // must be brace-constructible from (file, encoding), (buffer, encoding) and
+ // (path, buffer, encoding), and expose const and non-const `ast()` and
+ // `logger()` accessors.
+ template<typename T>
+ concept IsParseState = requires(
+ T t,
+ const T ct,
+ typename T::ast_type::file_type&& file,
+ lexy::buffer<lexy::default_encoding>&& buffer,
+ ovdl::detail::Encoding encoding,
+ const char* path //
+ ) {
+ requires IsAst<typename T::ast_type>;
+ requires IsDiagnosticLogger<typename T::diagnostic_logger_type>;
+ { T { std::move(file), encoding } } -> std::same_as<T>;
+ { T { std::move(buffer), encoding } } -> std::same_as<T>;
+ { T { path, std::move(buffer), encoding } } -> std::same_as<T>;
+ { t.ast() } -> std::same_as<typename T::ast_type&>;
+ { ct.ast() } -> std::same_as<const typename T::ast_type&>;
+ { t.logger() } -> std::same_as<typename T::diagnostic_logger_type&>;
+ { ct.logger() } -> std::same_as<const typename T::diagnostic_logger_type&>;
+ };
+
+ // Satisfied by parse-state types that hold a file directly rather than an
+ // AST: `file_type` must model IsFile and `diagnostic_logger_type` must model
+ // IsDiagnosticLogger. The type must be brace-constructible from
+ // (file, encoding), (buffer, encoding) and (path, buffer, encoding), and
+ // expose const and non-const `file()` and `logger()` accessors.
+ template<typename T>
+ concept IsFileParseState = requires(
+ T t,
+ const T ct,
+ typename T::file_type&& file,
+ lexy::buffer<lexy::default_encoding>&& buffer,
+ ovdl::detail::Encoding encoding,
+ const char* path //
+ ) {
+ requires IsFile<typename T::file_type>;
+ requires IsDiagnosticLogger<typename T::diagnostic_logger_type>;
+ { T { std::move(file), encoding } } -> std::same_as<T>;
+ { T { std::move(buffer), encoding } } -> std::same_as<T>;
+ { T { path, std::move(buffer), encoding } } -> std::same_as<T>;
+ { t.file() } -> std::same_as<typename T::file_type&>;
+ { ct.file() } -> std::same_as<const typename T::file_type&>;
+ { t.logger() } -> std::same_as<typename T::diagnostic_logger_type&>;
+ { ct.logger() } -> std::same_as<const typename T::diagnostic_logger_type&>;
+ };
+
+ // Satisfied by any type that models either IsParseState or IsFileParseState.
+ template<typename T>
+ concept IsStateType = IsParseState<T> || IsFileParseState<T>;
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/ParseHandler.cpp b/src/openvic-dataloader/detail/ParseHandler.cpp
new file mode 100644
index 0000000..3818433
--- /dev/null
+++ b/src/openvic-dataloader/detail/ParseHandler.cpp
@@ -0,0 +1,347 @@
+#include "ParseHandler.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <string_view>
+#include <type_traits>
+
+#include <openvic-dataloader/detail/Encoding.hpp>
+
+using namespace ovdl::detail;
+
+#ifdef _WIN32
+#include <array>
+#include <cstdint>
+#include <utility>
+
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#undef WIN32_LEAN_AND_MEAN
+#endif
+
+// Fixed-size character array wrapper built from a string literal holding a
+// language code. N counts the literal's terminating null, so the code itself
+// is N - 1 characters long.
+template<size_t N>
+struct LangCodeLiteral {
+    char value[N];
+
+    // Copies all N characters of the literal, terminator included.
+    constexpr LangCodeLiteral(const char (&str)[N]) {
+        for (std::size_t pos = 0; pos < N; ++pos) {
+            value[pos] = str[pos];
+        }
+    }
+
+    // Length of the code without the terminating null, callable as `size()`.
+    static constexpr std::integral_constant<std::size_t, N - 1> size = {};
+
+    constexpr const char& operator[](std::size_t index) const noexcept {
+        return value[index];
+    }
+
+    // Views the code without its terminating null.
+    constexpr operator std::string_view() const noexcept {
+        return std::string_view(value, size());
+    }
+
+    // True when `view` begins with this code and the character immediately
+    // after the code is an underscore.
+    constexpr bool operator==(const std::string_view view) const noexcept {
+        if (view.size() <= size()) {
+            return false;
+        }
+        if (view[size()] != '_') {
+            return false;
+        }
+        return view.substr(0, size()) == std::string_view(value, size());
+    }
+};
+
+// Non-owning view of a language/locale code together with a validity flag.
+struct LangCodeView {
+    std::string_view view;
+    // Initialised so a default-constructed view never carries an indeterminate flag.
+    bool is_valid = false;
+
+    constexpr LangCodeView() = default;
+
+    // String literals are taken as-is and always considered valid.
+    template<std::size_t N>
+    constexpr LangCodeView(const char (&str)[N]) : view(str), is_valid(true) {}
+
+    // Wraps a runtime C string such as the result of std::getenv. A null
+    // pointer (variable not set) yields an empty, invalid view instead of
+    // constructing a string_view from nullptr, which is undefined behaviour.
+    // A non-null string is valid only when it contains an underscore.
+    constexpr LangCodeView(char* str) : view(str != nullptr ? std::string_view(str) : std::string_view()) {
+        is_valid = view.find('_') != std::string_view::npos;
+    }
+
+    constexpr std::size_t size() const noexcept {
+        return view.size();
+    }
+
+    constexpr const char& operator[](std::size_t index) const noexcept {
+        return view[index];
+    }
+
+    constexpr operator std::string_view() const noexcept {
+        return view;
+    }
+
+    // True when this view is valid and begins with `literal`.
+    // NOTE(review): this is a pure prefix test, so a longer code such as "mni"
+    // also matches the literal "mn" - confirm whether a boundary check is wanted.
+    template<std::size_t N>
+    constexpr bool operator==(const LangCodeLiteral<N>& literal) const {
+        return is_valid && size() >= LangCodeLiteral<N>::size() && view.starts_with(literal);
+    }
+};
+
+// Holds a reference to an optional encoding and assigns it when a language
+// code comparison succeeds.
+struct FallbackSetter {
+    std::optional<Encoding>& fallback;
+
+    // Compares `view` with LangCode using operator==. On a match `_Encoding`
+    // is written through `fallback`. Returns whether the comparison matched.
+    template<Encoding _Encoding, LangCodeLiteral LangCode>
+    constexpr bool encoded(auto&& view) const {
+        const bool matched = view == LangCode;
+        if (matched) {
+            fallback = _Encoding;
+        }
+        return matched;
+    }
+};
+
+// Derives a fallback encoding from the system language and stores it in
+// `_system_fallback_encoding`.
+//
+// On Windows the language comes from GetSystemDefaultLCID(); elsewhere it is
+// read from the LANG environment variable. Codes listed under WIN1252 select
+// Windows-1252, codes listed under WIN1251 select Windows-1251, and anything
+// else (including a missing or too-short code) leaves Encoding::Unknown.
+void ParseHandler::_detect_system_fallback_encoding() {
+    _system_fallback_encoding = Encoding::Unknown;
+    LangCodeView lang_code;
+
+#ifdef _WIN32
+    using namespace std::string_view_literals;
+
+    // Every Windows language id mapped to a language code according to https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/63d3d639-7fd2-4afb-abbe-0d5b5551eef8
+    constexpr std::array lang_id_to_lang_code = std::to_array<std::pair<std::uint8_t, LangCodeView>>({
+        { 0x0001, "ar" },
+        { 0x0002, "bg" },
+        { 0x0003, "ca" },
+        { 0x0004, "zh" },
+        { 0x0005, "cs" },
+        { 0x0006, "da" },
+        { 0x0007, "de" },
+        { 0x0008, "el" },
+        { 0x0009, "en" },
+        { 0x000A, "es" },
+        { 0x000B, "fi" },
+        { 0x000C, "fr" },
+        { 0x000D, "he" },
+        { 0x000E, "hu" },
+        { 0x000F, "is" },
+        { 0x0010, "it" },
+        { 0x0011, "ja" },
+        { 0x0012, "ko" },
+        { 0x0013, "nl" },
+        { 0x0014, "no" },
+        { 0x0015, "pl" },
+        { 0x0016, "pt" },
+        { 0x0017, "rm" },
+        { 0x0018, "ro" },
+        { 0x0019, "ru" },
+        { 0x001A, "hr" },
+        { 0x001B, "sk" },
+        { 0x001C, "sq" },
+        { 0x001D, "sv" },
+        { 0x001E, "th" },
+        { 0x001F, "tr" },
+        { 0x0020, "ur" },
+        { 0x0021, "id" },
+        { 0x0022, "uk" },
+        { 0x0023, "be" },
+        { 0x0024, "sl" },
+        { 0x0025, "et" },
+        { 0x0026, "lv" },
+        { 0x0027, "lt" },
+        { 0x0028, "tg" },
+        { 0x0029, "fa" },
+        { 0x002A, "vi" },
+        { 0x002B, "hy" },
+        { 0x002C, "az" },
+        { 0x002D, "eu" },
+        { 0x002E, "hsb" },
+        { 0x002F, "mk" },
+        { 0x0030, "st" },
+        { 0x0031, "ts" },
+        { 0x0032, "tn" },
+        { 0x0033, "ve" },
+        { 0x0034, "xh" },
+        { 0x0035, "zu" },
+        { 0x0036, "af" },
+        { 0x0037, "ka" },
+        { 0x0038, "fo" },
+        { 0x0039, "hi" },
+        { 0x003A, "mt" },
+        { 0x003B, "se" },
+        { 0x003C, "ga" },
+        { 0x003D, "yi" },
+        { 0x003E, "ms" },
+        { 0x003F, "kk" },
+        { 0x0040, "ky" },
+        { 0x0041, "sw" },
+        { 0x0042, "tk" },
+        { 0x0043, "uz" },
+        { 0x0044, "tt" },
+        { 0x0045, "bn" },
+        { 0x0046, "pa" },
+        { 0x0047, "gu" },
+        { 0x0048, "or" },
+        { 0x0049, "ta" },
+        { 0x004A, "te" },
+        { 0x004B, "kn" },
+        { 0x004C, "ml" },
+        { 0x004D, "as" },
+        { 0x004E, "mr" },
+        { 0x004F, "sa" },
+        { 0x0050, "mn" },
+        { 0x0051, "bo" },
+        { 0x0052, "cy" },
+        { 0x0053, "km" },
+        { 0x0054, "lo" },
+        { 0x0055, "my" },
+        { 0x0056, "gl" },
+        { 0x0057, "kok" },
+        { 0x0058, "mni" },
+        { 0x0059, "sd" },
+        { 0x005A, "syr" },
+        { 0x005B, "si" },
+        { 0x005C, "chr" },
+        { 0x005D, "iu" },
+        { 0x005E, "am" },
+        { 0x005F, "tzm" },
+        { 0x0060, "ks" },
+        { 0x0061, "ne" },
+        { 0x0062, "fy" },
+        { 0x0063, "ps" },
+        { 0x0064, "fil" },
+        { 0x0065, "dv" },
+        { 0x0066, "bin" },
+        { 0x0067, "ff" },
+        { 0x0068, "ha" },
+        { 0x0069, "ibb" },
+        { 0x006A, "yo" },
+        { 0x006B, "quz" },
+        { 0x006C, "nso" },
+        { 0x006D, "ba" },
+        { 0x006E, "lb" },
+        { 0x006F, "kl" },
+        { 0x0070, "ig" },
+        { 0x0071, "kr" },
+        { 0x0072, "om" },
+        { 0x0073, "ti" },
+        { 0x0074, "gn" },
+        { 0x0075, "haw" },
+        { 0x0076, "la" },
+        { 0x0077, "so" },
+        { 0x0078, "ii" },
+        { 0x0079, "pap" },
+        { 0x007A, "arn" },
+        { 0x007C, "moh" },
+        { 0x007E, "br" },
+        { 0x0080, "ug" },
+        { 0x0081, "mi" },
+        { 0x0082, "oc" },
+        { 0x0083, "co" },
+        { 0x0084, "gsw" },
+        { 0x0085, "sah" },
+        { 0x0086, "qut" },
+        { 0x0087, "rw" },
+        { 0x0088, "wo" },
+        { 0x008C, "prs" },
+        { 0x0091, "gd" },
+        { 0x0092, "ku" },
+        { 0x0093, "quc" } //
+    });
+
+    // LCID layout: the low 16 bits are the language identifier, the next 4 bits
+    // the sort id, and the top 12 bits are reserved. Within the language
+    // identifier the low 10 bits are the primary language id and the remaining
+    // 6 bits the sublanguage.
+    // The primary language id is masked out directly rather than overlaying a
+    // packed bit-field struct: bit-fields of differently sized types need not
+    // share storage, so such a struct can be larger than the 4-byte LCID and
+    // copying sizeof(struct) bytes would read past it.
+    const std::uint32_t locale_int = GetSystemDefaultLCID();
+    const std::uint32_t lang_id = locale_int & 0x3FFu;
+
+    for (const auto& map : lang_id_to_lang_code) {
+        if (map.first != lang_id) continue;
+        lang_code = map.second;
+        break;
+    }
+#else
+    // std::getenv returns a null pointer when LANG is not set; leave lang_code
+    // empty in that case so the size check below bails out.
+    if (char* lang_env = std::getenv("LANG")) {
+        lang_code = lang_env;
+    }
+#endif
+
+    constexpr FallbackSetter setter { _system_fallback_encoding };
+
+    if (lang_code.size() < 2) {
+        _system_fallback_encoding = Encoding::Unknown;
+        return;
+    }
+
+    // Each macro use tests lang_code against one language code and returns from
+    // this function as soon as one matches.
+#define WIN1251(LANG_CODE) \
+    if (setter.encoded<Encoding::Windows1251, #LANG_CODE>(lang_code)) return;
+
+#define WIN1252(LANG_CODE) \
+    if (setter.encoded<Encoding::Windows1252, #LANG_CODE>(lang_code)) return;
+
+    // More common, prefer
+    WIN1252(en);
+    WIN1252(es);
+    WIN1252(fr);
+    WIN1252(de);
+
+    WIN1251(ru);
+
+    WIN1252(af);
+    WIN1252(sq);
+    WIN1252(eu);
+    WIN1252(br);
+    WIN1252(co);
+    WIN1252(fo);
+    WIN1252(gl);
+    WIN1252(is);
+    WIN1252(io);
+    WIN1252(ga);
+    WIN1252(id);
+    WIN1252(in);
+    WIN1252(it);
+    WIN1252(lb);
+    WIN1252(ms);
+    WIN1252(gv);
+    WIN1252(no);
+    WIN1252(oc);
+    WIN1252(pt);
+    WIN1252(gd);
+    WIN1252(sw);
+    WIN1252(fi);
+    WIN1252(da);
+    WIN1252(et);
+    WIN1252(tn);
+    WIN1252(ca);
+    WIN1252(rm);
+    WIN1252(nl);
+    WIN1252(sl);
+    WIN1252(cy);
+    WIN1252(hu);
+
+    WIN1251(be);
+    WIN1251(uk);
+    WIN1251(bg);
+    WIN1251(kk);
+    WIN1251(tg);
+    WIN1251(sr);
+    WIN1251(ky);
+    WIN1251(mn);
+    WIN1251(mk);
+    WIN1251(mo);
+
+    // The remaining codes are three characters long.
+    if (lang_code.size() < 3) {
+        return;
+    }
+
+    WIN1251(mol);
+
+    WIN1252(ast);
+    WIN1252(jbo);
+    WIN1252(gla);
+    WIN1252(sco);
+    WIN1252(sma);
+    WIN1252(roo);
+    WIN1252(swa);
+    WIN1252(tsn);
+    WIN1252(tok);
+
+#undef WIN1251
+#undef WIN1252
+} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/ParseHandler.hpp b/src/openvic-dataloader/detail/ParseHandler.hpp
index fbec0d7..9666a5b 100644
--- a/src/openvic-dataloader/detail/ParseHandler.hpp
+++ b/src/openvic-dataloader/detail/ParseHandler.hpp
@@ -1,20 +1,26 @@
#pragma once
+#include <cstddef>
+#include <optional>
+#include <string>
#include <utility>
-#include <openvic-dataloader/ParseState.hpp>
-#include <openvic-dataloader/detail/utility/Concepts.hpp>
+#include <openvic-dataloader/detail/Concepts.hpp>
#include <lexy/encoding.hpp>
#include <lexy/input/buffer.hpp>
#include <lexy/input/file.hpp>
+#include "openvic-dataloader/detail/Encoding.hpp"
+#include "openvic-dataloader/detail/Utility.hpp"
+
#include "detail/BufferError.hpp"
+#include "detail/Detect.hpp"
+#include "detail/InternalConcepts.hpp"
namespace ovdl::detail {
- template<typename Derived>
struct ParseHandler {
- std::string make_error_from(buffer_error error) {
+ std::string make_error_from(buffer_error error) const {
switch (error) {
using enum ovdl::detail::buffer_error;
case buffer_is_null:
@@ -30,116 +36,179 @@ namespace ovdl::detail {
}
}
- template<typename... Args>
- constexpr void _run_load_func(detail::LoadCallback<Derived, Args...> auto func, Args... args);
- };
-
- template<IsFileParseState ParseState, typename MemoryResource = void>
- struct BasicFileParseHandler : ParseHandler<BasicFileParseHandler<ParseState, MemoryResource>> {
- using parse_state_type = ParseState;
- using encoding_type = typename parse_state_type::file_type::encoding_type;
-
constexpr bool is_valid() const {
- if (!_parse_state) return false;
- return buffer().data() != nullptr;
+ return is_valid_impl();
}
- constexpr buffer_error load_buffer_size(const char* data, std::size_t size) {
- lexy::buffer<encoding_type, MemoryResource> buffer(data, size);
+ buffer_error load_buffer_size(const char* data, std::size_t size, std::optional<Encoding> fallback) {
+ lexy::buffer<lexy::default_encoding> buffer(data, size);
if (buffer.data() == nullptr) return buffer_error::buffer_is_null;
- _parse_state.reset(new parse_state_type { std::move(buffer) });
- return is_valid() ? buffer_error::success : buffer_error::buffer_is_null;
+ return load_buffer_impl(std::move(buffer), "", fallback);
}
- constexpr buffer_error load_buffer(const char* start, const char* end) {
- lexy::buffer<encoding_type, MemoryResource> buffer(start, end);
+ buffer_error load_buffer(const char* start, const char* end, std::optional<Encoding> fallback) {
+ lexy::buffer<lexy::default_encoding> buffer(start, end);
if (buffer.data() == nullptr) return buffer_error::buffer_is_null;
- _parse_state.reset(new parse_state_type { std::move(buffer) });
- return is_valid() ? buffer_error::success : buffer_error::buffer_is_null;
+ return load_buffer_impl(std::move(buffer), "", fallback);
}
- buffer_error load_file(const char* path) {
- lexy::read_file_result file = lexy::read_file<encoding_type, lexy::encoding_endianness::bom, MemoryResource>(path);
+ buffer_error load_file(const char* path, std::optional<Encoding> fallback) {
+ lexy::read_file_result file = lexy::read_file<lexy::default_encoding, lexy::encoding_endianness::bom>(path);
+
if (!file) {
- _parse_state.reset(new parse_state_type { path, lexy::buffer<typename parse_state_type::file_type::encoding_type>() });
return ovdl::detail::from_underlying<buffer_error>(ovdl::detail::to_underlying(file.error()));
}
- _parse_state.reset(new parse_state_type { path, std::move(file).buffer() });
- return is_valid() ? buffer_error::success : buffer_error::buffer_is_null;
+
+ return load_buffer_impl(std::move(file).buffer(), path, fallback);
}
const char* path() const {
+ return path_impl();
+ }
+
+ // Returns the cached system fallback encoding, or Encoding::Unknown when
+ // none has been stored yet.
+ static Encoding get_system_fallback() {
+ return _system_fallback_encoding.value_or(Encoding::Unknown);
+ }
+
+ virtual ~ParseHandler() = default;
+
+ protected:
+ constexpr virtual bool is_valid_impl() const = 0;
+ constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path = "", std::optional<Encoding> fallback = std::nullopt) = 0;
+ virtual const char* path_impl() const = 0;
+
+ // Emplaces a State into `*state` from `buffer` re-wrapped as
+ // lexy::buffer<BufferEncoding>. A non-empty `path` is passed to the State
+ // constructor as its first argument; an empty path selects the
+ // (buffer, encoding) constructor instead.
+ template<detail::IsStateType State, detail::IsEncoding BufferEncoding>
+ static constexpr auto generate_state = [](std::optional<State>* state, const char* path, auto&& buffer, Encoding encoding) {
+ if (path[0] != '\0') {
+ state->emplace(
+ path,
+ lexy::buffer<BufferEncoding>(std::move(buffer)),
+ encoding);
+ return;
+ }
+ state->emplace(lexy::buffer<BufferEncoding>(std::move(buffer)), encoding);
+ };
+
+ // Detects the encoding of `buffer` and emplaces a matching State into `*state`.
+ //
+ // The system fallback encoding is computed on first use. A caller-supplied
+ // `fallback` of Ascii or Utf8 is rejected and replaced by the system
+ // fallback, as is a missing one. After the state exists, diagnostics are
+ // logged for an undistinguished result, a rejected fallback and an Unknown
+ // detection result.
+ template<detail::IsStateType State>
+ static void create_state(std::optional<State>* state, const char* path, lexy::buffer<lexy::default_encoding>&& buffer, std::optional<Encoding> fallback) {
+ if (!_system_fallback_encoding.has_value()) {
+ _detect_system_fallback_encoding();
+ }
+ bool is_bad_fallback = false;
+ if (fallback.has_value()) {
+ is_bad_fallback = fallback.value() == Encoding::Ascii || fallback.value() == Encoding::Utf8;
+ if (is_bad_fallback)
+ fallback = _system_fallback_encoding.value();
+ } else {
+ fallback = _system_fallback_encoding.value();
+ }
+ auto [encoding, is_alone] = encoding_detect::Detector { .default_fallback = fallback.value() }.detect_assess(buffer);
+ // Ascii and Utf8 results are stored in a utf8_char_encoding buffer; every
+ // other handled result keeps lexy::default_encoding.
+ switch (encoding) {
+ using enum Encoding;
+ case Ascii:
+ case Utf8: {
+ generate_state<State, lexy::utf8_char_encoding>(state, path, std::move(buffer), encoding);
+ break;
+ }
+ case Unknown:
+ case Windows1251:
+ case Windows1252: {
+ generate_state<State, lexy::default_encoding>(state, path, std::move(buffer), encoding);
+ break;
+ }
+ default:
+ ovdl::detail::unreachable();
+ }
+
+ if (!is_alone) {
+ (*state)->logger().info("encoding type could not be distinguished");
+ }
+
+ if (is_bad_fallback) {
+ (*state)->logger().warning("fallback encoding cannot be ascii or utf8");
+ }
+
+ if (encoding == ovdl::detail::Encoding::Unknown) {
+ (*state)->logger().warning("could not detect encoding");
+ }
+ }
+
+ private:
+ inline static std::optional<Encoding> _system_fallback_encoding = std::nullopt;
+ static void _detect_system_fallback_encoding();
+ };
+
+ template<detail::IsFileParseState ParseState>
+ struct BasicFileParseHandler : ParseHandler {
+ using parse_state_type = ParseState;
+
+ virtual constexpr bool is_valid_impl() const {
+ if (!_parse_state) return false;
+ return _parse_state.value().file().is_valid();
+ }
+
+ constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path, std::optional<Encoding> fallback) {
+ if (buffer.data() == nullptr) return buffer_error::buffer_is_null;
+ create_state(&_parse_state, path, std::move(buffer), fallback);
+ return is_valid_impl() ? buffer_error::success : buffer_error::buffer_is_null;
+ }
+
+ virtual const char* path_impl() const {
if (!_parse_state) return "";
- return _parse_state->file().path();
+ return _parse_state.value().file().path();
}
parse_state_type& parse_state() {
- return *_parse_state;
+ return _parse_state.value();
}
const parse_state_type& parse_state() const {
- return *_parse_state;
+ return _parse_state.value();
}
+ template<typename Encoding>
constexpr const auto& buffer() const {
- return _parse_state->file().buffer();
+ return _parse_state.value().file().template get_buffer_as<Encoding>();
}
protected:
- std::unique_ptr<parse_state_type> _parse_state;
+ std::optional<parse_state_type> _parse_state;
};
- template<IsParseState ParseState, typename MemoryResource = void>
- struct BasicStateParseHandler : ParseHandler<BasicStateParseHandler<ParseState, MemoryResource>> {
+ template<detail::IsParseState ParseState>
+ struct BasicStateParseHandler : ParseHandler {
using parse_state_type = ParseState;
- using encoding_type = typename parse_state_type::ast_type::file_type::encoding_type;
- constexpr bool is_valid() const {
+ virtual constexpr bool is_valid_impl() const {
if (!_parse_state) return false;
- return buffer().data() != nullptr;
- }
-
- constexpr buffer_error load_buffer_size(const char* data, std::size_t size) {
- lexy::buffer<encoding_type, MemoryResource> buffer(data, size);
- _parse_state.reset(new parse_state_type { std::move(buffer) });
- return is_valid() ? buffer_error::success : buffer_error::buffer_is_null;
- }
-
- constexpr buffer_error load_buffer(const char* start, const char* end) {
- lexy::buffer<encoding_type, MemoryResource> buffer(start, end);
- _parse_state.reset(new parse_state_type { std::move(buffer) });
- return is_valid() ? buffer_error::success : buffer_error::buffer_is_null;
+ return _parse_state.value().ast().file().is_valid();
}
- buffer_error load_file(const char* path) {
- lexy::read_file_result file = lexy::read_file<encoding_type, lexy::encoding_endianness::bom, MemoryResource>(path);
- if (!file) {
- _parse_state.reset(new parse_state_type { path, lexy::buffer<typename parse_state_type::ast_type::file_type::encoding_type>() });
- return ovdl::detail::from_underlying<buffer_error>(ovdl::detail::to_underlying(file.error()));
- }
-
- _parse_state.reset(new parse_state_type { path, std::move(file).buffer() });
- return is_valid() ? buffer_error::success : buffer_error::buffer_is_null;
+ constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path, std::optional<Encoding> fallback) {
+ if (buffer.data() == nullptr) return buffer_error::buffer_is_null;
+ create_state(&_parse_state, path, std::move(buffer), fallback);
+ return is_valid_impl() ? buffer_error::success : buffer_error::buffer_is_null;
}
- const char* path() const {
+ virtual const char* path_impl() const {
if (!_parse_state) return "";
- return _parse_state->ast().file().path();
+ return _parse_state.value().ast().file().path();
}
parse_state_type& parse_state() {
- return *_parse_state;
+ return _parse_state.value();
}
const parse_state_type& parse_state() const {
- return *_parse_state;
+ return _parse_state.value();
}
+ template<typename Encoding>
constexpr const auto& buffer() const {
- return _parse_state->ast().file().buffer();
+ return _parse_state.value().ast().file().template get_buffer_as<Encoding>();
}
protected:
- std::unique_ptr<parse_state_type> _parse_state;
+ std::optional<parse_state_type> _parse_state;
};
} \ No newline at end of file
diff --git a/src/openvic-dataloader/detail/Warnings.hpp b/src/openvic-dataloader/detail/Warnings.hpp
index ab718bc..3a0a239 100644
--- a/src/openvic-dataloader/detail/Warnings.hpp
+++ b/src/openvic-dataloader/detail/Warnings.hpp
@@ -1,18 +1,17 @@
#pragma once
+#include <string>
#include <string_view>
-#include <openvic-dataloader/ParseWarning.hpp>
-
namespace ovdl::v2script::warnings {
inline const std::string make_utf8_warning(std::string_view file_path) {
- constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding.";
+ constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding:";
std::string message;
if (file_path.empty()) {
- message = "Buffer is a UTF-8 encoded string. " + std::string(message_suffix);
+ message = "Buffer is UTF-8 encoded. " + std::string(message_suffix);
} else {
- message = "File '" + std::string(file_path) + "' is a UTF-8 encoded file. " + std::string(message_suffix);
+ message = "File is UTF-8 encoded. " + std::string(message_suffix);
}
return message;
diff --git a/src/openvic-dataloader/detail/dsl.hpp b/src/openvic-dataloader/detail/dsl.hpp
index ccc1af6..fd8981a 100644
--- a/src/openvic-dataloader/detail/dsl.hpp
+++ b/src/openvic-dataloader/detail/dsl.hpp
@@ -1,16 +1,20 @@
#pragma once
+#include <concepts> // IWYU pragma: keep
#include <type_traits>
#include <openvic-dataloader/NodeLocation.hpp>
-#include <openvic-dataloader/ParseState.hpp>
+#include <lexy/_detail/config.hpp>
#include <lexy/callback/adapter.hpp>
#include <lexy/callback/bind.hpp>
#include <lexy/callback/container.hpp>
#include <lexy/callback/fold.hpp>
#include <lexy/dsl.hpp>
+#include <lexy/dsl/literal.hpp>
+#include <lexy/encoding.hpp>
+#include "detail/InternalConcepts.hpp"
#include "detail/StringLiteral.hpp"
namespace ovdl::dsl {
@@ -20,10 +24,46 @@ namespace ovdl::dsl {
}
template<typename Sink>
- constexpr auto sink(Sink sink) {
+ constexpr auto bind_sink(Sink sink) {
return lexy::bind_sink(sink, lexy::parse_state);
}
+ // Adapter around a sink-style callback that passes the parse state as the
+ // first argument on every call. `return_type` is fixed to ReturnT.
+ template<typename ReturnT, typename Sink>
+ struct _sink_with_state {
+ using return_type = ReturnT;
+
+ LEXY_EMPTY_MEMBER Sink _sink_cb;
+
+ // Sink callback bound to one state object: each invocation calls the
+ // wrapped sink callback with the state followed by the given arguments.
+ template<detail::IsStateType StateType, typename SinkCallback>
+ struct _sink_callback {
+ StateType& _state;
+ SinkCallback _sink_cb;
+
+ using return_type = decltype(LEXY_MOV(_sink_cb).finish());
+
+ template<typename... Args>
+ constexpr void operator()(Args&&... args) {
+ lexy::_detail::invoke(_sink_cb, _state, LEXY_FWD(args)...);
+ }
+
+ // Finishes the wrapped sink callback and returns its result.
+ constexpr return_type finish() && { return LEXY_MOV(_sink_cb).finish(); }
+ };
+
+ // Direct (non-sink) invocation: calls the wrapped callback with the state
+ // and arguments.
+ template<typename... Args>
+ constexpr auto operator()(detail::IsStateType auto& state, Args... args) const -> decltype(_sink_cb(state, LEXY_FWD(args)...)) {
+ return _sink_cb(state, LEXY_FWD(args)...);
+ }
+
+ // Creates a sink callback bound to `state`, held by reference.
+ constexpr auto sink(detail::IsStateType auto& state) const {
+ return _sink_callback<std::decay_t<decltype(state)>, decltype(_sink_cb.sink())> { state, _sink_cb.sink() };
+ }
+ };
+
+ // Wraps `sink` in _sink_with_state<ReturnT, Sink> and binds it to the parse
+ // state through bind_sink. ReturnT cannot be deduced and must be given
+ // explicitly by the caller.
+ // NOTE(review): when `sink` is an lvalue, Sink deduces to a reference type and
+ // the wrapper then stores a reference - confirm callers only pass temporaries.
+ template<typename ReturnT, typename Sink>
+ constexpr auto sink(Sink&& sink) {
+ return bind_sink(_sink_with_state<ReturnT, Sink> { LEXY_FWD(sink) });
+ }
+
template<typename Container, typename Callback>
constexpr auto collect(Callback callback) {
return sink(lexy::collect<Container>(callback));
@@ -34,49 +74,76 @@ namespace ovdl::dsl {
return sink(lexy::collect(callback));
}
- template<IsParseState StateType, typename T>
+ template<typename T>
constexpr auto construct = callback<T*>(
- [](StateType& state, ovdl::NodeLocation loc, auto&& arg) {
- if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, lexy::nullopt>)
+ [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&& arg) {
+ if constexpr (std::same_as<std::decay_t<decltype(arg)>, lexy::nullopt>)
return state.ast().template create<T>(loc);
else
return state.ast().template create<T>(loc, DRYAD_FWD(arg));
},
- [](StateType& state, ovdl::NodeLocation loc, auto&&... args) {
+ [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&&... args) {
return state.ast().template create<T>(loc, DRYAD_FWD(args)...);
});
- template<IsParseState StateType, typename T, typename ListType, bool DisableEmpty = false>
+ template<typename T, typename ListType, bool DisableEmpty = false>
constexpr auto construct_list = callback<T*>(
- [](StateType& state, const char* begin, ListType&& arg, const char* end) {
+ [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) {
return state.ast().template create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(arg));
},
- [](StateType& state, const char* begin, lexy::nullopt, const char* end) {
+ [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) {
return state.ast().template create<T>(NodeLocation::make_from(begin, end));
},
- [](StateType& state, const char* begin, const char* end) {
+ [](detail::IsParseState auto& state, const char* begin, const char* end) {
return state.ast().template create<T>(NodeLocation::make_from(begin, end));
+ },
+ [](detail::IsParseState auto& state) {
+ return nullptr;
});
- template<IsParseState StateType, typename T, typename ListType>
- constexpr auto construct_list<StateType, T, ListType, true> = callback<T*>(
- [](StateType& state, const char* begin, ListType&& arg, const char* end) {
+ template<typename T, typename ListType>
+ constexpr auto construct_list<T, ListType, true> = callback<T*>(
+ [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) {
return state.ast().template create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(arg));
},
- [](StateType& state, const char* begin, lexy::nullopt, const char* end) {
+ [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) {
return state.ast().template create<T>(NodeLocation::make_from(begin, end));
});
- template<unsigned char LOW, unsigned char HIGH>
- consteval auto make_range() {
- if constexpr (LOW == HIGH) {
- return ::lexy::dsl::lit_c<LOW>;
- } else if constexpr (LOW == (HIGH - 1)) {
- return ::lexy::dsl::lit_c<LOW> / ::lexy::dsl::lit_c<HIGH>;
- } else {
- return ::lexy::dsl::lit_c<LOW> / make_range<LOW + 1, HIGH>();
+ template<typename CharT, CharT LowC, CharT HighC>
+ struct _crange : lexyd::char_class_base<_crange<CharT, LowC, HighC>> {
+ static_assert(LowC >= 0, "LowC cannot be less than 0");
+ static_assert(HighC - LowC > 0, "LowC must be less than HighC");
+
+ static constexpr auto char_class_unicode() {
+ return LowC <= 0x7F && HighC <= 0x7F;
}
- }
+
+ static LEXY_CONSTEVAL auto char_class_name() {
+ return "range";
+ }
+
+ // Builds the ASCII set for this range. The set is filled only when both
+ // bounds are ASCII (<= 0x7F); otherwise it is returned empty.
+ static LEXY_CONSTEVAL auto char_class_ascii() {
+     lexy::_detail::ascii_set result;
+     if constexpr (LowC <= 0x7F && HighC <= 0x7F) {
+         // Count in int rather than CharT: with a narrow CharT such as char and
+         // HighC == 0x7F, incrementing past HighC would wrap around and the
+         // loop condition would never become false.
+         for (int c = static_cast<int>(LowC); c <= static_cast<int>(HighC); ++c) {
+             result.insert(c);
+         }
+     }
+     return result;
+ }
+
+ // Code-point matcher for the range. When both bounds are ASCII it returns
+ // std::false_type; otherwise it tests LowC <= cp <= HighC.
+ static constexpr auto char_class_match_cp([[maybe_unused]] char32_t cp) {
+ if constexpr (LowC <= 0x7F && HighC <= 0x7F)
+ return std::false_type {};
+ else
+ return LowC <= cp && cp <= HighC;
+ }
+ };
+
+ // Character-class rule for the inclusive range [LowC, HighC]; the character
+ // type is taken from LowC.
+ template<auto LowC, decltype(LowC) HighC>
+ constexpr auto lit_c_range = _crange<LEXY_DECAY_DECLTYPE(LowC), LowC, HighC> {};
+
+ // Same as lit_c_range, with the bounds given as unsigned char values.
+ template<unsigned char LowC, unsigned char HighC>
+ constexpr auto lit_b_range = _crange<unsigned char, LowC, HighC> {};
template<auto Open, auto Close>
constexpr auto position_brackets = lexy::dsl::brackets(lexy::dsl::position(lexy::dsl::lit_c<Open>), lexy::dsl::position(lexy::dsl::lit_c<Close>));
@@ -89,14 +156,13 @@ namespace ovdl::dsl {
template<typename Production>
constexpr auto p = lexy::dsl::position(lexy::dsl::p<Production>);
- template<IsParseState ParseType, typename ReturnType, ovdl::detail::string_literal Keyword>
+ template<typename ReturnType, ovdl::detail::string_literal Keyword>
static constexpr auto default_kw_value = dsl::callback<ReturnType*>(
- [](ParseType& state, NodeLocation loc) {
+ [](detail::IsParseState auto& state, NodeLocation loc) {
return state.ast().template create<ReturnType>(loc, state.ast().intern(Keyword.data(), Keyword.size()));
});
template<
- IsParseState ParseType,
auto Identifier,
typename RuleValue,
ovdl::detail::string_literal Keyword,
@@ -109,18 +175,17 @@ namespace ovdl::dsl {
static constexpr auto value = Value;
};
static constexpr auto rule = dsl::p<rule_t> >> Production;
- static constexpr auto value = construct<ParseType, RuleValue>;
+ static constexpr auto value = construct<RuleValue>;
};
template<
- IsParseState ParseType,
auto Identifier,
typename RuleValue,
ovdl::detail::string_literal Keyword,
auto Production,
auto Value>
- struct fkeyword_rule : keyword_rule<ParseType, Identifier, RuleValue, Keyword, Production, Value> {
- using base_type = keyword_rule<ParseType, Identifier, RuleValue, Keyword, Production, Value>;
+ struct fkeyword_rule : keyword_rule<Identifier, RuleValue, Keyword, Production, Value> {
+ using base_type = keyword_rule<Identifier, RuleValue, Keyword, Production, Value>;
struct context_t;
struct rule_t : base_type::rule_t {
static constexpr auto flag = lexy::dsl::context_flag<context_t>;
@@ -139,7 +204,7 @@ namespace ovdl::dsl {
static constexpr auto make_flag = rule_t::flag.create();
static constexpr auto rule = dsl::p<rule_t> >> (rule_t::must >> rule_t::flag.set()) >> Production;
- static constexpr auto value = construct<ParseType, RuleValue>;
+ static constexpr auto value = construct<RuleValue>;
};
template<typename... Args>
@@ -147,4 +212,71 @@ namespace ovdl::dsl {
static constexpr auto flags = (Args::make_flag + ...);
static constexpr auto p = (lexy::dsl::p<Args> | ...);
};
+
+	/// Encoding-aware peek rule: looks ahead with `Rule` or `RuleUtf` without consuming input.
+	///
+	/// `Rule` is matched when the reader's encoding is `lexy::default_encoding` or
+	/// `lexy::byte_encoding`; `RuleUtf` is matched for every other encoding.
+	/// `Tag` is the error tag used when the peek fails in non-branch position; it is passed
+	/// through `lexy::_detail::type_or` together with `lexy::peek_failure`
+	/// (presumably so that a `void` tag falls back to `lexy::peek_failure` - confirm against lexy).
+	template<typename Rule, typename RuleUtf, typename Tag>
+	struct _peek : lexyd::branch_base {
+		/// Branch parser: decides whether the branch is taken by matching the selected rule as a token.
+		template<typename Reader>
+		struct bp {
+			typename Reader::iterator begin;
+			typename Reader::marker end;
+
+			/// Tries the rule on a copy of the reader (taken by value), so the caller's reader is not advanced.
+			/// Records where the attempt started and where the token parser stopped.
+			constexpr bool try_parse(const void*, Reader reader) {
+				using encoding = typename Reader::encoding;
+
+				auto parser = [&] {
+					if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+						// We need to match the entire rule.
+						return lexy::token_parser_for<decltype(lexy::dsl::token(Rule {})), Reader> { reader };
+					} else {
+						// We need to match the entire rule.
+						return lexy::token_parser_for<decltype(lexy::dsl::token(RuleUtf {})), Reader> { reader };
+					}
+				}();
+
+				begin = reader.position();
+				auto result = parser.try_parse(reader);
+				end = parser.end;
+
+				return result;
+			}
+
+			/// Branch not taken: report the inspected range as backtracked.
+			template<typename Context>
+			constexpr void cancel(Context& context) {
+				context.on(lexyd::_ev::backtracked {}, begin, end.position());
+			}
+
+			/// Branch taken: report the inspected range as backtracked, then continue with the next parser
+			/// on the untouched reader.
+			template<typename NextParser, typename Context, typename... Args>
+			LEXY_PARSER_FUNC bool finish(Context& context, Reader& reader, Args&&... args) {
+				context.on(lexyd::_ev::backtracked {}, begin, end.position());
+				return NextParser::parse(context, reader, LEXY_FWD(args)...);
+			}
+		};
+
+		/// Non-branch parser: a failed peek raises an error, but parsing continues with NextParser either way.
+		template<typename NextParser>
+		struct p {
+			template<typename Context, typename Reader, typename... Args>
+			LEXY_PARSER_FUNC static bool parse(Context& context, Reader& reader, Args&&... args) {
+				bp<Reader> impl {};
+				if (!impl.try_parse(context.control_block, reader)) {
+					// Report that we've failed.
+					using tag = lexy::_detail::type_or<Tag, lexy::peek_failure>;
+					auto err = lexy::error<Reader, tag>(impl.begin, impl.end.position());
+					context.on(lexyd::_ev::error {}, err);
+
+					// But recover immediately, as we wouldn't have consumed anything either way.
+				}
+
+				// `impl.end` is a reader marker; the event takes iterator positions, as in bp::cancel/bp::finish.
+				context.on(lexyd::_ev::backtracked {}, impl.begin, impl.end.position());
+				return NextParser::parse(context, reader, LEXY_FWD(args)...);
+			}
+		};
+
+		/// The same peek rule, reporting `Error` instead of the current tag on failure.
+		template<typename Error>
+		static constexpr _peek<Rule, RuleUtf, Error> error = {};
+	};
+
+	/// Creates an encoding-aware peek rule from a byte/default-encoding rule and its UTF counterpart.
+	/// The resulting rule carries no custom error tag (`void`).
+	template<typename Rule, typename RuleUtf>
+	constexpr auto peek(Rule, RuleUtf) {
+		using peek_rule = _peek<Rule, RuleUtf, void>;
+		return peek_rule {};
+	}
} \ No newline at end of file
diff --git a/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp b/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp
index abade40..5a98b40 100644
--- a/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp
+++ b/src/openvic-dataloader/v2script/AbstractSyntaxTree.cpp
@@ -1,8 +1,7 @@
-#include <stddef.h>
-
-#include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp>
+#include "openvic-dataloader/v2script/AbstractSyntaxTree.hpp"
#include <lexy/dsl/option.hpp>
+#include <lexy/encoding.hpp>
#include <lexy/input_location.hpp>
#include <dryad/node.hpp>
@@ -23,6 +22,15 @@ ListValue::ListValue(dryad::node_ctor ctor, StatementList statements)
}
}
+/// Builds a list node from assignment statements: the statements become this node's children
+/// and the final one (if any) is remembered as the last statement.
+ListValue::ListValue(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) {
+	insert_child_list_after(nullptr, statements);
+	// Start from "no last statement" and only overwrite it when the list has content.
+	_last_statement = nullptr;
+	if (!statements.empty()) {
+		_last_statement = statements.back();
+	}
+}
+
FileTree::FileTree(dryad::node_ctor ctor, StatementList statements) : node_base(ctor) {
insert_child_list_after(nullptr, statements);
if (statements.empty()) {
@@ -32,29 +40,22 @@ FileTree::FileTree(dryad::node_ctor ctor, StatementList statements) : node_base(
}
}
-// static void _handle_string_characters(std::string& string, bool allow_newline) {
-// size_t position = 0;
-// for (auto& c : string) {
-// switch (c) {
-// case '\r':
-// case '\n':
-// if (allow_newline) goto END_LOOP;
-// c = ' ';
-// break;
-// default: break;
-// }
-// END_LOOP:
-// position++;
-// }
-// }
-
-std::string AbstractSyntaxTree::make_list_visualizer() const {
+/// Builds the file root from assignment statements: the statements become this node's children
+/// and the final one (if any) is remembered as the last node.
+FileTree::FileTree(dryad::node_ctor ctor, AssignStatementList statements) : node_base(ctor) {
+	insert_child_list_after(nullptr, statements);
+	// Start from "no last node" and only overwrite it when the list has content.
+	_last_node = nullptr;
+	if (!statements.empty()) {
+		_last_node = statements.back();
+	}
+}
+
+std::string FileAbstractSyntaxTree::make_list_visualizer() const {
const int INDENT_SIZE = 2;
std::string result;
unsigned int level = 0;
- for (auto [event, node] : dryad::traverse(_tree)) {
+ for (auto [event, node] : dryad::traverse(this->_tree)) {
if (event == dryad::traverse_event::exit) {
--level;
continue;
@@ -66,7 +67,7 @@ std::string AbstractSyntaxTree::make_list_visualizer() const {
dryad::visit_node(
node,
[&](const FlatValue* value) {
- result.append(value->value(_symbol_interner));
+ result.append(value->value(this->_symbol_interner));
},
[&](const ListValue* value) {
},
@@ -89,19 +90,19 @@ std::string AbstractSyntaxTree::make_list_visualizer() const {
return result;
}
-std::string AbstractSyntaxTree::make_native_visualizer() const {
+std::string FileAbstractSyntaxTree::make_native_visualizer() const {
constexpr int INDENT_SIZE = 2;
std::string result;
unsigned int level = 0;
dryad::visit_tree(
- _tree,
+ this->_tree,
[&](const IdentifierValue* value) {
- result.append(value->value(_symbol_interner));
+ result.append(value->value(this->_symbol_interner));
},
[&](const StringValue* value) {
- result.append(1, '"').append(value->value(_symbol_interner)).append(1, '"');
+ result.append(1, '"').append(value->value(this->_symbol_interner)).append(1, '"');
},
[&](dryad::child_visitor<NodeKind> visitor, const ValueStatement* statement) {
visitor(statement->value());
diff --git a/src/openvic-dataloader/v2script/EventGrammar.hpp b/src/openvic-dataloader/v2script/EventGrammar.hpp
index 27f6459..130a233 100644
--- a/src/openvic-dataloader/v2script/EventGrammar.hpp
+++ b/src/openvic-dataloader/v2script/EventGrammar.hpp
@@ -11,8 +11,8 @@
#include "openvic-dataloader/NodeLocation.hpp"
-#include "ParseState.hpp"
#include "SimpleGrammar.hpp"
+#include "detail/InternalConcepts.hpp"
#include "detail/dsl.hpp"
#include "v2script/AiBehaviorGrammar.hpp"
#include "v2script/EffectGrammar.hpp"
@@ -28,7 +28,7 @@ namespace ovdl::v2script::grammar {
struct MonthValue {
static constexpr auto rule = lexy::dsl::p<Identifier<StringEscapeOption>>;
static constexpr auto value = dsl::callback<ast::IdentifierValue*>(
- [](ast::ParseState& state, ast::IdentifierValue* value) {
+ [](detail::IsParseState auto& state, ast::IdentifierValue* value) {
bool is_number = true;
for (auto* current = value->value(state.ast().symbol_interner()); *current; current++) {
is_number = is_number && std::isdigit(*current);
@@ -94,7 +94,7 @@ namespace ovdl::v2script::grammar {
static constexpr auto value =
dsl::callback<ast::EventStatement*>(
- [](ast::ParseState& state, NodeLocation loc, ast::IdentifierValue* name, ast::ListValue* list) {
+ [](detail::IsParseState auto& state, NodeLocation loc, ast::IdentifierValue* name, ast::ListValue* list) {
static auto country_decl = state.ast().intern_cstr("country_event");
static auto province_decl = state.ast().intern_cstr("province_event");
@@ -104,7 +104,7 @@ namespace ovdl::v2script::grammar {
.finish();
}
- return state.ast().create<ast::EventStatement>(loc, name->value(state.ast().symbol_interner()) == province_decl, list);
+ return state.ast().template create<ast::EventStatement>(loc, name->value(state.ast().symbol_interner()) == province_decl, list);
});
};
diff --git a/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp b/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp
index 96cce99..885413c 100644
--- a/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp
+++ b/src/openvic-dataloader/v2script/LuaDefinesGrammar.hpp
@@ -4,9 +4,12 @@
#include <lexy/_detail/config.hpp>
#include <lexy/dsl.hpp>
+#include <lexy/dsl/delimited.hpp>
+#include <lexy/dsl/recover.hpp>
+#include <lexy/dsl/unicode.hpp>
-#include "ParseState.hpp"
#include "SimpleGrammar.hpp"
+#include "detail/InternalConcepts.hpp"
#include "detail/dsl.hpp"
namespace ovdl::v2script::lua::grammar {
@@ -21,90 +24,118 @@ namespace ovdl::v2script::lua::grammar {
template<typename T>
constexpr auto construct_list = v2script::grammar::construct_list<T>;
- struct ParseOptions {
- };
-
- template<ParseOptions Options>
struct StatementListBlock;
static constexpr auto comment_specifier = LEXY_LIT("--") >> lexy::dsl::until(lexy::dsl::newline).or_eof();
- template<ParseOptions Options>
struct Identifier {
static constexpr auto rule = lexy::dsl::identifier(lexy::dsl::ascii::alpha_underscore, lexy::dsl::ascii::alpha_digit_underscore);
- static constexpr auto value = callback<ast::IdentifierValue*>(
- [](ast::ParseState& state, auto lexeme) {
- auto value = state.ast().intern(lexeme.data(), lexeme.size());
- return state.ast().create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value);
- });
+ static constexpr auto value =
+ callback<ast::IdentifierValue*>(
+ [](detail::IsParseState auto& state, auto lexeme) {
+ auto value = state.ast().intern(lexeme.data(), lexeme.size());
+ return state.ast().template create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value);
+ });
};
- template<ParseOptions Options>
struct Value {
static constexpr auto rule = lexy::dsl::identifier(lexy::dsl::ascii::digit / lexy::dsl::lit_c<'.'> / lexy::dsl::lit_c<'-'>);
- static constexpr auto value = callback<ast::IdentifierValue*>(
- [](ast::ParseState& state, auto lexeme) {
- auto value = state.ast().intern(lexeme.data(), lexeme.size());
- return state.ast().create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value);
- });
- };
-
- template<ParseOptions Options>
- struct String {
- static constexpr auto rule = [] {
- // Arbitrary code points that aren't control characters.
- auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control;
-
- return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c) | lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'\''>))(c);
- }();
-
static constexpr auto value =
- lexy::as_string<std::string> >>
- callback<ast::StringValue*>(
- [](ast::ParseState& state, const char* begin, const std::string& str, const char* end) {
- auto value = state.ast().intern(str.data(), str.length());
- return state.ast().create<ast::StringValue>(begin, end, value);
+ callback<ast::IdentifierValue*>(
+ [](detail::IsParseState auto& state, auto lexeme) {
+ auto value = state.ast().intern(lexeme.data(), lexeme.size());
+ return state.ast().template create<ast::IdentifierValue>(lexeme.begin(), lexeme.end(), value);
});
};
- template<ParseOptions Options>
+	/// Production for a Lua string literal delimited by double or single quotes.
+	///
+	/// The rule peeks for an opening quote and then hands over to `scan`, which parses the
+	/// quoted text, interns it in the AST and creates an `ast::StringValue` spanning the literal.
+	struct String : lexy::scan_production<ast::StringValue*>,
+					lexy::token_production {
+		/// Parses one quoted string at the scanner's current position.
+		/// Returns `lexy::scan_failed` if the scanner is in a failed state or no string was produced.
+		template<typename Context, typename Reader>
+		static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) {
+			using encoding = typename Reader::encoding;
+
+			// Characters allowed inside the quotes depend on the input encoding.
+			constexpr auto c = [] {
+				if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+					// Arbitrary code points that aren't control characters.
+					return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control;
+				} else {
+					return -lexy::dsl::unicode::control;
+				}
+			}();
+			auto rule = lexy::dsl::quoted(c) | lexy::dsl::single_quoted(c);
+			auto begin = scanner.position();
+			lexy::scan_result<std::string> str_result;
+			scanner.parse(str_result, rule);
+			if (!scanner || !str_result)
+				return lexy::scan_failed;
+			auto end = scanner.position();
+			// Bind by reference: the text is only read while interning, so copying the string is unnecessary.
+			const std::string& str = str_result.value();
+			auto value = state.ast().intern(str.data(), str.size());
+			return state.ast().template create<ast::StringValue>(begin, end, value);
+		}
+
+		static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open() | lexy::dsl::single_quoted.open()) >> lexy::dsl::scan;
+		static constexpr auto value = ovdl::v2script::grammar::convert_as_string<std::string> >> lexy::forward<ast::StringValue*>;
+	};
+
struct Expression {
- static constexpr auto rule = lexy::dsl::p<Value<Options>> | lexy::dsl::p<String<Options>>;
+ static constexpr auto rule = lexy::dsl::p<Value> | lexy::dsl::p<String>;
static constexpr auto value = lexy::forward<ast::Value*>;
};
- template<ParseOptions Options>
struct AssignmentStatement {
- static constexpr auto rule =
- dsl::p<Identifier<Options>> >>
- lexy::dsl::equal_sign >>
- (lexy::dsl::p<Expression<Options>> | lexy::dsl::recurse_branch<StatementListBlock<Options>>);
+ static constexpr auto rule = [] {
+ auto right_brace = lexy::dsl::lit_c<'}'>;
+
+ auto expression = lexy::dsl::p<Expression>;
+ auto statement_list = lexy::dsl::recurse_branch<StatementListBlock>;
+
+ auto rhs_recover = lexy::dsl::recover(expression, statement_list).limit(right_brace);
+ auto rhs_try = lexy::dsl::try_(expression | statement_list, rhs_recover);
+
+ auto identifier = dsl::p<Identifier> >> lexy::dsl::equal_sign + rhs_try;
+
+ auto recover = lexy::dsl::recover(identifier).limit(right_brace);
+ return lexy::dsl::try_(identifier, recover);
+ }();
static constexpr auto value = callback<ast::AssignStatement*>(
- [](ast::ParseState& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) {
- return state.ast().create<ast::AssignStatement>(pos, name, initializer);
+ [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) -> ast::AssignStatement* {
+ if (initializer == nullptr) return nullptr;
+ return state.ast().template create<ast::AssignStatement>(pos, name, initializer);
+ },
+ [](detail::IsParseState auto& state, ast::Value*) {
+ return nullptr;
+ },
+ [](detail::IsParseState auto& state) {
+ return nullptr;
});
};
- template<ParseOptions Options>
struct StatementListBlock {
- static constexpr auto rule =
- dsl::curly_bracketed(
- lexy::dsl::opt(
- lexy::dsl::list(
- lexy::dsl::recurse_branch<AssignmentStatement<Options>>,
- lexy::dsl::trailing_sep(lexy::dsl::lit_c<','>))));
+ static constexpr auto rule = [] {
+ auto right_brace = lexy::dsl::lit_c<'}'>;
+ auto comma = lexy::dsl::lit_c<','>;
+
+ auto assign_statement = lexy::dsl::recurse_branch<AssignmentStatement>;
+ auto assign_try = lexy::dsl::try_(assign_statement);
+
+ auto curly_bracket = dsl::curly_bracketed.opt_list(
+ assign_try,
+ lexy::dsl::trailing_sep(comma));
+
+ return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace));
+ }();
static constexpr auto value =
lexy::as_list<ast::AssignStatementList> >> construct_list<ast::ListValue>;
};
- template<ParseOptions Options = ParseOptions {}>
struct File {
// Allow arbitrary spaces between individual tokens.
static constexpr auto whitespace = ovdl::v2script::grammar::whitespace_specifier | comment_specifier;
- static constexpr auto rule = lexy::dsl::position + lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p<AssignmentStatement<Options>>);
+ static constexpr auto rule = lexy::dsl::position + lexy::dsl::terminator(lexy::dsl::eof).opt_list(lexy::dsl::p<AssignmentStatement>);
static constexpr auto value = lexy::as_list<ast::AssignStatementList> >> construct<ast::FileTree>;
};
diff --git a/src/openvic-dataloader/v2script/ModifierGrammar.hpp b/src/openvic-dataloader/v2script/ModifierGrammar.hpp
index 22592d4..122a8c7 100644
--- a/src/openvic-dataloader/v2script/ModifierGrammar.hpp
+++ b/src/openvic-dataloader/v2script/ModifierGrammar.hpp
@@ -10,9 +10,9 @@
#include "openvic-dataloader/NodeLocation.hpp"
-#include "ParseState.hpp"
#include "SimpleGrammar.hpp"
#include "TriggerGrammar.hpp"
+#include "detail/InternalConcepts.hpp"
#include "detail/dsl.hpp"
namespace ovdl::v2script::grammar {
@@ -22,9 +22,9 @@ namespace ovdl::v2script::grammar {
struct FactorStatement {
static constexpr auto rule = lexy::dsl::position(factor_keyword) >> (lexy::dsl::equal_sign + lexy::dsl::p<Identifier<StringEscapeOption>>);
static constexpr auto value = dsl::callback<ast::AssignStatement*>(
- [](ast::ParseState& state, NodeLocation loc, ast::IdentifierValue* value) {
- auto* factor = state.ast().create<ast::IdentifierValue>(loc, state.ast().intern("factor"));
- return state.ast().create<ast::AssignStatement>(loc, factor, value);
+ [](detail::IsParseState auto& state, NodeLocation loc, ast::IdentifierValue* value) {
+ auto* factor = state.ast().template create<ast::IdentifierValue>(loc, state.ast().intern("factor"));
+ return state.ast().template create<ast::AssignStatement>(loc, factor, value);
});
};
@@ -49,9 +49,9 @@ namespace ovdl::v2script::grammar {
lexy::dsl::position(modifier_keyword) >> lexy::dsl::equal_sign >> lexy::dsl::p<ModifierList>;
static constexpr auto value = dsl::callback<ast::AssignStatement*>(
- [](ast::ParseState& state, NodeLocation loc, ast::ListValue* list) {
- auto* factor = state.ast().create<ast::IdentifierValue>(loc, state.ast().intern("modifier"));
- return state.ast().create<ast::AssignStatement>(loc, factor, list);
+ [](detail::IsParseState auto& state, NodeLocation loc, ast::ListValue* list) {
+ auto* factor = state.ast().template create<ast::IdentifierValue>(loc, state.ast().intern("modifier"));
+ return state.ast().template create<ast::AssignStatement>(loc, factor, list);
});
};
} \ No newline at end of file
diff --git a/src/openvic-dataloader/v2script/ParseState.hpp b/src/openvic-dataloader/v2script/ParseState.hpp
index 8e29bf5..954e39d 100644
--- a/src/openvic-dataloader/v2script/ParseState.hpp
+++ b/src/openvic-dataloader/v2script/ParseState.hpp
@@ -1,23 +1,24 @@
#pragma once
-#include <openvic-dataloader/File.hpp>
-#include <openvic-dataloader/ParseState.hpp>
#include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp>
#include <lexy/encoding.hpp>
+#include "../openvic-dataloader/ParseState.hpp"
+#include "AbstractSyntaxTree.hpp"
+#include "File.hpp"
+#include "detail/InternalConcepts.hpp"
+
namespace ovdl::v2script::ast {
- using File = ovdl::BasicFile<lexy::default_encoding, Node>;
- struct AbstractSyntaxTree : ovdl::BasicAbstractSyntaxTree<File, FileTree> {
- using BasicAbstractSyntaxTree::BasicAbstractSyntaxTree;
+
+ struct FileAbstractSyntaxTree : ovdl::BasicAbstractSyntaxTree<ovdl::BasicFile<Node>, FileTree> {
+ using ovdl::BasicAbstractSyntaxTree<ovdl::BasicFile<Node>, FileTree>::BasicAbstractSyntaxTree;
std::string make_list_visualizer() const;
std::string make_native_visualizer() const;
};
- using ParseState = ovdl::ParseState<AbstractSyntaxTree>;
+ using ParseState = ovdl::ParseState<FileAbstractSyntaxTree>;
- static_assert(IsFile<ast::File>, "File failed IsFile concept");
- static_assert(IsAst<ast::AbstractSyntaxTree>, "AbstractSyntaxTree failed IsAst concept");
- static_assert(IsParseState<ast::ParseState>, "ParseState failed IsParseState concept");
+ static_assert(detail::IsParseState<ast::ParseState>, "ParseState failed IsParseState concept");
} \ No newline at end of file
diff --git a/src/openvic-dataloader/v2script/Parser.cpp b/src/openvic-dataloader/v2script/Parser.cpp
index eb491d5..23dada7 100644
--- a/src/openvic-dataloader/v2script/Parser.cpp
+++ b/src/openvic-dataloader/v2script/Parser.cpp
@@ -4,16 +4,15 @@
#include <iostream>
#include <optional>
#include <string>
+#include <type_traits>
#include <utility>
-#include <openvic-dataloader/DiagnosticLogger.hpp>
+#include <openvic-dataloader/Error.hpp>
#include <openvic-dataloader/NodeLocation.hpp>
-#include <openvic-dataloader/ParseError.hpp>
-#include <openvic-dataloader/ParseWarning.hpp>
-#include <openvic-dataloader/detail/LexyReportError.hpp>
+#include <openvic-dataloader/detail/Concepts.hpp>
+#include <openvic-dataloader/detail/Encoding.hpp>
#include <openvic-dataloader/detail/OStreamOutputIterator.hpp>
-#include <openvic-dataloader/detail/utility/Concepts.hpp>
-#include <openvic-dataloader/detail/utility/Utility.hpp>
+#include <openvic-dataloader/detail/Utility.hpp>
#include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp>
#include <lexy/action/parse.hpp>
@@ -29,10 +28,8 @@
#include <fmt/core.h>
-#include "openvic-dataloader/Error.hpp"
-
+#include "DiagnosticLogger.hpp"
#include "ParseState.hpp"
-#include "detail/DetectUtf8.hpp"
#include "detail/NullBuff.hpp"
#include "detail/ParseHandler.hpp"
#include "detail/Warnings.hpp"
@@ -44,29 +41,46 @@
using namespace ovdl;
using namespace ovdl::v2script;
-/// BufferHandler ///
+/// ParseHandler ///
struct Parser::ParseHandler final : detail::BasicStateParseHandler<v2script::ast::ParseState> {
- constexpr bool is_exclusive_utf8() const {
- return detail::is_utf8_no_ascii(buffer());
- }
-
template<typename Node>
std::optional<DiagnosticLogger::error_range> parse() {
- auto result = lexy::parse<Node>(buffer(), *_parse_state, _parse_state->logger().error_callback());
+ if (parse_state().encoding() == ovdl::detail::Encoding::Utf8) {
+ parse_state().logger().warning(warnings::make_utf8_warning(path()));
+ }
+
+ auto result = [&] {
+ switch (parse_state().encoding()) {
+ using enum detail::Encoding;
+ case Ascii:
+ case Utf8:
+ return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback());
+ case Unknown:
+ case Windows1251:
+ case Windows1252:
+ return lexy::parse<Node>(buffer<lexy::default_encoding>(), parse_state(), parse_state().logger().error_callback());
+ default:
+ ovdl::detail::unreachable();
+ }
+ }();
if (!result) {
- return _parse_state->logger().get_errors();
+ return parse_state().logger().get_errors();
}
- _parse_state->ast().set_root(result.value());
+ parse_state().ast().set_root(result.value());
return std::nullopt;
}
ast::FileTree* root() {
- return _parse_state->ast().root();
+ return parse_state().ast().root();
+ }
+
+	/// Returns the errors collected by this handler's diagnostic logger.
+	Parser::error_range get_errors() {
+		auto&& logger = parse_state().logger();
+		return logger.get_errors();
+	}
};
-/// BufferHandler ///
+/// ParseHandler ///
Parser::Parser()
: _parse_handler(std::make_unique<ParseHandler>()) {
@@ -82,29 +96,29 @@ Parser::Parser(Parser&&) = default;
Parser& Parser::operator=(Parser&&) = default;
Parser::~Parser() = default;
-Parser Parser::from_buffer(const char* data, std::size_t size) {
+Parser Parser::from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_buffer(data, size));
+ return std::move(result.load_from_buffer(data, size, encoding_fallback));
}
-Parser Parser::from_buffer(const char* start, const char* end) {
+Parser Parser::from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_buffer(start, end));
+ return std::move(result.load_from_buffer(start, end, encoding_fallback));
}
-Parser Parser::from_string(const std::string_view string) {
+Parser Parser::from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_string(string));
+ return std::move(result.load_from_string(string, encoding_fallback));
}
-Parser Parser::from_file(const char* path) {
+Parser Parser::from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_file(path));
+ return std::move(result.load_from_file(path, encoding_fallback));
}
-Parser Parser::from_file(const std::filesystem::path& path) {
+Parser Parser::from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_file(path));
+ return std::move(result.load_from_file(path, encoding_fallback));
}
///
@@ -128,38 +142,38 @@ constexpr void Parser::_run_load_func(detail::LoadCallback<Parser::ParseHandler*
if (!error_message.empty()) {
_has_error = true;
_has_fatal_error = true;
- _parse_handler->parse_state().logger().create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message));
+ _parse_handler->parse_state().logger().template create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message));
}
if (has_error() && &_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream.get());
}
}
-constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size) {
+constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) {
// Type can't be deduced?
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size);
+ _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback);
return *this;
}
-constexpr Parser& Parser::load_from_buffer(const char* start, const char* end) {
+constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) {
// Type can't be deduced?
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end);
+ _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback);
return *this;
}
-constexpr Parser& Parser::load_from_string(const std::string_view string) {
- return load_from_buffer(string.data(), string.size());
+constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) {
+ return load_from_buffer(string.data(), string.size(), encoding_fallback);
}
-Parser& Parser::load_from_file(const char* path) {
+Parser& Parser::load_from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) {
set_file_path(path);
// Type can be deduced??
- _run_load_func(std::mem_fn(&ParseHandler::load_file), path);
+ _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback);
return *this;
}
-Parser& Parser::load_from_file(const std::filesystem::path& path) {
- return load_from_file(path.string().c_str());
+Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) {
+ return load_from_file(path.string().c_str(), encoding_fallback);
}
/* REQUIREMENTS:
@@ -173,11 +187,7 @@ bool Parser::simple_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<grammar::File<grammar::NoStringEscapeOption>>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::File>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
@@ -196,14 +206,11 @@ bool Parser::event_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<grammar::EventFile>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::EventFile>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
@@ -218,14 +225,11 @@ bool Parser::decision_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<grammar::DecisionFile>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::DecisionFile>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
@@ -240,14 +244,11 @@ bool Parser::lua_defines_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<lua::grammar::File<>>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<lua::grammar::File>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
@@ -273,48 +274,66 @@ std::string Parser::make_list_string() const {
return _parse_handler->parse_state().ast().make_list_visualizer();
}
+// TODO: Remove reinterpret_cast
+// WARNING: This almost certainly breaks on utf16 and utf32 encodings, luckily we don't parse in that format
+// This is purely to silence the node_location errors because char8_t is useless
+#define REINTERPRET_IT(IT) reinterpret_cast<const std::decay_t<decltype(buffer)>::encoding::char_type*>((IT))
+
const FilePosition Parser::get_position(const ast::Node* node) const {
if (!node || !node->is_linked_in_tree()) {
return {};
}
- auto node_location = _parse_handler->parse_state().ast().location_of(node);
+
+ NodeLocation node_location;
+
+ node_location = _parse_handler->parse_state().ast().location_of(node);
+
if (node_location.is_synthesized()) {
- return {};
+ return FilePosition {};
}
- auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), node_location.begin());
- FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
- if (node_location.begin() < node_location.end()) {
- auto loc_end = lexy::get_input_location(_parse_handler->buffer(), node_location.end(), loc_begin.anchor());
- result.end_line = loc_end.line_nr();
- result.end_column = loc_end.column_nr();
- }
- return result;
+ return _parse_handler->parse_state().ast().file().visit_buffer(
+ [&](auto&& buffer) -> FilePosition {
+ auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.begin()));
+ FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
+ if (node_location.begin() < node_location.end()) {
+ auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.end()), loc_begin.anchor());
+ result.end_line = loc_end.line_nr();
+ result.end_column = loc_end.column_nr();
+ }
+ return result;
+ });
}
Parser::error_range Parser::get_errors() const {
- return _parse_handler->parse_state().logger().get_errors();
+ return _parse_handler->get_errors();
}
const FilePosition Parser::get_error_position(const error::Error* error) const {
if (!error || !error->is_linked_in_tree()) {
return {};
}
+
auto err_location = _parse_handler->parse_state().logger().location_of(error);
if (err_location.is_synthesized()) {
- return {};
+ return FilePosition {};
}
- auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin());
- FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
- if (err_location.begin() < err_location.end()) {
- auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor());
- result.end_line = loc_end.line_nr();
- result.end_column = loc_end.column_nr();
- }
- return result;
+ return _parse_handler->parse_state().ast().file().visit_buffer(
+ [&](auto&& buffer) -> FilePosition {
+ auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin()));
+ FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
+ if (err_location.begin() < err_location.end()) {
+ auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor());
+ result.end_line = loc_end.line_nr();
+ result.end_column = loc_end.column_nr();
+ }
+ return result;
+ });
}
+#undef REINTERPRET_IT
+
void Parser::print_errors_to(std::basic_ostream<char>& stream) const {
auto errors = get_errors();
if (errors.empty()) return;
@@ -324,19 +343,9 @@ void Parser::print_errors_to(std::basic_ostream<char>& stream) const {
[&](const error::BufferError* buffer_error) {
stream << "buffer error: " << buffer_error->message() << '\n';
},
- [&](const error::ParseError* parse_error) {
- auto position = get_error_position(parse_error);
- std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column);
- stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n';
- },
- [&](dryad::child_visitor<error::ErrorKind> visitor, const error::Semantic* semantic) {
- auto position = get_error_position(semantic);
- std::string pos_str = ": ";
- if (!position.is_empty()) {
- pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column);
- }
- stream << _file_path << pos_str << semantic->message() << '\n';
- auto annotations = semantic->annotations();
+ [&](dryad::child_visitor<error::ErrorKind> visitor, const error::AnnotatedError* annotated_error) {
+ stream << annotated_error->message() << '\n';
+ auto annotations = annotated_error->annotations();
for (auto annotation : annotations) {
visitor(annotation);
}
diff --git a/src/openvic-dataloader/v2script/SimpleGrammar.hpp b/src/openvic-dataloader/v2script/SimpleGrammar.hpp
index 37e295f..d42ce07 100644
--- a/src/openvic-dataloader/v2script/SimpleGrammar.hpp
+++ b/src/openvic-dataloader/v2script/SimpleGrammar.hpp
@@ -5,10 +5,22 @@
#include <lexy/callback.hpp>
#include <lexy/dsl.hpp>
+#include <lexy/dsl/any.hpp>
#include <lexy/dsl/identifier.hpp>
+#include <lexy/dsl/option.hpp>
+#include <lexy/dsl/peek.hpp>
+#include <lexy/dsl/punctuator.hpp>
+#include <lexy/dsl/recover.hpp>
+#include <lexy/dsl/scan.hpp>
#include <lexy/dsl/symbol.hpp>
-
-#include "ParseState.hpp"
+#include <lexy/dsl/unicode.hpp>
+#include <lexy/encoding.hpp>
+#include <lexy/input/base.hpp>
+#include <lexy/input/buffer.hpp>
+#include <lexy/lexeme.hpp>
+
+#include "detail/Convert.hpp"
+#include "detail/InternalConcepts.hpp"
#include "detail/dsl.hpp"
// Grammar Definitions //
@@ -23,17 +35,28 @@
*/
namespace ovdl::v2script::grammar {
template<typename T>
- constexpr auto construct = dsl::construct<ast::ParseState, T>;
+ constexpr auto construct = dsl::construct<T>;
template<typename T, bool DisableEmpty = false, typename ListType = ast::AssignStatementList>
- constexpr auto construct_list = dsl::construct_list<ast::ParseState, T, ListType, DisableEmpty>;
+ constexpr auto construct_list = dsl::construct_list<T, ListType, DisableEmpty>;
+
+ struct ConvertErrorHandler {
+ static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) {
+ state.logger().warning("invalid character value '{}' found.", static_cast<int>(reader.peek())) //
+ .primary(BasicNodeLocation { reader.position() }, "here")
+ .finish();
+ }
+ };
+
+ template<typename String>
+ constexpr auto convert_as_string = convert::convert_as_string<String, ConvertErrorHandler>;
struct ParseOptions {
/// @brief Makes string parsing avoid string escapes
bool NoStringEscape;
};
- static constexpr ParseOptions NoStringEscapeOption = ParseOptions { true };
- static constexpr ParseOptions StringEscapeOption = ParseOptions { false };
+ static constexpr auto NoStringEscapeOption = ParseOptions { true };
+ static constexpr auto StringEscapeOption = ParseOptions { false };
/* REQUIREMENTS: DAT-630 */
static constexpr auto whitespace_specifier = lexy::dsl::ascii::blank / lexy::dsl::ascii::newline;
@@ -50,24 +73,28 @@ namespace ovdl::v2script::grammar {
ascii /
lexy::dsl::lit_b<0x8A> / lexy::dsl::lit_b<0x8C> / lexy::dsl::lit_b<0x8E> /
lexy::dsl::lit_b<0x92> / lexy::dsl::lit_b<0x97> / lexy::dsl::lit_b<0x9A> / lexy::dsl::lit_b<0x9C> /
- dsl::make_range<0x9E, 0x9F>() /
- dsl::make_range<0xC0, 0xD6>() /
- dsl::make_range<0xD8, 0xF6>() /
- dsl::make_range<0xF8, 0xFF>();
+ dsl::lit_b_range<0x9E, 0x9F> /
+ dsl::lit_b_range<0xC0, 0xD6> /
+ dsl::lit_b_range<0xD8, 0xF6> /
+ dsl::lit_b_range<0xF8, 0xFF>;
static constexpr auto windows_1251_data_specifier_additions =
- dsl::make_range<0x80, 0x81>() / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
+ dsl::lit_b_range<0x80, 0x81> / lexy::dsl::lit_b<0x83> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> /
lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D> / lexy::dsl::lit_b<0x9F> /
- dsl::make_range<0xA1, 0xA3>() / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> /
+ dsl::lit_b_range<0xA1, 0xA3> / lexy::dsl::lit_b<0xA5> / lexy::dsl::lit_b<0xA8> / lexy::dsl::lit_b<0xAA> /
lexy::dsl::lit_b<0xAF> /
- dsl::make_range<0xB2, 0xB4>() / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> /
- dsl::make_range<0xBC, 0xBF>() /
+ dsl::lit_b_range<0xB2, 0xB4> / lexy::dsl::lit_b<0xB8> / lexy::dsl::lit_b<0xBA> /
+ dsl::lit_b_range<0xBC, 0xBF> /
lexy::dsl::lit_b<0xD7> / lexy::dsl::lit_b<0xF7>;
static constexpr auto data_specifier = windows_1252_data_specifier / windows_1251_data_specifier_additions;
static constexpr auto data_char_class = LEXY_CHAR_CLASS("DataSpecifier", data_specifier);
+ static constexpr auto utf_data_specifier = lexy::dsl::unicode::xid_continue / LEXY_ASCII_ONE_OF("+:@%&'-.");
+
+ static constexpr auto utf_char_class = LEXY_CHAR_CLASS("DataSpecifier", utf_data_specifier);
+
static constexpr auto escaped_symbols = lexy::symbol_table<char> //
.map<'"'>('"')
.map<'\''>('\'')
@@ -79,50 +106,121 @@ namespace ovdl::v2script::grammar {
.map<'r'>('\r')
.map<'t'>('\t');
- static constexpr auto id = lexy::dsl::identifier(data_char_class);
+ static constexpr auto id = lexy::dsl::identifier(ascii);
template<ParseOptions Options>
struct SimpleGrammar {
struct StatementListBlock;
- struct Identifier {
- static constexpr auto rule = lexy::dsl::identifier(data_char_class);
- static constexpr auto value = dsl::callback<ast::IdentifierValue*>(
- [](ast::ParseState& state, auto lexeme) {
- auto value = state.ast().intern(lexeme.data(), lexeme.size());
- return state.ast().create<ast::IdentifierValue>(ovdl::NodeLocation::make_from(lexeme.begin(), lexeme.end()), value);
- });
+ struct Identifier : lexy::scan_production<ast::IdentifierValue*>,
+ lexy::token_production {
+
+ template<typename Context, typename Reader>
+ static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) {
+ using encoding = typename Reader::encoding;
+ using char_type = typename encoding::char_type;
+
+ std::basic_string<char_type> value_result;
+
+ auto content_begin = scanner.position();
+ do {
+ if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+ if (lexy::scan_result<lexy::lexeme<Reader>> ascii_result; scanner.branch(ascii_result, lexy::dsl::identifier(ascii))) {
+ value_result.append(ascii_result.value().begin(), ascii_result.value().end());
+ continue;
+ }
+
+ char_type char_array[] { *scanner.position(), char_type {} };
+ auto input = lexy::range_input(&char_array[0], &char_array[1]);
+ auto reader = input.reader();
+ convert::map_value val = convert::try_parse_map(state.encoding(), reader);
+
+ if (val.is_invalid()) {
+ ConvertErrorHandler::on_invalid_character(state, reader);
+ continue;
+ }
+
+ if (!val.is_pass()) {
+ // non-pass characters are not valid ascii and are mapped to utf8 values
+ value_result.append(val._value);
+ scanner.parse(data_char_class);
+ } else {
+ break;
+ }
+ } else {
+ auto lexeme_result = scanner.template parse<lexy::lexeme<Reader>>(lexy::dsl::identifier(utf_char_class));
+ if (lexeme_result) {
+ value_result.append(lexeme_result.value().begin(), lexeme_result.value().size());
+ break;
+ }
+ }
+ } while (scanner);
+ auto content_end = scanner.position();
+
+ if (value_result.empty()) {
+ return lexy::scan_failed;
+ }
+
+ auto value = state.ast().intern(value_result);
+ return state.ast().template create<ast::IdentifierValue>(ovdl::NodeLocation::make_from(content_begin, content_end), value);
+ }
+
+ static constexpr auto rule = dsl::peek(data_char_class, utf_char_class) >> lexy::dsl::scan;
};
/* REQUIREMENTS:
* DAT-633
* DAT-634
*/
- struct StringExpression {
- static constexpr auto rule = [] {
- if constexpr (Options.NoStringEscape) {
- auto c = dsl::make_range<0x20, 0xFF>() / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>;
- return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c);
- } else {
- // Arbitrary code points that aren't control characters.
- auto c = dsl::make_range<0x20, 0xFF>() - lexy::dsl::ascii::control;
-
- // Escape sequences start with a backlash.
- // They either map one of the symbols,
- // or a Unicode code point of the form uXXXX.
- auto escape = lexy::dsl::backslash_escape //
- .symbol<escaped_symbols>();
- return lexy::dsl::delimited(lexy::dsl::position(lexy::dsl::lit_b<'"'>))(c, escape);
- }
- }();
-
- static constexpr auto value =
- lexy::as_string<std::string> >>
- dsl::callback<ast::StringValue*>(
- [](ast::ParseState& state, const char* begin, auto&& str, const char* end) {
- auto value = state.ast().intern(str.data(), str.length());
- return state.ast().create<ast::StringValue>(ovdl::NodeLocation::make_from(begin, end), value);
- });
+ struct StringExpression : lexy::scan_production<ast::StringValue*>,
+ lexy::token_production {
+
+ template<typename Context, typename Reader>
+ static constexpr scan_result scan(lexy::rule_scanner<Context, Reader>& scanner, detail::IsParseState auto& state) {
+ using encoding = typename Reader::encoding;
+
+ constexpr auto rule = [] {
+ if constexpr (Options.NoStringEscape) {
+ auto c = [] {
+ if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+ return dsl::lit_b_range<0x20, 0xFF> / lexy::dsl::lit_b<0x07> / lexy::dsl::lit_b<0x09> / lexy::dsl::lit_b<0x0A> / lexy::dsl::lit_b<0x0D>;
+ } else {
+ return -lexy::dsl::unicode::control;
+ }
+ }();
+ return lexy::dsl::quoted(c);
+ } else {
+ // Arbitrary code points that aren't control characters.
+ auto c = [] {
+ if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) {
+ return dsl::lit_b_range<0x20, 0xFF> - lexy::dsl::ascii::control;
+ } else {
+ return -lexy::dsl::unicode::control;
+ }
+ }();
+
+ // Escape sequences start with a backslash.
+ // They either map one of the symbols,
+ // or a Unicode code point of the form uXXXX.
+ auto escape = lexy::dsl::backslash_escape //
+ .symbol<escaped_symbols>();
+ return lexy::dsl::quoted(c, escape);
+ }
+ }();
+
+ auto begin = scanner.position();
+ lexy::scan_result<std::string> str_result;
+ scanner.parse(str_result, rule);
+ if (!scanner || !str_result)
+ return lexy::scan_failed;
+ auto end = scanner.position();
+ auto str = str_result.value();
+ auto value = state.ast().intern(str.data(), str.size());
+ return state.ast().template create<ast::StringValue>(ovdl::NodeLocation::make_from(begin, end), value);
+ }
+
+ static constexpr auto rule = lexy::dsl::peek(lexy::dsl::quoted.open()) >> lexy::dsl::scan;
+ static constexpr auto value = convert_as_string<std::string> >> lexy::forward<ast::StringValue*>;
};
/* REQUIREMENTS: DAT-638 */
@@ -132,59 +230,112 @@ namespace ovdl::v2script::grammar {
};
struct SimpleAssignmentStatement {
- static constexpr auto rule =
- dsl::p<Identifier> >>
- (lexy::dsl::equal_sign >>
- (lexy::dsl::p<ValueExpression> | lexy::dsl::recurse_branch<StatementListBlock>));
+ static constexpr auto rule = [] {
+ auto right_brace = lexy::dsl::lit_c<'}'>;
+
+ auto value_expression = lexy::dsl::p<ValueExpression>;
+ auto statement_list_expression = lexy::dsl::recurse_branch<StatementListBlock>;
+
+ auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace);
+ auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover);
+
+ auto identifier =
+ dsl::p<Identifier> >>
+ (lexy::dsl::equal_sign >> rhs_try);
+
+ auto recover = lexy::dsl::recover(identifier).limit(right_brace);
+ return lexy::dsl::try_(identifier, recover);
+ }();
static constexpr auto value = construct<ast::AssignStatement>;
};
/* REQUIREMENTS: DAT-639 */
struct AssignmentStatement {
- static constexpr auto rule =
- dsl::p<Identifier> >>
+ static constexpr auto rule = [] {
+ auto right_brace = lexy::dsl::lit_c<'}'>;
+
+ auto value_expression = lexy::dsl::p<ValueExpression>;
+ auto statement_list_expression = lexy::dsl::recurse_branch<StatementListBlock>;
+
+ auto rhs_recover = lexy::dsl::recover(value_expression, statement_list_expression).limit(right_brace);
+ auto rhs_try = lexy::dsl::try_(value_expression | statement_list_expression, rhs_recover);
+
+ auto identifier =
+ dsl::p<Identifier> >>
(lexy::dsl::equal_sign >>
- (lexy::dsl::p<ValueExpression> | lexy::dsl::recurse_branch<StatementListBlock>) |
- lexy::dsl::else_ >> lexy::dsl::return_) |
- dsl::p<StringExpression> |
- lexy::dsl::recurse_branch<StatementListBlock>;
+ rhs_try |
+ lexy::dsl::else_ >> lexy::dsl::return_);
+
+ auto string_expression = dsl::p<StringExpression>;
+ auto statement_list = lexy::dsl::recurse_branch<StatementListBlock>;
+
+ return identifier | string_expression | statement_list;
+ }();
static constexpr auto value = dsl::callback<ast::Statement*>(
- [](ast::ParseState& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) {
- return state.ast().create<ast::AssignStatement>(pos, name, initializer);
+ [](detail::IsParseState auto& state, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) {
+ return state.ast().template create<ast::AssignStatement>(pos, name, initializer);
},
- [](ast::ParseState& state, const char* pos, ast::Value* left, lexy::nullopt = {}) {
- return state.ast().create<ast::ValueStatement>(pos, left);
+ [](detail::IsParseState auto& state, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) {
+ return state.ast().template create<ast::AssignStatement>(pos, name, initializer);
},
- [](ast::ParseState& state, ast::Value* left) {
- return state.ast().create<ast::ValueStatement>(state.ast().location_of(left), left);
+ [](detail::IsParseState auto& state, bool&, bool&, const char* pos, ast::IdentifierValue* name, ast::Value* initializer) {
+ return state.ast().template create<ast::AssignStatement>(pos, name, initializer);
+ },
+ [](detail::IsParseState auto& state, bool&, bool&, const char* pos, ast::Value* name) {
+ return state.ast().template create<ast::ValueStatement>(pos, name);
+ },
+ [](detail::IsParseState auto& state, const char* pos, ast::Value* left, lexy::nullopt = {}) {
+ return state.ast().template create<ast::ValueStatement>(pos, left);
+ },
+ [](detail::IsParseState auto& state, bool&, const char* pos, ast::Value* left, lexy::nullopt = {}) {
+ return state.ast().template create<ast::ValueStatement>(pos, left);
+ },
+ [](detail::IsParseState auto& state, ast::Value* left) -> ast::ValueStatement* {
+ if (left == nullptr) return nullptr;
+ return state.ast().template create<ast::ValueStatement>(state.ast().location_of(left), left);
+ },
+ [](detail::IsParseState auto& state, bool&, ast::Value* left) -> ast::ValueStatement* {
+ if (left == nullptr) return nullptr;
+ return state.ast().template create<ast::ValueStatement>(state.ast().location_of(left), left);
});
};
/* REQUIREMENTS: DAT-640 */
struct StatementListBlock {
- static constexpr auto rule =
- dsl::curly_bracketed(
- (lexy::dsl::opt(lexy::dsl::list(lexy::dsl::recurse_branch<AssignmentStatement>)) +
- lexy::dsl::opt(lexy::dsl::semicolon)));
+ static constexpr auto rule = [] {
+ auto right_brace = lexy::dsl::lit_c<'}'>;
+
+ auto assign_statement = lexy::dsl::recurse_branch<AssignmentStatement>;
+
+ auto assign_try = lexy::dsl::try_(assign_statement);
+ auto assign_opt = lexy::dsl::opt(lexy::dsl::list(assign_try));
+
+ auto curly_bracket = dsl::curly_bracketed(assign_opt + lexy::dsl::opt(lexy::dsl::semicolon));
+
+ return lexy::dsl::try_(curly_bracket, lexy::dsl::find(right_brace));
+ }();
static constexpr auto value =
lexy::as_list<ast::StatementList> >>
dsl::callback<ast::ListValue*>(
- [](ast::ParseState& state, const char* begin, auto&& list, const char* end) {
+ [](detail::IsParseState auto& state, const char* begin, auto&& list, const char* end) {
if constexpr (std::is_same_v<std::decay_t<decltype(list)>, lexy::nullopt>) {
- return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end));
+ return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end));
} else {
- return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list));
+ return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list));
}
},
- [](ast::ParseState& state, const char* begin, auto&& list, auto&& semicolon, const char* end) {
+ [](detail::IsParseState auto& state, const char* begin, auto&& list, auto&& semicolon, const char* end) {
if constexpr (std::is_same_v<std::decay_t<decltype(list)>, lexy::nullopt>) {
- return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end));
+ return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end));
} else {
- return state.ast().create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list));
+ return state.ast().template create<ast::ListValue>(ovdl::NodeLocation::make_from(begin, end), LEXY_MOV(list));
}
+ },
+ [](detail::IsParseState auto& state, lexy::nullopt fail = {}) {
+ return fail;
});
};
};
@@ -198,22 +349,20 @@ namespace ovdl::v2script::grammar {
template<ParseOptions Options>
using SAssignStatement = typename SimpleGrammar<Options>::SimpleAssignmentStatement;
- template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::ParseState, ast::IdentifierValue, Keyword>>
+ template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::IdentifierValue, Keyword>>
using keyword_rule = dsl::keyword_rule<
- ast::ParseState,
id,
ast::AssignStatement,
Keyword, Production, Value>;
- template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::ParseState, ast::IdentifierValue, Keyword>>
+ template<ovdl::detail::string_literal Keyword, auto Production, auto Value = dsl::default_kw_value<ast::IdentifierValue, Keyword>>
using fkeyword_rule = dsl::fkeyword_rule<
- ast::ParseState,
id,
ast::AssignStatement,
Keyword, Production, Value>;
template<ParseOptions Options>
- struct File {
+ struct BasicFile {
// Allow arbitrary spaces between individual tokens.
static constexpr auto whitespace = whitespace_specifier | comment_specifier;
@@ -223,4 +372,6 @@ namespace ovdl::v2script::grammar {
static constexpr auto value = lexy::as_list<ast::StatementList> >> construct<ast::FileTree>;
};
+
+ using File = BasicFile<NoStringEscapeOption>;
}