aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader/v2script/Parser.cpp
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com> 2024-05-09 16:06:02 +0200
committer Spartan322 <Megacake1234@gmail.com> 2024-06-18 01:31:12 +0200
commit b0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch)
tree f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/v2script/Parser.cpp
parent 7b521d6023113372cf6b02e562828273c4040f0e (diff)
Add runtime encoding detection and conversion (branch: fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
- Add manually-specified encoding fallback
- Add default system encoding fallback
- Add error recovery to v2script
- Add unknown encoding detection warning
- Remove csv::Parser templating
- Fix lua files dropping data
- Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
- Remove exclusive reliance on lexy::default_encoding for v2script
- Move internal concepts to src/openvic-detail/InternalConcepts.hpp
- Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
- Move openvic-dataloader/AbstractSyntaxTree.hpp to src
- Move DiagnosticLogger.hpp to src
- Move File.hpp to src
- Move openvic-dataloader/detail/utility files to openvic-dataloader/detail
- Add ovdl::utility::type_concat
- Add ovdl::utility::type_prepend
- Add ovdl::utility::is_instance_of
- Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/v2script/Parser.cpp')
-rw-r--r-- src/openvic-dataloader/v2script/Parser.cpp 195
1 file changed, 102 insertions, 93 deletions
diff --git a/src/openvic-dataloader/v2script/Parser.cpp b/src/openvic-dataloader/v2script/Parser.cpp
index eb491d5..23dada7 100644
--- a/src/openvic-dataloader/v2script/Parser.cpp
+++ b/src/openvic-dataloader/v2script/Parser.cpp
@@ -4,16 +4,15 @@
#include <iostream>
#include <optional>
#include <string>
+#include <type_traits>
#include <utility>
-#include <openvic-dataloader/DiagnosticLogger.hpp>
+#include <openvic-dataloader/Error.hpp>
#include <openvic-dataloader/NodeLocation.hpp>
-#include <openvic-dataloader/ParseError.hpp>
-#include <openvic-dataloader/ParseWarning.hpp>
-#include <openvic-dataloader/detail/LexyReportError.hpp>
+#include <openvic-dataloader/detail/Concepts.hpp>
+#include <openvic-dataloader/detail/Encoding.hpp>
#include <openvic-dataloader/detail/OStreamOutputIterator.hpp>
-#include <openvic-dataloader/detail/utility/Concepts.hpp>
-#include <openvic-dataloader/detail/utility/Utility.hpp>
+#include <openvic-dataloader/detail/Utility.hpp>
#include <openvic-dataloader/v2script/AbstractSyntaxTree.hpp>
#include <lexy/action/parse.hpp>
@@ -29,10 +28,8 @@
#include <fmt/core.h>
-#include "openvic-dataloader/Error.hpp"
-
+#include "DiagnosticLogger.hpp"
#include "ParseState.hpp"
-#include "detail/DetectUtf8.hpp"
#include "detail/NullBuff.hpp"
#include "detail/ParseHandler.hpp"
#include "detail/Warnings.hpp"
@@ -44,29 +41,46 @@
using namespace ovdl;
using namespace ovdl::v2script;
-/// BufferHandler ///
+/// ParseHandler ///
struct Parser::ParseHandler final : detail::BasicStateParseHandler<v2script::ast::ParseState> {
- constexpr bool is_exclusive_utf8() const {
- return detail::is_utf8_no_ascii(buffer());
- }
-
template<typename Node>
std::optional<DiagnosticLogger::error_range> parse() {
- auto result = lexy::parse<Node>(buffer(), *_parse_state, _parse_state->logger().error_callback());
+ if (parse_state().encoding() == ovdl::detail::Encoding::Utf8) {
+ parse_state().logger().warning(warnings::make_utf8_warning(path()));
+ }
+
+ auto result = [&] {
+ switch (parse_state().encoding()) {
+ using enum detail::Encoding;
+ case Ascii:
+ case Utf8:
+ return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback());
+ case Unknown:
+ case Windows1251:
+ case Windows1252:
+ return lexy::parse<Node>(buffer<lexy::default_encoding>(), parse_state(), parse_state().logger().error_callback());
+ default:
+ ovdl::detail::unreachable();
+ }
+ }();
if (!result) {
- return _parse_state->logger().get_errors();
+ return parse_state().logger().get_errors();
}
- _parse_state->ast().set_root(result.value());
+ parse_state().ast().set_root(result.value());
return std::nullopt;
}
ast::FileTree* root() {
- return _parse_state->ast().root();
+ return parse_state().ast().root();
+ }
+
+ Parser::error_range get_errors() {
+ return parse_state().logger().get_errors();
}
};
-/// BufferHandler ///
+/// ParseHandler ///
Parser::Parser()
: _parse_handler(std::make_unique<ParseHandler>()) {
@@ -82,29 +96,29 @@ Parser::Parser(Parser&&) = default;
Parser& Parser::operator=(Parser&&) = default;
Parser::~Parser() = default;
-Parser Parser::from_buffer(const char* data, std::size_t size) {
+Parser Parser::from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_buffer(data, size));
+ return std::move(result.load_from_buffer(data, size, encoding_fallback));
}
-Parser Parser::from_buffer(const char* start, const char* end) {
+Parser Parser::from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_buffer(start, end));
+ return std::move(result.load_from_buffer(start, end, encoding_fallback));
}
-Parser Parser::from_string(const std::string_view string) {
+Parser Parser::from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_string(string));
+ return std::move(result.load_from_string(string, encoding_fallback));
}
-Parser Parser::from_file(const char* path) {
+Parser Parser::from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_file(path));
+ return std::move(result.load_from_file(path, encoding_fallback));
}
-Parser Parser::from_file(const std::filesystem::path& path) {
+Parser Parser::from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) {
Parser result;
- return std::move(result.load_from_file(path));
+ return std::move(result.load_from_file(path, encoding_fallback));
}
///
@@ -128,38 +142,38 @@ constexpr void Parser::_run_load_func(detail::LoadCallback<Parser::ParseHandler*
if (!error_message.empty()) {
_has_error = true;
_has_fatal_error = true;
- _parse_handler->parse_state().logger().create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message));
+ _parse_handler->parse_state().logger().template create_log<error::BufferError>(DiagnosticLogger::DiagnosticKind::error, fmt::runtime(error_message));
}
if (has_error() && &_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream.get());
}
}
-constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size) {
+constexpr Parser& Parser::load_from_buffer(const char* data, std::size_t size, std::optional<detail::Encoding> encoding_fallback) {
// Type can't be deduced?
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size);
+ _run_load_func(std::mem_fn(&ParseHandler::load_buffer_size), data, size, encoding_fallback);
return *this;
}
-constexpr Parser& Parser::load_from_buffer(const char* start, const char* end) {
+constexpr Parser& Parser::load_from_buffer(const char* start, const char* end, std::optional<detail::Encoding> encoding_fallback) {
// Type can't be deduced?
- _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end);
+ _run_load_func(std::mem_fn(&ParseHandler::load_buffer), start, end, encoding_fallback);
return *this;
}
-constexpr Parser& Parser::load_from_string(const std::string_view string) {
- return load_from_buffer(string.data(), string.size());
+constexpr Parser& Parser::load_from_string(const std::string_view string, std::optional<detail::Encoding> encoding_fallback) {
+ return load_from_buffer(string.data(), string.size(), encoding_fallback);
}
-Parser& Parser::load_from_file(const char* path) {
+Parser& Parser::load_from_file(const char* path, std::optional<detail::Encoding> encoding_fallback) {
set_file_path(path);
// Type can be deduced??
- _run_load_func(std::mem_fn(&ParseHandler::load_file), path);
+ _run_load_func(std::mem_fn(&ParseHandler::load_file), get_file_path().data(), encoding_fallback);
return *this;
}
-Parser& Parser::load_from_file(const std::filesystem::path& path) {
- return load_from_file(path.string().c_str());
+Parser& Parser::load_from_file(const std::filesystem::path& path, std::optional<detail::Encoding> encoding_fallback) {
+ return load_from_file(path.string().c_str(), encoding_fallback);
}
/* REQUIREMENTS:
@@ -173,11 +187,7 @@ bool Parser::simple_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<grammar::File<grammar::NoStringEscapeOption>>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::File>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
@@ -196,14 +206,11 @@ bool Parser::event_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<grammar::EventFile>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::EventFile>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
@@ -218,14 +225,11 @@ bool Parser::decision_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<grammar::DecisionFile>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<grammar::DecisionFile>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
@@ -240,14 +244,11 @@ bool Parser::lua_defines_parse() {
return false;
}
- if (_parse_handler->is_exclusive_utf8()) {
- _parse_handler->parse_state().logger().warning(warnings::make_utf8_warning(_file_path));
- }
-
- auto errors = _parse_handler->parse<lua::grammar::File<>>();
+ std::optional<DiagnosticLogger::error_range> errors = _parse_handler->parse<lua::grammar::File>();
_has_error = _parse_handler->parse_state().logger().errored();
_has_warning = _parse_handler->parse_state().logger().warned();
if (!_parse_handler->root()) {
+ _has_error = true;
_has_fatal_error = true;
if (&_error_stream.get() != &detail::cnull) {
print_errors_to(_error_stream);
@@ -273,48 +274,66 @@ std::string Parser::make_list_string() const {
return _parse_handler->parse_state().ast().make_list_visualizer();
}
+// TODO: Remove reinterpret_cast
+// WARNING: This almost certainly breaks on utf16 and utf32 encodings, luckily we don't parse in that format
+// This is purely to silence the node_location errors because char8_t is useless
+#define REINTERPRET_IT(IT) reinterpret_cast<const std::decay_t<decltype(buffer)>::encoding::char_type*>((IT))
+
const FilePosition Parser::get_position(const ast::Node* node) const {
if (!node || !node->is_linked_in_tree()) {
return {};
}
- auto node_location = _parse_handler->parse_state().ast().location_of(node);
+
+ NodeLocation node_location;
+
+ node_location = _parse_handler->parse_state().ast().location_of(node);
+
if (node_location.is_synthesized()) {
- return {};
+ return FilePosition {};
}
- auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), node_location.begin());
- FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
- if (node_location.begin() < node_location.end()) {
- auto loc_end = lexy::get_input_location(_parse_handler->buffer(), node_location.end(), loc_begin.anchor());
- result.end_line = loc_end.line_nr();
- result.end_column = loc_end.column_nr();
- }
- return result;
+ return _parse_handler->parse_state().ast().file().visit_buffer(
+ [&](auto&& buffer) -> FilePosition {
+ auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.begin()));
+ FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
+ if (node_location.begin() < node_location.end()) {
+ auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(node_location.end()), loc_begin.anchor());
+ result.end_line = loc_end.line_nr();
+ result.end_column = loc_end.column_nr();
+ }
+ return result;
+ });
}
Parser::error_range Parser::get_errors() const {
- return _parse_handler->parse_state().logger().get_errors();
+ return _parse_handler->get_errors();
}
const FilePosition Parser::get_error_position(const error::Error* error) const {
if (!error || !error->is_linked_in_tree()) {
return {};
}
+
auto err_location = _parse_handler->parse_state().logger().location_of(error);
if (err_location.is_synthesized()) {
- return {};
+ return FilePosition {};
}
- auto loc_begin = lexy::get_input_location(_parse_handler->buffer(), err_location.begin());
- FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
- if (err_location.begin() < err_location.end()) {
- auto loc_end = lexy::get_input_location(_parse_handler->buffer(), err_location.end(), loc_begin.anchor());
- result.end_line = loc_end.line_nr();
- result.end_column = loc_end.column_nr();
- }
- return result;
+ return _parse_handler->parse_state().ast().file().visit_buffer(
+ [&](auto&& buffer) -> FilePosition {
+ auto loc_begin = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.begin()));
+ FilePosition result { loc_begin.line_nr(), loc_begin.line_nr(), loc_begin.column_nr(), loc_begin.column_nr() };
+ if (err_location.begin() < err_location.end()) {
+ auto loc_end = lexy::get_input_location(buffer, REINTERPRET_IT(err_location.end()), loc_begin.anchor());
+ result.end_line = loc_end.line_nr();
+ result.end_column = loc_end.column_nr();
+ }
+ return result;
+ });
}
+#undef REINTERPRET_IT
+
void Parser::print_errors_to(std::basic_ostream<char>& stream) const {
auto errors = get_errors();
if (errors.empty()) return;
@@ -324,19 +343,9 @@ void Parser::print_errors_to(std::basic_ostream<char>& stream) const {
[&](const error::BufferError* buffer_error) {
stream << "buffer error: " << buffer_error->message() << '\n';
},
- [&](const error::ParseError* parse_error) {
- auto position = get_error_position(parse_error);
- std::string pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column);
- stream << _file_path << pos_str << "parse error for '" << parse_error->production_name() << "': " << parse_error->message() << '\n';
- },
- [&](dryad::child_visitor<error::ErrorKind> visitor, const error::Semantic* semantic) {
- auto position = get_error_position(semantic);
- std::string pos_str = ": ";
- if (!position.is_empty()) {
- pos_str = fmt::format(":{}:{}: ", position.start_line, position.start_column);
- }
- stream << _file_path << pos_str << semantic->message() << '\n';
- auto annotations = semantic->annotations();
+ [&](dryad::child_visitor<error::ErrorKind> visitor, const error::AnnotatedError* annotated_error) {
+ stream << annotated_error->message() << '\n';
+ auto annotations = annotated_error->annotations();
for (auto annotation : annotations) {
visitor(annotation);
}