aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com> 2024-08-01 02:39:48 +0200
committer Spartan322 <Megacake1234@gmail.com> 2024-08-02 00:59:42 +0200
commit c7c080987e44f606bed73dd8b0c2747e7b386f41 (patch)
tree 7d7775dce507efd83afa2ba452ec5b227e35ebb0
parent 847280022ec8afb35d7d8639afd639c5ec42e3c7 (diff)
Add `\x8F` to `Ę` conversion for Windows-1252 (branch: fix/polish-todos)
To support special vanilla Polish TODOs that break UTF-8. Add CSV Win1252->Utf8 conversion tests. Fix map_value warning not triggering. Optimize for ASCII characters.
-rw-r--r-- .github/workflows/builds.yml | 2
-rw-r--r-- SConstruct | 2
-rw-r--r-- src/openvic-dataloader/detail/Convert.hpp | 110
-rw-r--r-- tests/SCsub | 5
-rw-r--r-- tests/src/csv/Parser.cpp | 210
5 files changed, 284 insertions, 45 deletions
diff --git a/.github/workflows/builds.yml b/.github/workflows/builds.yml
index 8f56794..c644fdf 100644
--- a/.github/workflows/builds.yml
+++ b/.github/workflows/builds.yml
@@ -97,7 +97,7 @@ jobs:
with:
platform: ${{ matrix.platform }}
target: ${{ matrix.target }}
- sconsflags: arch=${{ matrix.arch }} build_ovdl_library=yes run_ovdl_tests=yes
+ sconsflags: arch=${{ matrix.arch }} build_ovdl_library=yes run_ovdl_tests=yes ubuntu_gcc_invalid_char_hang_bug=${{ matrix.runner == 'ubuntu-20.04' }}
- name: Delete compilation files
if: ${{ matrix.platform == 'windows' }}
diff --git a/SConstruct b/SConstruct
index 05547c5..1ece1c9 100644
--- a/SConstruct
+++ b/SConstruct
@@ -18,6 +18,8 @@ opts.Add(BoolVariable("run_ovdl_tests", "Build and run the openvic dataloader te
opts.Add(BoolVariable("build_ovdl_library", "Build the openvic dataloader library.", env.get("build_ovdl_library", not env.is_standalone)))
opts.Add(BoolVariable("build_ovdl_headless", "Build the openvic dataloader headless executable", env.is_standalone))
+opts.Add(BoolVariable("ubuntu_gcc_invalid_char_hang_bug", "Skips test section which triggers a hang build for gcc-12 on ubuntu-20", False))
+
env.FinalizeOptions()
env.exposed_includes = []
diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp
index 5d9fca0..6f8f279 100644
--- a/src/openvic-dataloader/detail/Convert.hpp
+++ b/src/openvic-dataloader/detail/Convert.hpp
@@ -23,28 +23,22 @@
#include "v2script/ParseState.hpp"
namespace ovdl::convert {
- struct MappedChar {
- char value;
- std::string_view utf8;
-
- constexpr bool is_invalid() const { return value == 0; }
- constexpr bool is_pass() const { return value == 1; }
- };
- constexpr MappedChar invalid_map { 0, "" };
- constexpr MappedChar pass_map { 1, "" };
-
struct map_value {
std::string_view _value;
constexpr map_value() noexcept : _value("") {}
- constexpr map_value(std::nullptr_t) noexcept : _value("\0") {}
+ constexpr map_value(std::nullptr_t) noexcept : _value("\0", 1) {}
constexpr explicit map_value(std::string_view val) noexcept : _value(val) {}
- constexpr bool is_invalid() const {
+ static constexpr map_value invalid_value() noexcept {
+ return map_value(nullptr);
+ }
+
+ constexpr bool is_invalid() const noexcept {
return !_value.empty() && _value[0] == '\0';
}
- constexpr bool is_pass() const {
+ constexpr bool is_pass() const noexcept {
return _value.empty();
}
@@ -203,13 +197,19 @@ namespace ovdl::convert {
.map<'\xFC'>("ü")
.map<'\xFD'>("ý")
.map<'\xFE'>("þ")
- .map<'\xFF'>("ÿ");
+ .map<'\xFF'>("ÿ")
+
+ // Paradox being special, invalid Windows-1252
+ // Used for (semantically incorrect) Polish localization TODOs
+ .map<'\x8F'>("Ę");
template<typename Reader>
static constexpr map_value try_parse(Reader& reader) {
auto index = map.try_parse(reader);
if (index) {
return map_value(map[index]);
+ } else if (*reader.position() < 0) {
+ return map_value::invalid_value();
}
return {};
}
@@ -358,6 +358,8 @@ namespace ovdl::convert {
auto index = map.try_parse(reader);
if (index) {
return map_value(map[index]);
+ } else if (*reader.position() < 0) {
+ return map_value::invalid_value();
}
return {};
}
@@ -405,6 +407,11 @@ namespace ovdl::convert {
break;
// Skip Ascii and Utf8 encoding
default: {
+ // If within ASCII range
+ if (c >= CharT {}) {
+ break;
+ }
+
map_value val = {};
CharT char_array[] { c, CharT() };
auto input = lexy::range_input(&char_array[0], &char_array[1]);
@@ -454,19 +461,24 @@ namespace ovdl::convert {
auto begin = reader.position();
auto last_it = begin;
while (reader.peek() != eof) {
- map_value val = try_parse_map(state.encoding(), reader);
+ // If not within ASCII range
+ if (*reader.position() < 0) {
+ map_value val = try_parse_map(state.encoding(), reader);
+
+ if (val.is_invalid()) {
+ Error::on_invalid_character(state, reader);
+ reader.bump();
+ continue;
+ } else if (!val.is_pass()) {
+ result.append(val._value);
+ last_it = reader.position();
+ continue;
+ }
+ }
- if (val.is_invalid()) {
- Error::on_invalid_character(state, reader);
+ while (reader.peek() != eof && *reader.position() > 0) {
reader.bump();
- continue;
- } else if (!val.is_pass()) {
- result.append(val._value);
- last_it = reader.position();
- continue;
}
-
- reader.bump();
result.append(last_it, reader.position());
last_it = reader.position();
}
@@ -503,19 +515,24 @@ namespace ovdl::convert {
auto begin = reader.position();
auto last_it = begin;
while (reader.peek() != eof) {
- map_value val = try_parse_map(state.encoding(), reader);
+ // If not within ASCII range
+ if (*reader.position() < 0) {
+ map_value val = try_parse_map(state.encoding(), reader);
+
+ if (val.is_invalid()) {
+ Error::on_invalid_character(state, reader);
+ reader.bump();
+ continue;
+ } else if (!val.is_pass()) {
+ result.append(val._value);
+ last_it = reader.position();
+ continue;
+ }
+ }
- if (val.is_invalid()) {
- Error::on_invalid_character(state, reader);
+ while (reader.peek() != eof && *reader.position() > 0) {
reader.bump();
- continue;
- } else if (!val.is_pass()) {
- result.append(val._value);
- last_it = reader.position();
- continue;
}
-
- reader.bump();
result.append(last_it, reader.position());
last_it = reader.position();
}
@@ -550,19 +567,24 @@ namespace ovdl::convert {
auto begin = reader.position();
auto last_it = begin;
while (reader.peek() != eof) {
- map_value val = try_parse_map(state.encoding(), reader);
+ // If not within ASCII range
+ if (*reader.position() < 0) {
+ map_value val = try_parse_map(state.encoding(), reader);
- if (val.is_invalid()) {
- Error::on_invalid_character(state, reader);
- reader.bump();
- continue;
- } else if (!val.is_pass()) {
- result.append(val._value);
- last_it = reader.position();
- continue;
+ if (val.is_invalid()) {
+ Error::on_invalid_character(state, reader);
+ reader.bump();
+ continue;
+ } else if (!val.is_pass()) {
+ result.append(val._value);
+ last_it = reader.position();
+ continue;
+ }
}
- reader.bump();
+ while (reader.peek() != eof && *reader.position() > 0) {
+ reader.bump();
+ }
result.append(last_it, reader.position());
last_it = reader.position();
}
diff --git a/tests/SCsub b/tests/SCsub
index 0a18777..49e2742 100644
--- a/tests/SCsub
+++ b/tests/SCsub
@@ -47,6 +47,11 @@ tests_env.tests_sources = env.GlobRecursive("*.cpp", [source_path])
SConscript("deps/SCsub", {"env": tests_env })
+# Blame Ubuntu 22's GCC-12 distribution for this crap
+# Compiler bug: the build hangs if the compiler sees any reference to \x8F in a character
+if env["ubuntu_gcc_invalid_char_hang_bug"]:
+ tests_env.Append(CPPDEFINES=["_OVDL_TEST_UBUNTU_GCC_12_BUG_"])
+
tests_program = tests_env.UnitTest(
source=tests_env.tests_sources,
target=os.path.join(BINDIR, tests_name),
diff --git a/tests/src/csv/Parser.cpp b/tests/src/csv/Parser.cpp
index e72c02a..fe26726 100644
--- a/tests/src/csv/Parser.cpp
+++ b/tests/src/csv/Parser.cpp
@@ -5,11 +5,14 @@
#include <openvic-dataloader/csv/LineObject.hpp>
#include <openvic-dataloader/csv/Parser.hpp>
+#include <fmt/core.h>
+
#include "Helper.hpp"
#include <detail/NullBuff.hpp>
#include <range/v3/range/primitives.hpp>
#include <range/v3/view/enumerate.hpp>
#include <range/v3/view/iota.hpp>
+#include <range/v3/view/join.hpp>
#include <snitch/snitch.hpp>
using namespace ovdl;
@@ -568,4 +571,211 @@ TEST_CASE("CSV Parse", "[csv-parse]") {
}
}
}
+
+ SECTION("Score militaire;Militär;;Puntuación militar") {
+ static constexpr auto buffer = "Score militaire;Militär;;Puntuación militar"sv;
+ parser.load_from_string(buffer);
+
+ CHECK_PARSE();
+
+ const std::vector<LineObject>& line_list = parser.get_lines();
+ CHECK_FALSE(line_list.empty());
+ CHECK(ranges::size(line_list) == 1);
+
+ const LineObject& line = line_list.front();
+ CHECK_FALSE(line.empty());
+ CHECK(ranges::size(line) == 3);
+ CHECK(line.value_count() == 4);
+ CHECK(line.prefix_end() == 0);
+ CHECK(line.suffix_end() == 4);
+
+ for (const auto [index, val] : line | ranges::views::enumerate) {
+ CAPTURE(index);
+ CHECK_FALSE_OR_CONTINUE(val.second.empty());
+ switch (index) {
+ case 0:
+ CHECK_OR_CONTINUE(val.first == 0);
+ CHECK_OR_CONTINUE(val.second == "Score militaire"sv);
+ break;
+ case 1:
+ CHECK_OR_CONTINUE(val.first == 1);
+ CHECK_OR_CONTINUE(val.second == "Militär"sv);
+ break;
+ case 2:
+ CHECK_OR_CONTINUE(val.first == 3);
+ CHECK_OR_CONTINUE(val.second == "Puntuación militar"sv);
+ break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+
+ CHECK(line.value_count() == 4);
+
+ for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+ CAPTURE(index);
+ switch (index) {
+ case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Score militaire"sv); break;
+ case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Militär"sv); break;
+ case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+ case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == "Puntuación militar"sv); break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+ }
+
+ SECTION(";§RNo research set§W;§RAucune recherche définie§W;") {
+ static constexpr auto buffer = ";§RNo research set§W;§RAucune recherche définie§W;"sv;
+ parser.load_from_string(buffer);
+
+ CHECK_PARSE();
+
+ const std::vector<LineObject>& line_list = parser.get_lines();
+ CHECK_FALSE(line_list.empty());
+ CHECK(ranges::size(line_list) == 1);
+
+ const LineObject& line = line_list.front();
+ CHECK_FALSE(line.empty());
+ CHECK(ranges::size(line) == 2);
+ CHECK(line.value_count() == 3);
+ CHECK(line.prefix_end() == 1);
+ CHECK(line.suffix_end() == 3);
+
+ for (const auto [index, val] : line | ranges::views::enumerate) {
+ CAPTURE(index);
+ CHECK_FALSE_OR_CONTINUE(val.second.empty());
+ switch (index) {
+ case 0:
+ CHECK_OR_CONTINUE(val.first == 1);
+ CHECK_OR_CONTINUE(val.second == "§RNo research set§W"sv);
+ break;
+ case 1:
+ CHECK_OR_CONTINUE(val.first == 2);
+ CHECK_OR_CONTINUE(val.second == "§RAucune recherche définie§W"sv);
+ break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+
+ CHECK(line.value_count() == 3);
+
+ for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+ CAPTURE(index);
+ switch (index) {
+ case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+ case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RNo research set§W"sv); break;
+ case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RAucune recherche définie§W"sv); break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+ }
+
+ SECTION("Württemberg;Wurtemberg;Württemberg;;Württemberg;") {
+ static constexpr auto buffer = "Württemberg;Wurtemberg;Württemberg;;Württemberg;"sv;
+ parser.load_from_string(buffer);
+
+ CHECK_PARSE();
+
+ const std::vector<LineObject>& line_list = parser.get_lines();
+ CHECK_FALSE(line_list.empty());
+ CHECK(ranges::size(line_list) == 1);
+
+ const LineObject& line = line_list.front();
+ CHECK_FALSE(line.empty());
+ CHECK(ranges::size(line) == 4);
+ CHECK(line.value_count() == 5);
+ CHECK(line.prefix_end() == 0);
+ CHECK(line.suffix_end() == 5);
+
+ for (const auto [index, val] : line | ranges::views::enumerate) {
+ CAPTURE(index);
+ CHECK_FALSE_OR_CONTINUE(val.second.empty());
+ switch (index) {
+ case 0:
+ CHECK_OR_CONTINUE(val.first == 0);
+ CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
+ break;
+ case 1:
+ CHECK_OR_CONTINUE(val.first == 1);
+ CHECK_OR_CONTINUE(val.second == "Wurtemberg"sv);
+ break;
+ case 2:
+ CHECK_OR_CONTINUE(val.first == 2);
+ CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
+ break;
+ case 3:
+ CHECK_OR_CONTINUE(val.first == 4);
+ CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
+ break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+
+ CHECK(line.value_count() == 5);
+
+ for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+ CAPTURE(index);
+ switch (index) {
+ case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
+ case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Wurtemberg"sv); break;
+ case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
+ case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+ case 4: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+ }
+
+ // Blame Ubuntu 22's GCC-12 distribution for this crap
+ // Compiler bug: the build hangs if the compiler sees any reference to \x8F in a character
+#if !defined(_OVDL_TEST_UBUNTU_GCC_12_BUG_)
+ SECTION(";$NAME$ wurde in $PROV$ gebaut.;ID'\\x8F' DO;") {
+ static auto buffer = ";$NAME$ wurde in $PROV$ gebaut.;ID\x8F DO;";
+ parser.load_from_string(buffer);
+
+ CHECK_PARSE();
+
+ const std::vector<LineObject>& line_list = parser.get_lines();
+ CHECK_FALSE(line_list.empty());
+ CHECK(ranges::size(line_list) == 1);
+
+ const LineObject& line = line_list.front();
+ CHECK_FALSE(line.empty());
+ CHECK(ranges::size(line) == 2);
+ CHECK(line.value_count() == 3);
+ CHECK(line.prefix_end() == 1);
+ CHECK(line.suffix_end() == 3);
+
+ for (const auto [index, val] : line | ranges::views::enumerate) {
+ CAPTURE(index);
+ CHECK_FALSE_OR_CONTINUE(val.second.empty());
+ switch (index) {
+ case 0:
+ CHECK_OR_CONTINUE(val.first == 1);
+ CHECK_OR_CONTINUE(val.second == "$NAME$ wurde in $PROV$ gebaut."sv);
+ break;
+ case 1:
+ CHECK_OR_CONTINUE(val.first == 2);
+ CHECK_OR_CONTINUE(val.second == "IDĘ DO"sv);
+ break;
+ case 2:
+ CHECK_OR_CONTINUE(val.first == 3);
+ CHECK_OR_CONTINUE(val.second == ""sv);
+ break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+
+ CHECK(line.value_count() == 3);
+
+ for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+ CAPTURE(index);
+ switch (index) {
+ case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+ case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "$NAME$ wurde in $PROV$ gebaut."sv); break;
+ case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "IDĘ DO"sv); break;
+ default: CHECK_OR_CONTINUE(false); break;
+ }
+ }
+ }
+#endif
} \ No newline at end of file