From c7c080987e44f606bed73dd8b0c2747e7b386f41 Mon Sep 17 00:00:00 2001 From: Spartan322 Date: Wed, 31 Jul 2024 20:39:48 -0400 Subject: Add `\x8F` to `Ę` conversion for Windows-1252 To support special vanilla Polish TODOs that break utf8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add CSV Win1252->Utf8 conversion tests Fix map_value warning not triggering Optimize for ascii characters --- .github/workflows/builds.yml | 2 +- SConstruct | 2 + src/openvic-dataloader/detail/Convert.hpp | 110 +++++++++------- tests/SCsub | 5 + tests/src/csv/Parser.cpp | 210 ++++++++++++++++++++++++++++++ 5 files changed, 284 insertions(+), 45 deletions(-) diff --git a/.github/workflows/builds.yml b/.github/workflows/builds.yml index 8f56794..c644fdf 100644 --- a/.github/workflows/builds.yml +++ b/.github/workflows/builds.yml @@ -97,7 +97,7 @@ jobs: with: platform: ${{ matrix.platform }} target: ${{ matrix.target }} - sconsflags: arch=${{ matrix.arch }} build_ovdl_library=yes run_ovdl_tests=yes + sconsflags: arch=${{ matrix.arch }} build_ovdl_library=yes run_ovdl_tests=yes ubuntu_gcc_invalid_char_hang_bug=${{ matrix.runner == 'ubuntu-20.04' }} - name: Delete compilation files if: ${{ matrix.platform == 'windows' }} diff --git a/SConstruct b/SConstruct index 05547c5..1ece1c9 100644 --- a/SConstruct +++ b/SConstruct @@ -18,6 +18,8 @@ opts.Add(BoolVariable("run_ovdl_tests", "Build and run the openvic dataloader te opts.Add(BoolVariable("build_ovdl_library", "Build the openvic dataloader library.", env.get("build_ovdl_library", not env.is_standalone))) opts.Add(BoolVariable("build_ovdl_headless", "Build the openvic dataloader headless executable", env.is_standalone)) +opts.Add(BoolVariable("ubuntu_gcc_invalid_char_hang_bug", "Skips test section which triggers a hang build for gcc-12 on ubuntu-20", False)) + env.FinalizeOptions() env.exposed_includes = [] diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp index 5d9fca0..6f8f279 100644 --- a/src/openvic-dataloader/detail/Convert.hpp +++ b/src/openvic-dataloader/detail/Convert.hpp @@ -23,28 +23,22 @@ #include "v2script/ParseState.hpp" namespace ovdl::convert { - struct MappedChar { - char value; - std::string_view utf8; - - constexpr bool is_invalid() const { return value == 0; } - constexpr bool is_pass() const { return value == 1; } - }; - constexpr MappedChar invalid_map { 0, "" }; - constexpr MappedChar pass_map { 1, "" }; - struct map_value { std::string_view _value; constexpr map_value() noexcept : _value("") {} - constexpr map_value(std::nullptr_t) noexcept : _value("\0") {} + constexpr map_value(std::nullptr_t) noexcept : _value("\0", 1) {} constexpr explicit map_value(std::string_view val) noexcept : _value(val) {} - constexpr bool is_invalid() const { + static constexpr map_value invalid_value() noexcept { + return map_value(nullptr); + } + + constexpr bool is_invalid() const noexcept { return !_value.empty() && _value[0] == '\0'; } - constexpr bool is_pass() const { + constexpr bool is_pass() const noexcept { return _value.empty(); } @@ -203,13 +197,19 @@ namespace ovdl::convert { .map<'\xFC'>("ü") .map<'\xFD'>("ý") .map<'\xFE'>("þ") - .map<'\xFF'>("ÿ"); + .map<'\xFF'>("ÿ") + + // Paradox being special, invalid Windows-1252 + // Used for (semantically incorrect) Polish localization TODOs + .map<'\x8F'>("Ę"); template static constexpr map_value try_parse(Reader& reader) { auto index = map.try_parse(reader); if (index) { return map_value(map[index]); + } else if (*reader.position() < 0) { + return map_value::invalid_value(); } return {}; } @@ -358,6 +358,8 @@ namespace ovdl::convert { auto index = map.try_parse(reader); if (index) { return map_value(map[index]); + } else if (*reader.position() < 0) { + return map_value::invalid_value(); } return {}; } @@ -405,6 +407,11 @@ namespace ovdl::convert { break; // Skip Ascii and Utf8 encoding default: { + // If within ASCII range + if (c >= CharT {}) { + break; + } + map_value val = {}; CharT char_array[] { c, CharT() }; auto input = lexy::range_input(&char_array[0], &char_array[1]); @@ -454,19 +461,24 @@ namespace ovdl::convert { auto begin = reader.position(); auto last_it = begin; while (reader.peek() != eof) { - map_value val = try_parse_map(state.encoding(), reader); + // If not within ASCII range + if (*reader.position() < 0) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + } - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); + while (reader.peek() != eof && *reader.position() > 0) { reader.bump(); - continue; - } else if (!val.is_pass()) { - result.append(val._value); - last_it = reader.position(); - continue; } - - reader.bump(); result.append(last_it, reader.position()); last_it = reader.position(); } @@ -503,19 +515,24 @@ namespace ovdl::convert { auto begin = reader.position(); auto last_it = begin; while (reader.peek() != eof) { - map_value val = try_parse_map(state.encoding(), reader); + // If not within ASCII range + if (*reader.position() < 0) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + } - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); + while (reader.peek() != eof && *reader.position() > 0) { reader.bump(); - continue; - } else if (!val.is_pass()) { - result.append(val._value); - last_it = reader.position(); - continue; } - - reader.bump(); result.append(last_it, reader.position()); last_it = reader.position(); } @@ -550,19 +567,24 @@ namespace ovdl::convert { auto begin = reader.position(); auto last_it = begin; while (reader.peek() != eof) { - map_value val = try_parse_map(state.encoding(), reader); + // If not within ASCII range + if (*reader.position() < 0) { + map_value val = try_parse_map(state.encoding(), reader); - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); - reader.bump(); - continue; - } else if (!val.is_pass()) { - result.append(val._value); - last_it = reader.position(); - continue; + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } } - reader.bump(); + while (reader.peek() != eof && *reader.position() > 0) { + reader.bump(); + } result.append(last_it, reader.position()); last_it = reader.position(); } diff --git a/tests/SCsub b/tests/SCsub index 0a18777..49e2742 100644 --- a/tests/SCsub +++ b/tests/SCsub @@ -47,6 +47,11 @@ tests_env.tests_sources = env.GlobRecursive("*.cpp", [source_path]) SConscript("deps/SCsub", {"env": tests_env }) +# Blame Ubuntu 22's GCC-12 distribution for this crap +# Compiler bug hangs if it can see if there is any reference to \x8F in a character +if env["ubuntu_gcc_invalid_char_hang_bug"]: + tests_env.Append(CPPDEFINES=["_OVDL_TEST_UBUNTU_GCC_12_BUG_"]) + tests_program = tests_env.UnitTest( source=tests_env.tests_sources, target=os.path.join(BINDIR, tests_name), diff --git a/tests/src/csv/Parser.cpp b/tests/src/csv/Parser.cpp index e72c02a..fe26726 100644 --- a/tests/src/csv/Parser.cpp +++ b/tests/src/csv/Parser.cpp @@ -5,11 +5,14 @@ #include #include +#include + #include "Helper.hpp" #include #include #include #include +#include #include using namespace ovdl; @@ -568,4 +571,211 @@ TEST_CASE("CSV Parse", "[csv-parse]") { } } } + + SECTION("Score militaire;Militär;;Puntuación militar") { + static constexpr auto buffer = "Score militaire;Militär;;Puntuación militar"sv; + parser.load_from_string(buffer); + + CHECK_PARSE(); + + const std::vector& line_list = parser.get_lines(); + CHECK_FALSE(line_list.empty()); + CHECK(ranges::size(line_list) == 1); + + const LineObject& line = line_list.front(); + CHECK_FALSE(line.empty()); + CHECK(ranges::size(line) == 3); + CHECK(line.value_count() == 4); + CHECK(line.prefix_end() == 0); + CHECK(line.suffix_end() == 4); + + for (const auto [index, val] : line | ranges::views::enumerate) { + CAPTURE(index); + CHECK_FALSE_OR_CONTINUE(val.second.empty()); + switch (index) { + case 0: + CHECK_OR_CONTINUE(val.first == 0); + CHECK_OR_CONTINUE(val.second == "Score militaire"sv); + break; + case 1: + CHECK_OR_CONTINUE(val.first == 1); + CHECK_OR_CONTINUE(val.second == "Militär"sv); + break; + case 2: + CHECK_OR_CONTINUE(val.first == 3); + CHECK_OR_CONTINUE(val.second == "Puntuación militar"sv); + break; + default: CHECK_OR_CONTINUE(false); break; + } + } + + CHECK(line.value_count() == 4); + + for (const auto index : ranges::views::iota(size_t(0), line.value_count())) { + CAPTURE(index); + switch (index) { + case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Score militaire"sv); break; + case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Militär"sv); break; + case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break; + case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == "Puntuación militar"sv); break; + default: CHECK_OR_CONTINUE(false); break; + } + } + } + + SECTION(";§RNo research set§W;§RAucune recherche définie§W;") { + static constexpr auto buffer = ";§RNo research set§W;§RAucune recherche définie§W;"sv; + parser.load_from_string(buffer); + + CHECK_PARSE(); + + const std::vector& line_list = parser.get_lines(); + CHECK_FALSE(line_list.empty()); + CHECK(ranges::size(line_list) == 1); + + const LineObject& line = line_list.front(); + CHECK_FALSE(line.empty()); + CHECK(ranges::size(line) == 2); + CHECK(line.value_count() == 3); + CHECK(line.prefix_end() == 1); + CHECK(line.suffix_end() == 3); + + for (const auto [index, val] : line | ranges::views::enumerate) { + CAPTURE(index); + CHECK_FALSE_OR_CONTINUE(val.second.empty()); + switch (index) { + case 0: + CHECK_OR_CONTINUE(val.first == 1); + CHECK_OR_CONTINUE(val.second == "§RNo research set§W"sv); + break; + case 1: + CHECK_OR_CONTINUE(val.first == 2); + CHECK_OR_CONTINUE(val.second == "§RAucune recherche définie§W"sv); + break; + default: CHECK_OR_CONTINUE(false); break; + } + } + + CHECK(line.value_count() == 3); + + for (const auto index : ranges::views::iota(size_t(0), line.value_count())) { + CAPTURE(index); + switch (index) { + case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break; + case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RNo research set§W"sv); break; + case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RAucune recherche définie§W"sv); break; + default: CHECK_OR_CONTINUE(false); break; + } + } + } + + SECTION("Württemberg;Wurtemberg;Württemberg;;Württemberg;") { + static constexpr auto buffer = "Württemberg;Wurtemberg;Württemberg;;Württemberg;"sv; + parser.load_from_string(buffer); + + CHECK_PARSE(); + + const std::vector& line_list = parser.get_lines(); + CHECK_FALSE(line_list.empty()); + CHECK(ranges::size(line_list) == 1); + + const LineObject& line = line_list.front(); + CHECK_FALSE(line.empty()); + CHECK(ranges::size(line) == 4); + CHECK(line.value_count() == 5); + CHECK(line.prefix_end() == 0); + CHECK(line.suffix_end() == 5); + + for (const auto [index, val] : line | ranges::views::enumerate) { + CAPTURE(index); + CHECK_FALSE_OR_CONTINUE(val.second.empty()); + switch (index) { + case 0: + CHECK_OR_CONTINUE(val.first == 0); + CHECK_OR_CONTINUE(val.second == "Württemberg"sv); + break; + case 1: + CHECK_OR_CONTINUE(val.first == 1); + CHECK_OR_CONTINUE(val.second == "Wurtemberg"sv); + break; + case 2: + CHECK_OR_CONTINUE(val.first == 2); + CHECK_OR_CONTINUE(val.second == "Württemberg"sv); + break; + case 3: + CHECK_OR_CONTINUE(val.first == 4); + CHECK_OR_CONTINUE(val.second == "Württemberg"sv); + break; + default: CHECK_OR_CONTINUE(false); break; + } + } + + CHECK(line.value_count() == 5); + + for (const auto index : ranges::views::iota(size_t(0), line.value_count())) { + CAPTURE(index); + switch (index) { + case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break; + case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Wurtemberg"sv); break; + case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break; + case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break; + case 4: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break; + default: CHECK_OR_CONTINUE(false); break; + } + } + } + + // Blame Ubuntu 22's GCC-12 distribution for this crap + // Compiler bug hangs if it can see if there is any reference to \x8F in a character +#if !defined(_OVDL_TEST_UBUNTU_GCC_12_BUG_) + SECTION(";$NAME$ wurde in $PROV$ gebaut.;ID'\\x8F' DO;") { + static auto buffer = ";$NAME$ wurde in $PROV$ gebaut.;ID\x8F DO;"; + parser.load_from_string(buffer); + + CHECK_PARSE(); + + const std::vector& line_list = parser.get_lines(); + CHECK_FALSE(line_list.empty()); + CHECK(ranges::size(line_list) == 1); + + const LineObject& line = line_list.front(); + CHECK_FALSE(line.empty()); + CHECK(ranges::size(line) == 2); + CHECK(line.value_count() == 3); + CHECK(line.prefix_end() == 1); + CHECK(line.suffix_end() == 3); + + for (const auto [index, val] : line | ranges::views::enumerate) { + CAPTURE(index); + CHECK_FALSE_OR_CONTINUE(val.second.empty()); + switch (index) { + case 0: + CHECK_OR_CONTINUE(val.first == 1); + CHECK_OR_CONTINUE(val.second == "$NAME$ wurde in $PROV$ gebaut."sv); + break; + case 1: + CHECK_OR_CONTINUE(val.first == 2); + CHECK_OR_CONTINUE(val.second == "IDĘ DO"sv); + break; + case 2: + CHECK_OR_CONTINUE(val.first == 3); + CHECK_OR_CONTINUE(val.second == ""sv); + break; + default: CHECK_OR_CONTINUE(false); break; + } + } + + CHECK(line.value_count() == 3); + + for (const auto index : ranges::views::iota(size_t(0), line.value_count())) { + CAPTURE(index); + switch (index) { + case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break; + case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "$NAME$ wurde in $PROV$ gebaut."sv); break; + case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "IDĘ DO"sv); break; + default: CHECK_OR_CONTINUE(false); break; + } + } + } +#endif } \ No newline at end of file -- cgit v1.2.3-56-ga3b1