aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader/detail/ParseHandler.cpp
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com>2024-05-09 16:06:02 +0200
committer Spartan322 <Megacake1234@gmail.com>2024-06-18 01:31:12 +0200
commitb0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch)
treef15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/detail/ParseHandler.cpp
parent7b521d6023113372cf6b02e562828273c4040f0e (diff)
Add runtime encoding detection and conversionfix/char-detection
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng Add manually-specified encoding fallback Add default system encoding fallback Add error recovery to v2script Add unknown encoding detection warning Remove csv::Parser templating Fix lua files dropping data Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0 Remove exclusive reliance on lexy::default_encoding for v2script Move internal concepts to src/openvic-detail/InternalConcepts.hpp Move contents of DetectUtf8.hpp to src/detail/Detect.hpp Move openvic-dataloader/AbstractSyntaxTree.hpp to src Move DiagnosticLogger.hpp to src Move File.hpp to src Move openvic-dataloader/detail/utlity files to openvic-dataloader/detail Add ovdl::utility::type_concat Add ovdl::utility::type_prepend Add ovdl::utility::is_instance_of Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/detail/ParseHandler.cpp')
-rw-r--r--src/openvic-dataloader/detail/ParseHandler.cpp347
1 files changed, 347 insertions, 0 deletions
diff --git a/src/openvic-dataloader/detail/ParseHandler.cpp b/src/openvic-dataloader/detail/ParseHandler.cpp
new file mode 100644
index 0000000..3818433
--- /dev/null
+++ b/src/openvic-dataloader/detail/ParseHandler.cpp
@@ -0,0 +1,347 @@
+#include "ParseHandler.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <string_view>
+#include <type_traits>
+
+#include <openvic-dataloader/detail/Encoding.hpp>
+
+using namespace ovdl::detail;
+
+#ifdef _WIN32
+#include <array>
+#include <cstdint>
+#include <utility>
+
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#undef WIN32_LEAN_AND_MEAN
+#endif
+
+template<size_t N>
+struct LangCodeLiteral {
+ char value[N];
+
+ constexpr LangCodeLiteral(const char (&str)[N]) {
+ std::copy_n(str, N, value);
+ }
+
+ static constexpr std::integral_constant<std::size_t, N - 1> size = {};
+
+ constexpr const char& operator[](std::size_t index) const noexcept {
+ return value[index];
+ }
+
+ constexpr operator std::string_view() const noexcept {
+ return std::string_view(value, size());
+ }
+
+ constexpr bool operator==(const std::string_view view) const noexcept {
+ return view.size() >= size() + 1 && view.starts_with(*this) && view[size()] == '_';
+ }
+};
+
+struct LangCodeView {
+ std::string_view view;
+ bool is_valid;
+
+ constexpr LangCodeView() = default;
+
+ template<std::size_t N>
+ constexpr LangCodeView(const char (&str)[N]) : view(str), is_valid(true) {}
+
+ constexpr LangCodeView(char* str) : view(str) {
+ is_valid = view.find('_') != std::string_view::npos;
+ }
+
+ constexpr std::size_t size() const noexcept {
+ return view.size();
+ }
+
+ constexpr const char& operator[](std::size_t index) const noexcept {
+ return view[index];
+ }
+
+ constexpr operator std::string_view() const noexcept {
+ return view;
+ }
+
+ template<std::size_t N>
+ constexpr bool operator==(const LangCodeLiteral<N>& literal) {
+ return is_valid && size() >= LangCodeLiteral<N>::size() && view.starts_with(literal);
+ }
+};
+
+struct FallbackSetter {
+ std::optional<Encoding>& fallback;
+
+ template<Encoding _Encoding, LangCodeLiteral LangCode>
+ constexpr bool encoded(auto&& view) const {
+ if (view == LangCode) {
+ fallback = _Encoding;
+ return true;
+ }
+ return false;
+ };
+};
+
+void ParseHandler::_detect_system_fallback_encoding() {
+ _system_fallback_encoding = Encoding::Unknown;
+ LangCodeView lang_code;
+
+#ifdef _WIN32
+ using namespace std::string_view_literals;
+
+ // Every Windows language id mapped to a language code according to https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/63d3d639-7fd2-4afb-abbe-0d5b5551eef8
+ constexpr std::array lang_id_to_lang_code = std::to_array<std::pair<std::uint8_t, LangCodeView>>({
+ { 0x0001, "ar" },
+ { 0x0002, "bg" },
+ { 0x0003, "ca" },
+ { 0x0004, "zh" },
+ { 0x0005, "cs" },
+ { 0x0006, "da" },
+ { 0x0007, "de" },
+ { 0x0008, "el" },
+ { 0x0009, "en" },
+ { 0x000A, "es" },
+ { 0x000B, "fi" },
+ { 0x000C, "fr" },
+ { 0x000D, "he" },
+ { 0x000E, "hu" },
+ { 0x000F, "is" },
+ { 0x0010, "it" },
+ { 0x0011, "ja" },
+ { 0x0012, "ko" },
+ { 0x0013, "nl" },
+ { 0x0014, "no" },
+ { 0x0015, "pl" },
+ { 0x0016, "pt" },
+ { 0x0017, "rm" },
+ { 0x0018, "ro" },
+ { 0x0019, "ru" },
+ { 0x001A, "hr" },
+ { 0x001B, "sk" },
+ { 0x001C, "sq" },
+ { 0x001D, "sv" },
+ { 0x001E, "th" },
+ { 0x001F, "tr" },
+ { 0x0020, "ur" },
+ { 0x0021, "id" },
+ { 0x0022, "uk" },
+ { 0x0023, "be" },
+ { 0x0024, "sl" },
+ { 0x0025, "et" },
+ { 0x0026, "lv" },
+ { 0x0027, "lt" },
+ { 0x0028, "tg" },
+ { 0x0029, "fa" },
+ { 0x002A, "vi" },
+ { 0x002B, "hy" },
+ { 0x002C, "az" },
+ { 0x002D, "eu" },
+ { 0x002E, "hsb" },
+ { 0x002F, "mk" },
+ { 0x0030, "st" },
+ { 0x0031, "ts" },
+ { 0x0032, "tn" },
+ { 0x0033, "ve" },
+ { 0x0034, "xh" },
+ { 0x0035, "zu" },
+ { 0x0036, "af" },
+ { 0x0037, "ka" },
+ { 0x0038, "fo" },
+ { 0x0039, "hi" },
+ { 0x003A, "mt" },
+ { 0x003B, "se" },
+ { 0x003C, "ga" },
+ { 0x003D, "yi" },
+ { 0x003E, "ms" },
+ { 0x003F, "kk" },
+ { 0x0040, "ky" },
+ { 0x0041, "sw" },
+ { 0x0042, "tk" },
+ { 0x0043, "uz" },
+ { 0x0044, "tt" },
+ { 0x0045, "bn" },
+ { 0x0046, "pa" },
+ { 0x0047, "gu" },
+ { 0x0048, "or" },
+ { 0x0049, "ta" },
+ { 0x004A, "te" },
+ { 0x004B, "kn" },
+ { 0x004C, "ml" },
+ { 0x004D, "as" },
+ { 0x004E, "mr" },
+ { 0x004F, "sa" },
+ { 0x0050, "mn" },
+ { 0x0051, "bo" },
+ { 0x0052, "cy" },
+ { 0x0053, "km" },
+ { 0x0054, "lo" },
+ { 0x0055, "my" },
+ { 0x0056, "gl" },
+ { 0x0057, "kok" },
+ { 0x0058, "mni" },
+ { 0x0059, "sd" },
+ { 0x005A, "syr" },
+ { 0x005B, "si" },
+ { 0x005C, "chr" },
+ { 0x005D, "iu" },
+ { 0x005E, "am" },
+ { 0x005F, "tzm" },
+ { 0x0060, "ks" },
+ { 0x0061, "ne" },
+ { 0x0062, "fy" },
+ { 0x0063, "ps" },
+ { 0x0064, "fil" },
+ { 0x0065, "dv" },
+ { 0x0066, "bin" },
+ { 0x0067, "ff" },
+ { 0x0068, "ha" },
+ { 0x0069, "ibb" },
+ { 0x006A, "yo" },
+ { 0x006B, "quz" },
+ { 0x006C, "nso" },
+ { 0x006D, "ba" },
+ { 0x006E, "lb" },
+ { 0x006F, "kl" },
+ { 0x0070, "ig" },
+ { 0x0071, "kr" },
+ { 0x0072, "om" },
+ { 0x0073, "ti" },
+ { 0x0074, "gn" },
+ { 0x0075, "haw" },
+ { 0x0076, "la" },
+ { 0x0077, "so" },
+ { 0x0078, "ii" },
+ { 0x0079, "pap" },
+ { 0x007A, "arn" },
+ { 0x007C, "moh" },
+ { 0x007E, "br" },
+ { 0x0080, "ug" },
+ { 0x0081, "mi" },
+ { 0x0082, "oc" },
+ { 0x0083, "co" },
+ { 0x0084, "gsw" },
+ { 0x0085, "sah" },
+ { 0x0086, "qut" },
+ { 0x0087, "rw" },
+ { 0x0088, "wo" },
+ { 0x008C, "prs" },
+ { 0x0091, "gd" },
+ { 0x0092, "ku" },
+ { 0x0093, "quc" } //
+ });
+
+#pragma pack(push, 1)
+ struct LocaleStruct {
+ struct {
+ uint8_t language_id;
+ uint8_t country_id;
+ } language_country;
+ uint8_t sort_id : 4;
+ uint16_t reserved : 12;
+ };
+#pragma pack(pop)
+
+ std::uint32_t locale_int = GetSystemDefaultLCID();
+ LocaleStruct locale_id;
+ std::memcpy(&locale_id, &locale_int, sizeof(locale_id));
+ // first 16 bytes are language-country id, next 4 are sort id, last 12 bytes are reserved
+ // first 8 are the language id, last 8 bytes are a country id
+ const std::uint8_t& lang_id = locale_id.language_country.language_id;
+
+ for (const auto& map : lang_id_to_lang_code) {
+ if (map.first != lang_id) continue;
+ lang_code = map.second;
+ break;
+ }
+#else
+ lang_code = std::getenv("LANG");
+#endif
+
+ constexpr FallbackSetter setter { _system_fallback_encoding };
+
+ if (lang_code.size() < 2) {
+ _system_fallback_encoding = Encoding::Unknown;
+ return;
+ }
+
+#define WIN1251(LANG_CODE) \
+ if (setter.encoded<Encoding::Windows1251, #LANG_CODE>(lang_code)) return;
+
+#define WIN1252(LANG_CODE) \
+ if (setter.encoded<Encoding::Windows1252, #LANG_CODE>(lang_code)) return;
+
+ // More common, prefer
+ WIN1252(en);
+ WIN1252(es);
+ WIN1252(fr);
+ WIN1252(de);
+
+ WIN1251(ru);
+
+ WIN1252(af);
+ WIN1252(sq);
+ WIN1252(eu);
+ WIN1252(br);
+ WIN1252(co);
+ WIN1252(fo);
+ WIN1252(gl);
+ WIN1252(is);
+ WIN1252(io);
+ WIN1252(ga);
+ WIN1252(id);
+ WIN1252(in);
+ WIN1252(it);
+ WIN1252(lb);
+ WIN1252(ms);
+ WIN1252(gv);
+ WIN1252(no);
+ WIN1252(oc);
+ WIN1252(pt);
+ WIN1252(gd);
+ WIN1252(sw);
+ WIN1252(fi);
+ WIN1252(da);
+ WIN1252(et);
+ WIN1252(tn);
+ WIN1252(ca);
+ WIN1252(rm);
+ WIN1252(nl);
+ WIN1252(sl);
+ WIN1252(cy);
+ WIN1252(hu);
+
+ WIN1251(be);
+ WIN1251(uk);
+ WIN1251(bg);
+ WIN1251(kk);
+ WIN1251(tg);
+ WIN1251(sr);
+ WIN1251(ky);
+ WIN1251(mn);
+ WIN1251(mk);
+ WIN1251(mo);
+
+ if (lang_code.size() < 3) {
+ return;
+ }
+
+ WIN1251(mol);
+
+ WIN1252(ast);
+ WIN1252(jbo);
+ WIN1252(gla);
+ WIN1252(sco);
+ WIN1252(sma);
+ WIN1252(roo);
+ WIN1252(swa);
+ WIN1252(tsn);
+ WIN1252(tok);
+
+#undef WIN1251
+#undef WIN1252
+} \ No newline at end of file