aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader/detail/Detect.cpp
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com>2024-05-09 16:06:02 +0200
committer Spartan322 <Megacake1234@gmail.com>2024-06-18 01:31:12 +0200
commitb0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch)
treef15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/detail/Detect.cpp
parent7b521d6023113372cf6b02e562828273c4040f0e (diff)
Add runtime encoding detection and conversion (branch: fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
Add manually-specified encoding fallback
Add default system encoding fallback
Add error recovery to v2script
Add unknown encoding detection warning
Remove csv::Parser templating
Fix lua files dropping data
Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
Remove exclusive reliance on lexy::default_encoding for v2script
Move internal concepts to src/openvic-detail/InternalConcepts.hpp
Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
Move openvic-dataloader/AbstractSyntaxTree.hpp to src
Move DiagnosticLogger.hpp to src
Move File.hpp to src
Move openvic-dataloader/detail/utility files to openvic-dataloader/detail
Add ovdl::utility::type_concat
Add ovdl::utility::type_prepend
Add ovdl::utility::is_instance_of
Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/detail/Detect.cpp')
-rw-r--r--src/openvic-dataloader/detail/Detect.cpp351
1 files changed, 351 insertions, 0 deletions
diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp
new file mode 100644
index 0000000..1516fc7
--- /dev/null
+++ b/src/openvic-dataloader/detail/Detect.cpp
@@ -0,0 +1,351 @@
+#include "detail/Detect.hpp"
+
+using namespace ovdl;
+using namespace ovdl::encoding_detect;
+
+// Sentinel compared against the result of `score_data.classify(byte)`.
+// A candidate's read() returns std::nullopt as soon as a byte classifies to this value.
+static constexpr int64_t INVALID_CLASS = 255;
+
+// Checks `buffer` against the UTF-8 candidate.
+//
+// The bytes are wrapped in a lexy buffer and passed to is_utf8(). This candidate does
+// not grade the text: the result is a flat score of 0 when the check passes and
+// std::nullopt when it does not.
+std::optional<int64_t> Utf8Canidate::read(const std::span<const cbyte>& buffer) {
+	auto utf8_input = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(
+		buffer.data(), buffer.size()
+	);
+	return is_utf8(utf8_input) ? std::optional<int64_t> { 0 } : std::nullopt;
+}
+
+// Checks `buffer` against the ASCII candidate.
+//
+// The bytes are wrapped in a lexy buffer and passed to is_ascii(). Returns std::nullopt
+// when that check fails, otherwise a flat score of 0.
+std::optional<int64_t> AsciiCanidate::read(const std::span<const cbyte>& buffer) {
+	auto ascii_input = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(
+		buffer.data(), buffer.size()
+	);
+
+	if (!is_ascii(ascii_input)) {
+		return std::nullopt;
+	}
+	return 0;
+}
+
+// Scores `buffer` as text in this candidate's cased, non-Latin single-byte encoding.
+//
+// Each byte is classified through `score_data`. The score is built from:
+//   * the pair score of (current class, previous class), skipped for pure ASCII pairs,
+//   * a bonus for words that start upper case and continue lower case,
+//   * a per-letter penalty for words with mixed case,
+//   * a penalty when a Latin letter sits directly next to a non-Latin letter.
+//
+// Returns the accumulated score for this buffer, or std::nullopt as soon as a byte
+// classifies as INVALID_CLASS.
+//
+// `prev`, `prev_ascii`, `prev_was_a0`, `case_state`, `current_word_len` and `longest_word`
+// are not locals, so consecutive calls continue from where the previous buffer ended.
+std::optional<int64_t> NonLatinCasedCanidate::read(const std::span<const cbyte>& buffer) {
+	static constexpr cbyte LATIN_LETTER = 1;
+	static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20;
+	static constexpr int64_t NON_LATIN_CAPITALIZATION_BONUS = 40;
+	static constexpr int64_t LATIN_ADJACENCY_PENALTY = -50;
+
+	int64_t score = 0;
+	for (const ubyte b : buffer) {
+		const ubyte byte_class = score_data.classify(b);
+		if (byte_class == INVALID_CLASS) {
+			return std::nullopt;
+		}
+
+		// Bit 7 of the class is the case flag (set = upper case); the low 7 bits are the letter class.
+		const ubyte caseless_class = byte_class & 0x7F;
+		const bool upper_case = (byte_class >> 7) != 0;
+		const bool ascii = b < 0x80;
+		// True only when BOTH the previous and the current byte are ASCII. `prev_ascii` holds the
+		// previous byte's `ascii` flag, so it must be tested for truth, not compared against 0.
+		const bool ascii_pair = prev_ascii && ascii;
+		const bool non_latin_alphabetic = score_data.is_non_latin_alphabetic(caseless_class);
+
+		if (caseless_class == LATIN_LETTER) {
+			// A Latin letter inside a non-Latin word makes the whole word "mixed".
+			case_state = CaseState::Mix;
+		} else if (!non_latin_alphabetic) {
+			// Word boundary: settle the score of the word that just ended (once per word).
+			switch (case_state) {
+				default: break;
+				case CaseState::UpperLower:
+					score += NON_LATIN_CAPITALIZATION_BONUS;
+					break;
+				case CaseState::AllCaps:
+					// All-caps words are neither rewarded nor penalised.
+					break;
+				case CaseState::Mix:
+					// Applied per letter of the word.
+					score += NON_LATIN_MIXED_CASE_PENALTY * static_cast<int64_t>(current_word_len);
+					break;
+			}
+			case_state = CaseState::Space;
+		} else if (!upper_case) {
+			// Lower-case non-Latin letter.
+			switch (case_state) {
+				default: break;
+				case CaseState::Space:
+					case_state = CaseState::Lower;
+					break;
+				case CaseState::Upper:
+					case_state = CaseState::UpperLower;
+					break;
+				case CaseState::AllCaps:
+					case_state = CaseState::Mix;
+					break;
+			}
+		} else {
+			// Upper-case non-Latin letter.
+			switch (case_state) {
+				default: break;
+				case CaseState::Space:
+					case_state = CaseState::Upper;
+					break;
+				case CaseState::Upper:
+					case_state = CaseState::AllCaps;
+					break;
+				case CaseState::Lower:
+				case CaseState::UpperLower:
+					case_state = CaseState::Mix;
+					break;
+			}
+		}
+
+		// Track the length of the current word and remember the longest one seen.
+		if (non_latin_alphabetic) {
+			current_word_len += 1;
+		} else {
+			if (current_word_len > longest_word) {
+				longest_word = current_word_len;
+			}
+			current_word_len = 0;
+		}
+
+		const bool is_a0 = b == 0xA0;
+
+		if (!ascii_pair) {
+			// 0xA0 is no-break space in many other encodings, so avoid
+			// assigning score to IBM866 when 0xA0 occurs next to itself
+			// or a space-like byte.
+			const bool ambiguous_a0 =
+				ibm866 && ((is_a0 && (prev_was_a0 || prev == 0)) || (caseless_class == 0 && prev_was_a0));
+			if (!ambiguous_a0) {
+				score += score_data.score(caseless_class, prev);
+			}
+
+			// Penalise a Latin letter touching a non-Latin letter, in either order (at most once per pair).
+			if ((prev == LATIN_LETTER && non_latin_alphabetic) ||
+				(caseless_class == LATIN_LETTER && score_data.is_non_latin_alphabetic(prev))) {
+				score += LATIN_ADJACENCY_PENALTY;
+			}
+		}
+
+		prev_ascii = ascii;
+		prev = caseless_class;
+		prev_was_a0 = is_a0;
+	}
+	return score;
+}
+
+// Scores `buffer` as text in this candidate's Latin single-byte encoding.
+//
+// Each byte is classified through `score_data`. The score is built from:
+//   * a penalty that grows with the length of the current run of non-ASCII bytes,
+//   * a penalty for implausible case transitions (lower after all-caps, upper after lower),
+//   * the pair score of (current class, previous class), skipped for ASCII-ish pairs,
+//   * when `windows1252` is set, bonuses for the ordinal indicators 0xAA / 0xBA and the
+//     copyright sign 0xA9 appearing in plausible positions.
+//
+// Returns the accumulated score for this buffer, or std::nullopt as soon as a byte
+// classifies as INVALID_CLASS.
+//
+// `prev`, `prev_non_ascii`, `case_state` and `ordinal_state` are not locals, so consecutive
+// calls continue from where the previous buffer ended.
+std::optional<int64_t> LatinCanidate::read(const std::span<const cbyte>& buffer) {
+	static constexpr int64_t IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY = -180;
+	static constexpr int64_t ORDINAL_BONUS = 300;
+	static constexpr int64_t COPYRIGHT_BONUS = 222;
+	static constexpr int64_t IMPLAUSIBILITY_PENALTY = -220;
+
+	int64_t score = 0;
+	for (const ubyte b : buffer) {
+		const ubyte byte_class = score_data.classify(b);
+		if (byte_class == INVALID_CLASS) {
+			return std::nullopt;
+		}
+
+		// Bit 7 of the class is the case flag (set = upper case); the low 7 bits are the letter class.
+		const ubyte caseless_class = byte_class & 0x7F;
+		const bool upper_case = (byte_class >> 7) != 0;
+		const bool ascii = b < 0x80;
+		const bool ascii_pair = prev_non_ascii == 0 && ascii;
+		// Class 0 is treated as "space-like" throughout the state machines below.
+		const bool space_like = caseless_class == 0;
+		// Classes 9, 22 and 24 are the letters I, V and X.
+		const bool roman_numeral_letter = caseless_class == 9 || caseless_class == 22 || caseless_class == 24;
+
+		// Long runs of non-ASCII bytes are unlikely in Latin-script text; the penalty must
+		// never be positive, otherwise a run of exactly four bytes would be rewarded.
+		int64_t non_ascii_penalty = 0;
+		switch (prev_non_ascii) {
+			case 0:
+			case 1:
+			case 2:
+				non_ascii_penalty = 0;
+				break;
+			case 3:
+				non_ascii_penalty = -5;
+				break;
+			case 4:
+				non_ascii_penalty = -20;
+				break;
+			default:
+				non_ascii_penalty = -200;
+				break;
+		}
+		score += non_ascii_penalty;
+
+		if (!score_data.is_latin_alphabetic(caseless_class)) {
+			case_state = CaseState::Space;
+		} else if (!upper_case) {
+			// Lower case directly after two or more upper-case letters.
+			if (case_state == CaseState::AllCaps && !ascii_pair) {
+				score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
+			}
+			case_state = CaseState::Lower;
+		} else {
+			switch (case_state) {
+				case CaseState::Lower:
+					// Upper case directly after lower case.
+					if (!ascii_pair) {
+						score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
+					}
+					[[fallthrough]];
+				case CaseState::Space:
+					case_state = CaseState::Upper;
+					break;
+				case CaseState::Upper:
+				case CaseState::AllCaps:
+					case_state = CaseState::AllCaps;
+					break;
+			}
+		}
+
+		// A space-like byte (which may itself be non-ASCII) paired with an ASCII byte counts
+		// as "ASCII-ish" and gets no pair score.
+		const bool ascii_ish_pair = ascii_pair || (ascii && prev == 0) || (space_like && prev_non_ascii == 0);
+
+		if (!ascii_ish_pair) {
+			score += score_data.score(caseless_class, prev);
+		}
+
+		if (windows1252) {
+			// State machine recognising ordinal indicators (0xAA, 0xBA) and the copyright sign
+			// (0xA9) in context. The "UndoImplausibility" states add
+			// ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY, i.e. the bonus plus 220.
+			// NOTE(review): presumably this cancels a penalty already applied by the pair score
+			// for a letter/digit followed by the indicator - confirm against score_data.
+			switch (ordinal_state) {
+				case OrdinalState::Other:
+					if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					}
+					break;
+				case OrdinalState::Space:
+					if (space_like) {
+						// pass
+					} else if (b == 0xAA || b == 0xBA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpace;
+					} else if (b == 'M' || b == 'D' || b == 'S') {
+						ordinal_state = OrdinalState::FeminineAbbreviationStartLetter;
+					} else if (b == 'N') {
+						// numero or Nuestra
+						ordinal_state = OrdinalState::UpperN;
+					} else if (b == 'n') {
+						// numero
+						ordinal_state = OrdinalState::LowerN;
+					} else if (caseless_class == ASCII_DIGIT) {
+						ordinal_state = OrdinalState::Digit;
+					} else if (roman_numeral_letter) {
+						ordinal_state = OrdinalState::Roman;
+					} else if (b == 0xA9) {
+						ordinal_state = OrdinalState::Copyright;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::OrdinalExpectingSpace:
+					if (space_like) {
+						score += ORDINAL_BONUS;
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					// Must not fall through: the next case would add a second, larger bonus.
+					break;
+				case OrdinalState::OrdinalExpectingSpaceUndoImplausibility:
+					if (space_like) {
+						score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::OrdinalExpectingSpaceOrDigit:
+					if (space_like) {
+						score += ORDINAL_BONUS;
+						ordinal_state = OrdinalState::Space;
+					} else if (caseless_class == ASCII_DIGIT) {
+						score += ORDINAL_BONUS;
+						// Deliberately set to `Other`
+						ordinal_state = OrdinalState::Other;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily:
+					if (space_like) {
+						score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
+						ordinal_state = OrdinalState::Space;
+					} else if (caseless_class == ASCII_DIGIT) {
+						score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
+						// Deliberately set to `Other`
+						ordinal_state = OrdinalState::Other;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::UpperN:
+					if (b == 0xAA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
+					} else if (b == 0xBA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
+					} else if (b == '.') {
+						ordinal_state = OrdinalState::PeriodAfterN;
+					} else if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::LowerN:
+					if (b == 0xBA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
+					} else if (b == '.') {
+						ordinal_state = OrdinalState::PeriodAfterN;
+					} else if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::FeminineAbbreviationStartLetter:
+					if (b == 0xAA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
+					} else if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::Digit:
+					if (b == 0xAA || b == 0xBA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpace;
+					} else if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					} else if (caseless_class == ASCII_DIGIT) {
+						// pass
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::Roman:
+					if (b == 0xAA || b == 0xBA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
+					} else if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					} else if (roman_numeral_letter) {
+						// pass
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::PeriodAfterN:
+					if (b == 0xBA) {
+						ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit;
+					} else if (space_like) {
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+				case OrdinalState::Copyright:
+					if (space_like) {
+						score += COPYRIGHT_BONUS;
+						ordinal_state = OrdinalState::Space;
+					} else {
+						ordinal_state = OrdinalState::Other;
+					}
+					break;
+			}
+		}
+
+		// Length of the current run of non-ASCII bytes; reset by any ASCII byte.
+		if (ascii) {
+			prev_non_ascii = 0;
+		} else {
+			prev_non_ascii += 1;
+		}
+		prev = caseless_class;
+	}
+	return score;
+}
+
+// Explicit instantiation definitions: instantiate DetectUtf8 for both values of its
+// bool template argument in this translation unit.
+template struct ovdl::encoding_detect::DetectUtf8<true>;
+template struct ovdl::encoding_detect::DetectUtf8<false>;