diff options
author | Spartan322 <Megacake1234@gmail.com> | 2024-05-09 16:06:02 +0200 |
---|---|---|
committer | Spartan322 <Megacake1234@gmail.com> | 2024-06-18 01:31:12 +0200 |
commit | b0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch) | |
tree | f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/detail/Detect.cpp | |
parent | 7b521d6023113372cf6b02e562828273c4040f0e (diff) |
Add runtime encoding detection and conversion (branch: fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
Add manually-specified encoding fallback
Add default system encoding fallback
Add error recovery to v2script
Add unknown encoding detection warning
Remove csv::Parser templating
Fix lua files dropping data
Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
Remove exclusive reliance on lexy::default_encoding for v2script
Move internal concepts to src/openvic-detail/InternalConcepts.hpp
Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
Move openvic-dataloader/AbstractSyntaxTree.hpp to src
Move DiagnosticLogger.hpp to src
Move File.hpp to src
Move openvic-dataloader/detail/utility files to openvic-dataloader/detail
Add ovdl::utility::type_concat
Add ovdl::utility::type_prepend
Add ovdl::utility::is_instance_of
Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/detail/Detect.cpp')
-rw-r--r-- | src/openvic-dataloader/detail/Detect.cpp | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp new file mode 100644 index 0000000..1516fc7 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.cpp @@ -0,0 +1,351 @@ +#include "detail/Detect.hpp" + +using namespace ovdl; +using namespace ovdl::encoding_detect; + +static constexpr int64_t INVALID_CLASS = 255; + +std::optional<int64_t> Utf8Canidate::read(const std::span<const cbyte>& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size()); + if (is_utf8(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional<int64_t> AsciiCanidate::read(const std::span<const cbyte>& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size()); + if (is_ascii(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional<int64_t> NonLatinCasedCanidate::read(const std::span<const cbyte>& buffer) { + static constexpr cbyte LATIN_LETTER = 1; + static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20; + static constexpr int64_t NON_LATIN_ALL_CAPS_PENALTY = -40; + static constexpr int64_t NON_LATIN_CAPITALIZATION_BONUS = 40; + static constexpr int64_t LATIN_ADJACENCY_PENALTY = -50; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_ascii == 0 && ascii; + const bool non_ascii_alphabetic = score_data.is_non_latin_alphabetic(caseless_class); + + if (caseless_class == LATIN_LETTER) { + case_state = CaseState::Mix; + } else if (!non_ascii_alphabetic) { + switch (case_state) { + default: break; + case CaseState::UpperLower: + score += NON_LATIN_CAPITALIZATION_BONUS; + break; + case CaseState::AllCaps: 
+ // pass + break; + case CaseState::Mix: + score += NON_LATIN_MIXED_CASE_PENALTY * current_word_len; + break; + } + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Lower; + break; + case CaseState::Upper: + case_state = CaseState::UpperLower; + break; + case CaseState::AllCaps: + case_state = CaseState::Mix; + break; + } + } else { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case_state = CaseState::AllCaps; + break; + case CaseState::Lower: + case CaseState::UpperLower: + case_state = CaseState::Mix; + break; + } + } + + if (non_ascii_alphabetic) { + current_word_len += 1; + } else { + if (current_word_len > longest_word) { + longest_word = current_word_len; + } + current_word_len = 0; + } + + const bool is_a0 = b == 0xA0; + + if (!ascii_pair) { + // 0xA0 is no-break space in many other encodings, so avoid + // assigning score to IBM866 when 0xA0 occurs next to itself + // or a space-like byte. 
+ if (!(ibm866 && ((is_a0 && (prev_was_a0 || prev == 0)) || caseless_class == 0 && prev_was_a0))) { + score += score_data.score(caseless_class, prev); + } + + if (prev == LATIN_LETTER && + non_ascii_alphabetic) { + score += LATIN_ADJACENCY_PENALTY; + } else if (caseless_class == LATIN_LETTER && score_data.is_non_latin_alphabetic(prev)) { + score += LATIN_ADJACENCY_PENALTY; + } + } + + prev_ascii = ascii; + prev = caseless_class; + prev_was_a0 = is_a0; + } + return score; +} + +std::optional<int64_t> LatinCanidate::read(const std::span<const cbyte>& buffer) { + static constexpr int64_t IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY = -180; + static constexpr int64_t ORDINAL_BONUS = 300; + static constexpr int64_t COPYRIGHT_BONUS = 222; + static constexpr int64_t IMPLAUSIBILITY_PENALTY = -220; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_non_ascii == 0 && ascii; + + int16_t non_ascii_penalty = -200; + switch (prev_non_ascii) { + case 0: + case 1: + case 2: + non_ascii_penalty = 0; + break; + case 3: + non_ascii_penalty = -5; + break; + case 4: + non_ascii_penalty = 20; + break; + } + score += non_ascii_penalty; + + if (!score_data.is_latin_alphabetic(caseless_class)) { + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + if (case_state == CaseState::AllCaps && !ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + case_state = CaseState::Lower; + } else { + switch (case_state) { + case CaseState::Lower: + if (!ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + [[fallthrough]]; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case CaseState::AllCaps: + case_state = CaseState::AllCaps; + break; + } + } + + bool ascii_ish_pair = 
ascii_pair || (ascii && prev == 0) || (caseless_class == 0 && prev_non_ascii == 0); + + if (!ascii_ish_pair) { + score += score_data.score(caseless_class, prev); + } + + if (windows1252) { + switch (ordinal_state) { + case OrdinalState::Other: + if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } + break; + case OrdinalState::Space: + if (caseless_class == 0) { + // pass + } else if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (b == 'M' || b == 'D' || b == 'S') { + ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; + } else if (b == 'N') { + // numero or Nuestra + ordinal_state = OrdinalState::UpperN; + } else if (b == 'n') { + // numero + ordinal_state = OrdinalState::LowerN; + } else if (caseless_class == ASCII_DIGIT) { + ordinal_state = OrdinalState::Digit; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24) + /* X */ + { + ordinal_state = OrdinalState::Roman; + } else if (b == 0xA9) { + ordinal_state = OrdinalState::Copyright; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpace: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + case OrdinalState::OrdinalExpectingSpaceUndoImplausibility: + if (caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigit: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily: + if 
(caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::UpperN: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::LowerN: + if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::FeminineAbbreviationStartLetter: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Digit: + if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Roman: + if (b == 0xAA || b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ 
|| caseless_class == 24) + /* X */ + { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::PeriodAfterN: + if (b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Copyright: + if (caseless_class == 0) { + score += COPYRIGHT_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + } + } + + if (ascii) { + prev_non_ascii = 0; + } else { + prev_non_ascii += 1; + } + prev = caseless_class; + } + return score; +} + +template struct ovdl::encoding_detect::DetectUtf8<true>; +template struct ovdl::encoding_detect::DetectUtf8<false>; |