diff options
author | Spartan322 <Megacake1234@gmail.com> | 2024-05-09 16:06:02 +0200 |
---|---|---|
committer | Spartan322 <Megacake1234@gmail.com> | 2024-06-18 01:31:12 +0200 |
commit | b0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch) | |
tree | f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/detail | |
parent | 7b521d6023113372cf6b02e562828273c4040f0e (diff) |
Add runtime encoding detection and conversion (fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
Add manually-specified encoding fallback
Add default system encoding fallback
Add error recovery to v2script
Add unknown encoding detection warning
Remove csv::Parser templating
Fix lua files dropping data
Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
Remove exclusive reliance on lexy::default_encoding for v2script
Move internal concepts to src/openvic-detail/InternalConcepts.hpp
Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
Move openvic-dataloader/AbstractSyntaxTree.hpp to src
Move DiagnosticLogger.hpp to src
Move File.hpp to src
Move openvic-dataloader/detail/utility files to openvic-dataloader/detail
Add ovdl::utility::type_concat
Add ovdl::utility::type_prepend
Add ovdl::utility::is_instance_of
Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/detail')
-rw-r--r-- | src/openvic-dataloader/detail/Convert.hpp | 577 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/Detect.cpp | 351 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/Detect.hpp | 627 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/DetectUtf8.hpp | 53 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/Errors.hpp | 25 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/InternalConcepts.hpp | 127 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/ParseHandler.cpp | 347 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/ParseHandler.hpp | 199 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/Warnings.hpp | 9 | ||||
-rw-r--r-- | src/openvic-dataloader/detail/dsl.hpp | 194 |
10 files changed, 2330 insertions, 179 deletions
diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp new file mode 100644 index 0000000..5d9fca0 --- /dev/null +++ b/src/openvic-dataloader/detail/Convert.hpp @@ -0,0 +1,577 @@ +#pragma once + +#include <cstddef> +#include <string_view> +#include <type_traits> + +#include <lexy/_detail/config.hpp> +#include <lexy/callback/string.hpp> +#include <lexy/code_point.hpp> +#include <lexy/dsl/option.hpp> +#include <lexy/dsl/symbol.hpp> +#include <lexy/encoding.hpp> +#include <lexy/input/base.hpp> +#include <lexy/input/file.hpp> +#include <lexy/input/string_input.hpp> +#include <lexy/lexeme.hpp> + +#include "openvic-dataloader/detail/Encoding.hpp" + +#include "ParseState.hpp" // IWYU pragma: keep +#include "detail/InternalConcepts.hpp" +#include "detail/dsl.hpp" +#include "v2script/ParseState.hpp" + +namespace ovdl::convert { + struct MappedChar { + char value; + std::string_view utf8; + + constexpr bool is_invalid() const { return value == 0; } + constexpr bool is_pass() const { return value == 1; } + }; + constexpr MappedChar invalid_map { 0, "" }; + constexpr MappedChar pass_map { 1, "" }; + + struct map_value { + std::string_view _value; + + constexpr map_value() noexcept : _value("") {} + constexpr map_value(std::nullptr_t) noexcept : _value("\0") {} + constexpr explicit map_value(std::string_view val) noexcept : _value(val) {} + + constexpr bool is_invalid() const { + return !_value.empty() && _value[0] == '\0'; + } + + constexpr bool is_pass() const { + return _value.empty(); + } + + constexpr bool is_valid() const noexcept { + return !_value.empty() && _value[0] != '\0'; + } + + constexpr explicit operator bool() const noexcept { + return is_valid(); + } + }; + + template<typename T> + concept IsConverter = requires(unsigned char c, lexy::_pr<lexy::deduce_encoding<char>>& reader) { + { T::try_parse(reader) } -> std::same_as<map_value>; + }; + + struct Utf8 { + static constexpr auto map = 
lexy::symbol_table<std::string_view>; + + template<typename Reader> + static constexpr map_value try_parse(Reader& reader) { + return {}; + } + }; + static_assert(IsConverter<Utf8>); + + struct Windows1252 { + static constexpr auto map = lexy::symbol_table<std::string_view> // + .map<'\x80'>("€") + .map<'\x82'>("‚") + .map<'\x83'>("ƒ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("ˆ") + .map<'\x89'>("‰") + .map<'\x8A'>("Š") + .map<'\x8B'>("‹") + .map<'\x8C'>("Œ") + .map<'\x8E'>("Ž") + + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x98'>("˜") + .map<'\x99'>("™") + .map<'\x9A'>("š") + .map<'\x9B'>("›") + .map<'\x9C'>("œ") + .map<'\x9E'>("ž") + .map<'\x9F'>("Ÿ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("¡") + .map<'\xA2'>("¢") + .map<'\xA3'>("£") + .map<'\xA4'>("¤") + .map<'\xA5'>("¥") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("¨") + .map<'\xA9'>("©") + .map<'\xAA'>("ª") + .map<'\xAB'>("«") + .map<'\xAC'>("¬") + .map<'\xAD'>("") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("¯") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("²") + .map<'\xB3'>("³") + .map<'\xB4'>("´") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("¸") + .map<'\xB9'>("¹") + .map<'\xBA'>("º") + .map<'\xBB'>("»") + .map<'\xBC'>("¼") + .map<'\xBD'>("½") + .map<'\xBE'>("¾") + .map<'\xBF'>("¿") + + .map<'\xC0'>("À") + .map<'\xC1'>("Á") + .map<'\xC2'>("Â") + .map<'\xC3'>("Ã") + .map<'\xC4'>("Ä") + .map<'\xC5'>("Å") + .map<'\xC6'>("Æ") + .map<'\xC7'>("Ç") + .map<'\xC8'>("È") + .map<'\xC9'>("É") + .map<'\xCA'>("Ê") + .map<'\xCB'>("Ë") + .map<'\xCC'>("Ì") + .map<'\xCD'>("Í") + .map<'\xCE'>("Î") + .map<'\xCF'>("Ï") + + .map<'\xD0'>("Ð") + .map<'\xD1'>("Ñ") + .map<'\xD2'>("Ò") + .map<'\xD3'>("Ó") + .map<'\xD4'>("Ô") + .map<'\xD5'>("Õ") + .map<'\xD6'>("Ö") + .map<'\xD7'>("×") + .map<'\xD8'>("Ø") + 
.map<'\xD9'>("Ù") + .map<'\xDA'>("Ú") + .map<'\xDB'>("Û") + .map<'\xDC'>("Ü") + .map<'\xDD'>("Ý") + .map<'\xDE'>("Þ") + .map<'\xDF'>("ß") + + .map<'\xE0'>("à") + .map<'\xE1'>("á") + .map<'\xE2'>("â") + .map<'\xE3'>("ã") + .map<'\xE4'>("ä") + .map<'\xE5'>("å") + .map<'\xE6'>("æ") + .map<'\xE7'>("ç") + .map<'\xE8'>("è") + .map<'\xE9'>("é") + .map<'\xEA'>("ê") + .map<'\xEB'>("ë") + .map<'\xEC'>("ì") + .map<'\xED'>("í") + .map<'\xEE'>("î") + .map<'\xEF'>("ï") + + .map<'\xF0'>("ð") + .map<'\xF1'>("ñ") + .map<'\xF2'>("ò") + .map<'\xF3'>("ó") + .map<'\xF4'>("ô") + .map<'\xF5'>("õ") + .map<'\xF6'>("ö") + .map<'\xF7'>("÷") + .map<'\xF8'>("ø") + .map<'\xF9'>("ù") + .map<'\xFA'>("ú") + .map<'\xFB'>("û") + .map<'\xFC'>("ü") + .map<'\xFD'>("ý") + .map<'\xFE'>("þ") + .map<'\xFF'>("ÿ"); + + template<typename Reader> + static constexpr map_value try_parse(Reader& reader) { + auto index = map.try_parse(reader); + if (index) { + return map_value(map[index]); + } + return {}; + } + }; + static_assert(IsConverter<Windows1252>); + + struct Windows1251 { + static constexpr auto map = lexy::symbol_table<std::string_view> // + .map<'\x80'>("Ђ") + .map<'\x81'>("Ѓ") + .map<'\x82'>("‚") + .map<'\x83'>("ѓ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("€") + .map<'\x89'>("‰") + .map<'\x8A'>("Љ") + .map<'\x8B'>("‹") + .map<'\x8C'>("Њ") + .map<'\x8D'>("Ќ") + .map<'\x8E'>("Ћ") + .map<'\x8F'>("Џ") + + .map<'\x90'>("ђ") + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x99'>("™") + .map<'\x9A'>("љ") + .map<'\x9B'>("›") + .map<'\x9C'>("њ") + .map<'\x9D'>("ќ") + .map<'\x9E'>("ћ") + .map<'\x9F'>("џ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("Ў") + .map<'\xA2'>("ў") + .map<'\xA3'>("Ј") + .map<'\xA4'>("¤") + .map<'\xA5'>("Ґ") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("Ё") + .map<'\xA9'>("©") + .map<'\xAA'>("Є") + .map<'\xAB'>("«") + 
.map<'\xAC'>("¬") + .map<'\xAD'>("") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("Ї") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("І") + .map<'\xB3'>("і") + .map<'\xB4'>("ґ") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("ё") + .map<'\xB9'>("№") + .map<'\xBA'>("є") + .map<'\xBB'>("»") + .map<'\xBC'>("ј") + .map<'\xBD'>("Ѕ") + .map<'\xBE'>("ѕ") + .map<'\xBF'>("ї") + + .map<'\xC0'>("А") + .map<'\xC1'>("Б") + .map<'\xC2'>("В") + .map<'\xC3'>("Г") + .map<'\xC4'>("Д") + .map<'\xC5'>("Е") + .map<'\xC6'>("Ж") + .map<'\xC7'>("З") + .map<'\xC8'>("И") + .map<'\xC9'>("Й") + .map<'\xCA'>("К") + .map<'\xCB'>("Л") + .map<'\xCC'>("М") + .map<'\xCD'>("Н") + .map<'\xCE'>("О") + .map<'\xCF'>("П") + + .map<'\xD0'>("Р") + .map<'\xD1'>("С") + .map<'\xD2'>("Т") + .map<'\xD3'>("У") + .map<'\xD4'>("Ф") + .map<'\xD5'>("Х") + .map<'\xD6'>("Ц") + .map<'\xD7'>("Ч") + .map<'\xD8'>("Ш") + .map<'\xD9'>("Щ") + .map<'\xDA'>("Ъ") + .map<'\xDB'>("Ы") + .map<'\xDC'>("Ь") + .map<'\xDD'>("Э") + .map<'\xDE'>("Ю") + .map<'\xDF'>("Я") + + .map<'\xE0'>("а") + .map<'\xE1'>("б") + .map<'\xE2'>("в") + .map<'\xE3'>("г") + .map<'\xE4'>("д") + .map<'\xE5'>("е") + .map<'\xE6'>("ж") + .map<'\xE7'>("з") + .map<'\xE8'>("и") + .map<'\xE9'>("й") + .map<'\xEA'>("к") + .map<'\xEB'>("л") + .map<'\xEC'>("м") + .map<'\xED'>("н") + .map<'\xEE'>("о") + .map<'\xEF'>("п") + + .map<'\xF0'>("р") + .map<'\xF1'>("с") + .map<'\xF2'>("т") + .map<'\xF3'>("у") + .map<'\xF4'>("ф") + .map<'\xF5'>("х") + .map<'\xF6'>("ц") + .map<'\xF7'>("ч") + .map<'\xF8'>("ш") + .map<'\xF9'>("щ") + .map<'\xFA'>("ъ") + .map<'\xFB'>("ы") + .map<'\xFC'>("ь") + .map<'\xFD'>("э") + .map<'\xFE'>("ю") + .map<'\xFF'>("я"); + + template<typename Reader> + static constexpr map_value try_parse(Reader& reader) { + auto index = map.try_parse(reader); + if (index) { + return map_value(map[index]); + } + return {}; + } + }; + static_assert(IsConverter<Windows1251>); + + template<typename Reader> + constexpr map_value 
try_parse_map(detail::Encoding&& encoding, Reader& reader) { + switch (encoding) { + case detail::Encoding::Unknown: + case detail::Encoding::Ascii: + case detail::Encoding::Utf8: return Utf8::try_parse(reader); + case detail::Encoding::Windows1251: return Windows1251::try_parse(reader); + case detail::Encoding::Windows1252: return Windows1252::try_parse(reader); + } + ovdl::detail::unreachable(); + } + + template<typename String> + using _string_char_type = LEXY_DECAY_DECLTYPE(LEXY_DECLVAL(String)[0]); + + template<typename T, typename CharT> + concept IsErrorHandler = + std::is_convertible_v<CharT, char> // + && requires(T t, ovdl::v2script::ast::ParseState& state, lexy::_pr<lexy::deduce_encoding<CharT>> reader) { + { T::on_invalid_character(state, reader) }; + }; + + struct EmptyHandler { + static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) {} + }; + + template<typename String, + IsErrorHandler<_string_char_type<String>> Error = EmptyHandler> + constexpr auto convert_as_string = + dsl::sink<String>( + lexy::fold_inplace<String>( + std::initializer_list<_string_char_type<String>> {}, // + []<typename CharT, typename = decltype(LEXY_DECLVAL(String).push_back(CharT()))>(String& result, detail::IsStateType auto& state, CharT c) { + if constexpr (std::is_convertible_v<CharT, char>) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + map_value val = {}; + CharT char_array[] { c, CharT() }; + auto input = lexy::range_input(&char_array[0], &char_array[1]); + auto reader = input.reader(); + + // prefer preserving unknown conversion maps, least things will work, they'll just probably display wrong + // map = make_map_from(state.encoding(), c); + val = try_parse_map(state.encoding(), reader); + + // Invalid characters are dropped + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + return; + } + + // non-pass 
characters are not valid ascii and are mapped to utf8 values + if (!val.is_pass()) { + result.append(val._value); + return; + } + + break; + } + } + } + + result.push_back(c); // + }, // + [](String& result, detail::IsStateType auto& state, String&& str) { + if constexpr (std::is_convertible_v<typename String::value_type, char>) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + auto input = lexy::string_input(str); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(str.size()); }) { + result.reserve(str.size()); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + break; + } + } + } + + result.append(LEXY_MOV(str)); // + }, // + []<typename Str = String, typename Iterator>(String& result, detail::IsStateType auto& state, Iterator begin, Iterator end) // + -> decltype(void(LEXY_DECLVAL(Str).append(begin, end))) { + if constexpr (std::is_convertible_v<typename String::value_type, char>) { + switch (state.encoding()) { + using enum ovdl::detail::Encoding; + case Ascii: + case Utf8: + break; + // Skip Ascii and Utf8 encoding + default: { + auto input = lexy::range_input(begin, end); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(end - begin); }) { + 
result.reserve(end - begin); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + break; + } + } + } + + result.append(begin, end); // + }, // + []<typename Reader>(String& result, detail::IsStateType auto& state, lexy::lexeme<Reader> lex) { + using encoding = typename Reader::encoding; + using _char_type = _string_char_type<String>; + static_assert(lexy::char_type_compatible_with_reader<Reader, _char_type>, + "cannot convert lexeme to this string type"); + + if constexpr ((std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) && + std::convertible_to<typename String::value_type, char>) { + auto input = lexy::range_input(lex.begin(), lex.end()); + auto reader = input.reader(); + using encoding = decltype(reader)::encoding; + constexpr auto eof = encoding::eof(); + + if constexpr (requires { result.reserve(lex.end() - lex.begin()); }) { + result.reserve(lex.end() - lex.begin()); + } + + auto begin = reader.position(); + auto last_it = begin; + while (reader.peek() != eof) { + map_value val = try_parse_map(state.encoding(), reader); + + if (val.is_invalid()) { + Error::on_invalid_character(state, reader); + reader.bump(); + continue; + } else if (!val.is_pass()) { + result.append(val._value); + last_it = reader.position(); + continue; + } + + reader.bump(); + result.append(last_it, reader.position()); + last_it = reader.position(); + } + if (last_it != begin) { + result.append(last_it, reader.position()); + return; + } + } + + 
result.append(lex.begin(), lex.end()); // + })); +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp new file mode 100644 index 0000000..1516fc7 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.cpp @@ -0,0 +1,351 @@ +#include "detail/Detect.hpp" + +using namespace ovdl; +using namespace ovdl::encoding_detect; + +static constexpr int64_t INVALID_CLASS = 255; + +std::optional<int64_t> Utf8Canidate::read(const std::span<const cbyte>& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size()); + if (is_utf8(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional<int64_t> AsciiCanidate::read(const std::span<const cbyte>& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size()); + if (is_ascii(lexy_buffer)) { + return 0; + } + + return std::nullopt; +} + +std::optional<int64_t> NonLatinCasedCanidate::read(const std::span<const cbyte>& buffer) { + static constexpr cbyte LATIN_LETTER = 1; + static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20; + static constexpr int64_t NON_LATIN_ALL_CAPS_PENALTY = -40; + static constexpr int64_t NON_LATIN_CAPITALIZATION_BONUS = 40; + static constexpr int64_t LATIN_ADJACENCY_PENALTY = -50; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_ascii == 0 && ascii; + const bool non_ascii_alphabetic = score_data.is_non_latin_alphabetic(caseless_class); + + if (caseless_class == LATIN_LETTER) { + case_state = CaseState::Mix; + } else if (!non_ascii_alphabetic) { + switch (case_state) { + default: break; + case CaseState::UpperLower: + score += NON_LATIN_CAPITALIZATION_BONUS; + 
break; + case CaseState::AllCaps: + // pass + break; + case CaseState::Mix: + score += NON_LATIN_MIXED_CASE_PENALTY * current_word_len; + break; + } + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Lower; + break; + case CaseState::Upper: + case_state = CaseState::UpperLower; + break; + case CaseState::AllCaps: + case_state = CaseState::Mix; + break; + } + } else { + switch (case_state) { + default: break; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case_state = CaseState::AllCaps; + break; + case CaseState::Lower: + case CaseState::UpperLower: + case_state = CaseState::Mix; + break; + } + } + + if (non_ascii_alphabetic) { + current_word_len += 1; + } else { + if (current_word_len > longest_word) { + longest_word = current_word_len; + } + current_word_len = 0; + } + + const bool is_a0 = b == 0xA0; + + if (!ascii_pair) { + // 0xA0 is no-break space in many other encodings, so avoid + // assigning score to IBM866 when 0xA0 occurs next to itself + // or a space-like byte. 
+ if (!(ibm866 && ((is_a0 && (prev_was_a0 || prev == 0)) || caseless_class == 0 && prev_was_a0))) { + score += score_data.score(caseless_class, prev); + } + + if (prev == LATIN_LETTER && + non_ascii_alphabetic) { + score += LATIN_ADJACENCY_PENALTY; + } else if (caseless_class == LATIN_LETTER && score_data.is_non_latin_alphabetic(prev)) { + score += LATIN_ADJACENCY_PENALTY; + } + } + + prev_ascii = ascii; + prev = caseless_class; + prev_was_a0 = is_a0; + } + return score; +} + +std::optional<int64_t> LatinCanidate::read(const std::span<const cbyte>& buffer) { + static constexpr int64_t IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY = -180; + static constexpr int64_t ORDINAL_BONUS = 300; + static constexpr int64_t COPYRIGHT_BONUS = 222; + static constexpr int64_t IMPLAUSIBILITY_PENALTY = -220; + + int64_t score = 0; + for (const ubyte& b : buffer) { + const ubyte byte_class = score_data.classify(b); + if (byte_class == INVALID_CLASS) { + return std::nullopt; + } + + const ubyte caseless_class = byte_class & 0x7F; + const bool ascii = b < 0x80; + const bool ascii_pair = prev_non_ascii == 0 && ascii; + + int16_t non_ascii_penalty = -200; + switch (prev_non_ascii) { + case 0: + case 1: + case 2: + non_ascii_penalty = 0; + break; + case 3: + non_ascii_penalty = -5; + break; + case 4: + non_ascii_penalty = 20; + break; + } + score += non_ascii_penalty; + + if (!score_data.is_latin_alphabetic(caseless_class)) { + case_state = CaseState::Space; + } else if (byte_class >> 7 == 0) { + if (case_state == CaseState::AllCaps && !ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + case_state = CaseState::Lower; + } else { + switch (case_state) { + case CaseState::Lower: + if (!ascii_pair) { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + [[fallthrough]]; + case CaseState::Space: + case_state = CaseState::Upper; + break; + case CaseState::Upper: + case CaseState::AllCaps: + case_state = CaseState::AllCaps; + break; + } + } + + bool ascii_ish_pair = 
ascii_pair || (ascii && prev == 0) || (caseless_class == 0 && prev_non_ascii == 0); + + if (!ascii_ish_pair) { + score += score_data.score(caseless_class, prev); + } + + if (windows1252) { + switch (ordinal_state) { + case OrdinalState::Other: + if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } + break; + case OrdinalState::Space: + if (caseless_class == 0) { + // pass + } else if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (b == 'M' || b == 'D' || b == 'S') { + ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; + } else if (b == 'N') { + // numero or Nuestra + ordinal_state = OrdinalState::UpperN; + } else if (b == 'n') { + // numero + ordinal_state = OrdinalState::LowerN; + } else if (caseless_class == ASCII_DIGIT) { + ordinal_state = OrdinalState::Digit; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24) + /* X */ + { + ordinal_state = OrdinalState::Roman; + } else if (b == 0xA9) { + ordinal_state = OrdinalState::Copyright; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpace: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + case OrdinalState::OrdinalExpectingSpaceUndoImplausibility: + if (caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigit: + if (caseless_class == 0) { + score += ORDINAL_BONUS; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily: + if 
(caseless_class == 0) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + // Deliberately set to `Other` + ordinal_state = OrdinalState::Other; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::UpperN: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::LowerN: + if (b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if (b == '.') { + ordinal_state = OrdinalState::PeriodAfterN; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::FeminineAbbreviationStartLetter: + if (b == 0xAA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Digit: + if (b == 0xAA || b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == ASCII_DIGIT) { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Roman: + if (b == 0xAA || b == 0xBA) { + ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else if (caseless_class == 9 /* I */ || caseless_class == 22 /* V */ 
|| caseless_class == 24) + /* X */ + { + // pass + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::PeriodAfterN: + if (b == 0xBA) { + ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; + } else if (caseless_class == 0) { + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + case OrdinalState::Copyright: + if (caseless_class == 0) { + score += COPYRIGHT_BONUS; + ordinal_state = OrdinalState::Space; + } else { + ordinal_state = OrdinalState::Other; + } + break; + } + } + + if (ascii) { + prev_non_ascii = 0; + } else { + prev_non_ascii += 1; + } + prev = caseless_class; + } + return score; +} + +template struct ovdl::encoding_detect::DetectUtf8<true>; +template struct ovdl::encoding_detect::DetectUtf8<false>; diff --git a/src/openvic-dataloader/detail/Detect.hpp b/src/openvic-dataloader/detail/Detect.hpp new file mode 100644 index 0000000..ad36d04 --- /dev/null +++ b/src/openvic-dataloader/detail/Detect.hpp @@ -0,0 +1,627 @@ +/// Based heavily on https://github.com/hsivonen/chardetng/tree/143dadde20e283a46ef33ba960b517a3283a3d22 + +#pragma once + +#include <array> +#include <cassert> +#include <cstdint> +#include <optional> +#include <span> +#include <type_traits> +#include <variant> +#include <vector> + +#include <openvic-dataloader/detail/Encoding.hpp> + +#include <lexy/action/match.hpp> +#include <lexy/callback/constant.hpp> +#include <lexy/dsl.hpp> +#include <lexy/dsl/ascii.hpp> +#include <lexy/dsl/newline.hpp> +#include <lexy/encoding.hpp> +#include <lexy/input/buffer.hpp> + +#include "detail/dsl.hpp" + +namespace ovdl::encoding_detect { + using cbyte = char; + using ubyte = unsigned char; + + using Encoding = detail::Encoding; + + struct DetectAscii { + // & 0b10000000 == 0b00000000 + static constexpr auto rule = lexy::dsl::while_(lexy::dsl::ascii::character) + lexy::dsl::eol; + static constexpr auto value = lexy::constant(true); + }; + + template<bool IncludeAscii> + 
struct DetectUtf8 { + struct not_utf8 { + static constexpr auto name = "not utf8"; + }; + + static constexpr auto rule = [] { + constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>; + + // & 0b10000000 == 0b00000000 + constexpr auto ascii_values = lexy::dsl::ascii::character; + // & 0b11100000 == 0b11000000 + constexpr auto two_byte = dsl::lit_b_range<0b11000000, 0b11011111>; + // & 0b11110000 == 0b11100000 + constexpr auto three_byte = dsl::lit_b_range<0b11100000, 0b11101111>; + // & 0b11111000 == 0b11110000 + constexpr auto four_byte = dsl::lit_b_range<0b11110000, 0b11110111>; + // & 0b11000000 == 0b10000000 + constexpr auto check_bytes = dsl::lit_b_range<0b10000000, 0b10111111>; + + constexpr auto utf8_check = + ((four_byte >> lexy::dsl::times<3>(check_bytes)) | + (three_byte >> lexy::dsl::times<2>(check_bytes)) | + (two_byte >> lexy::dsl::times<1>(check_bytes))) >> + is_not_ascii_flag.set(); + + return is_not_ascii_flag.template create<IncludeAscii>() + + lexy::dsl::while_(utf8_check | ascii_values) + + lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8> + lexy::dsl::eof; + }(); + + static constexpr auto value = lexy::constant(true); + }; + + extern template struct DetectUtf8<true>; + extern template struct DetectUtf8<false>; + + template<typename Input> + constexpr bool is_ascii(const Input& input) { + return lexy::match<DetectAscii>(input); + } + + template<typename Input> + constexpr bool is_utf8_no_ascii(const Input& input) { + return lexy::match<DetectUtf8<false>>(input); + } + + template<typename Input> + constexpr bool is_utf8(const Input& input) { + return lexy::match<DetectUtf8<true>>(input); + } + + struct DetectorData { + static constexpr std::array latin_ascii = std::to_array<ubyte>({ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, // + 
0, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 0, 0, 0, 0, 0, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, // + }); + + static constexpr std::array non_latin_ascii = std::to_array<ubyte>({ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, // + 0, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, // + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 0, 0, 0, 0, 0, // + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // + }); + + static constexpr std::array windows_1251 = std::to_array<ubyte>({ + 131, 130, 0, 2, 0, 0, 0, 0, 0, 0, 132, 0, 133, 130, 134, 135, // + 3, 0, 0, 0, 0, 0, 0, 0, 255, 0, 4, 0, 5, 2, 6, 7, // + 0, 136, 8, 140, 47, 130, 46, 47, 138, 49, 139, 49, 50, 46, 48, 141, // + 49, 50, 137, 9, 2, 49, 48, 46, 10, 47, 11, 48, 12, 130, 2, 13, // + 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, // + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, // + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, // + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, // + }); + + static constexpr std::array windows_1252 = std::to_array<ubyte>({ + 0, 255, 0, 60, 0, 0, 0, 0, 0, 0, 156, 0, 157, 255, 185, 255, // + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 29, 255, 57, 186, // + 0, 62, 60, 60, 60, 60, 59, 60, 60, 62, 60, 59, 63, 59, 61, 60, // + 62, 63, 61, 61, 60, 62, 61, 59, 60, 61, 60, 59, 62, 62, 62, 62, // + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, // + 188, 174, 175, 176, 177, 178, 179, 63, 180, 181, 182, 183, 184, 188, 
188, 27, // + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, // + 60, 46, 47, 48, 49, 50, 51, 63, 52, 53, 54, 55, 56, 60, 60, 58, // + }); + + // clang-format off + static constexpr std::array cyrillic = std::to_array<ubyte>({ + 0, 0, 0, 0, 1, 0, 16, 38, 0, 2, 5, 10,121, 4, 20, 25, 26, 53, 9, 5, 61, 23, 20, 26, 15, 95, 60, 2, 26, 15, 25, 29, 0, 14, 6, 6, 25, 1, 0, 27, 25, 8, 5, 39, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, // ѓ, + 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ђ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 4, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // љ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255, 5, 0, 0, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // њ, + 0, 0,255, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ћ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // џ, + 7, 0, 0,255,255,255,255,255, 0, 1, 0,255,255,255, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 1, 0, 0, 0, 1, // ў, + 12, 0, 0,255,255, 0,255,255, 0, 2, 0, 0, 0, 0, 2, 3, 15, 5, 5, 0, 0, 4, 0, 0, 21, 15, 10, 17, 0, 6, 14, 4, 6, 0, 3, 1, 8, 1, 0, 0, 0, 2, 0, 0, 0, 0, // і, + 0, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ё, + 6, 0, 0,255,255,255,255,255, 0, 
0,255, 5,255, 0, 1, 7, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // є, + 12, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 1, 0, 0, 0, 2, 0, 0, 20,255, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, // ј, + 9, 0, 0,255,255,255,255,255,255, 5,255, 0, 0, 13, 3, 3, 0, 4, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 1, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ї, + 32, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 28, 0, 23, 22, 26, 22, 19, 0, 3, 12, 5, 0, 44, 38, 18, 58, 1, 21, 44, 17, 54, 1, 2, 28, 5, 8, 3, 1, 9, 0, 12, 0, 0, 0, // а, + 40, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7, 0, 0, 0, 1, 7, 0, 1, 1, 0, 0, 7, 4, 1, 9, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // б, + 31, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 3, 0, 0, 19, 0, 0, 1, 1, 6, 0, 2, 6, 0, 1, 0, 1, 0, 32, 0, 2, 2, 23, 9, 0, 0, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, // в, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 1, 20, 0, 0, 1, 0, 9, 0, 0, 9, 7, 0, 5, 2, 18, 11, 0, 8, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 13, 0, 3, // г, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 2, 0, 2, 19, 0, 1, 5, 0, 13, 2, 2, 3, 2, 0, 6, 1, 12, 30, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, // д, + 12, 0, 0, 1, 4, 5, 0, 0, 0, 0, 0, 0, 24, 1, 5, 7, 11, 3, 12, 1, 6, 6, 11, 0, 3, 15, 14, 14, 4, 8, 25, 14, 29, 0, 1, 1, 4, 8, 8, 2, 0, 3, 1, 0, 0, 0, // е, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 3, 2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ж, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 6, 0, 0, 0, 11, 8, 0, 0, 8, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, // з, + 24, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 1, 0, 1, 10, 16, 21, 22, 0, 6, 5, 6, 1, 15, 15, 8, 38, 2, 4, 27, 9, 15, 0, 3, 8, 12, 7, 6, 1, 0, 0, 0, 0, 0, 0, // и, + 6, 0, 0, 0,255,255,255,255, 0, 7, 0, 0,255, 4, 21, 0, 0, 0, 0, 5, 0, 0, 39, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0, // й, + 54, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 
0, 10, 0, 1, 0, 1, 11, 0, 0, 12, 0, 1, 2, 0, 4, 8, 0, 2, 23, 2, 4, 0, 2, 3, 3, 8, 0, 0, 3, 16, 1, 4, 3, // к, + 12, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 4, 29, 12, 4, 5, 2, 18, 0, 0, 17, 4, 5, 11, 0, 0, 21, 2, 3, 4, 1, 15, 1, 0, 0, 0, 0, 0, 4, 3, 2, 12, 0, 2, // л, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 17, 1, 0, 0, 0, 7, 0, 1, 13, 2, 0, 0, 0, 0, 13, 0, 2, 4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 4, 2, 4, 1, 1, // м, + 42, 0, 0, 0, 0, 0, 0, 0, 4, 12, 6, 7, 1, 7, 76, 0, 22, 1, 4, 27, 1, 3, 34, 30, 0, 7, 1, 13, 24, 1, 3, 5, 3, 4, 0, 1, 0, 4, 1, 0, 2, 18, 7, 16, 0, 4, // н, + 37, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 10, 27, 22, 15, 1, 2, 3, 7, 5, 32, 11, 7, 38, 8, 21, 24, 11, 23, 0, 2, 10, 2, 2, 3, 2, 0, 0, 1, 0, 0, 0, // о, + 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 2, 0, 1, 2, 4, 0, 0, 2, 0, 6, 0, 0, 5, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // п, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 8, 0, 5, 47, 4, 6, 6, 5, 23, 0, 0, 5, 2, 6, 0, 0, 0, 23, 22, 0, 1, 14, 9, 1, 0, 1, 0, 0, 0, 7, 2, 8, 16, 0, 3, // р, + 53, 0, 0, 0, 0, 0, 0, 0, 4, 9, 2, 0, 1, 2, 21, 1, 4, 1, 2, 11, 0, 0, 12, 2, 4, 7, 1, 13, 15, 1, 4, 6, 3, 6, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 0, 1, // с, + 28, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 1, 0, 1, 32, 0, 1, 3, 0, 12, 0, 1, 22, 1, 4, 7, 1, 6, 23, 0, 14, 41, 14, 3, 0, 1, 1, 1, 21, 0, 2, 2, 6, 2, 1, 4, // т, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 2, 4, 2, 4, 6, 3, 0, 2, 0, 0, 6, 5, 6, 3, 0, 3, 7, 4, 7, 18, 1, 6, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, // у, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ф, + 41, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 30, 0, 2, 0, 0, 11, 0, 0, 5, 1, 14, 3, 0, 3, 6, 0, 7, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 4, 3, 5, 0, 0, // х, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 7, 1, 0, 1, 0, 2, 1, 0, 0, 9, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, // ц, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 1, 5, 0, 2, 0, 0, 6, 
0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 1, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, // ч, + 12, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 17, 0, 0, 1, 0, 2, 0, 0, 26, 0, 0, 0, 0, 0, 22, 2, 6, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, // ш, + 2, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, // щ, + 0, 0,255,255,255,255, 0,255, 0, 0, 0,255,255,255, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 11, 0, 1, 0, 0, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ъ, + 1, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 3, 11, 0, 4, 0, 2, 1, 0, 0, 0, 3, 1, 16, 0, 0, 22, 2, 10, 0, 0, 0, 8, 6, 3, 0, 0, 0, 0, 0, 0, 0, // ы, + 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 7, 3, 0, 1, 13, 7, 7, 0, 35, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, // ь, + 10, 0, 0,255,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 1, 1, 10, 11, 0, 2, 2, 0, 0, 0, 9, 3, 9, 0, 0, 7, 6, 9, 0, 0, 8, 3, 2, 1, 0, 0, 0, 0, 17, 0, 0, // э, + 14, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ю, + 5, 0, 0,255,255,255,255,255, 0, 9, 0, 0,255, 0, 11, 0, 3, 0, 0, 0, 0, 2, 24, 0, 0, 5, 2, 14, 1, 0, 2, 3, 1, 0, 0, 1, 3, 0, 0, 0, 0, 16, 1, 0, 0, 0, // я, + // , a, ѓ, ђ, љ, њ, ћ, џ, ў, і, ё, є, ј, ї, а, б, в, г, д, е, ж, з, и, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ъ, ы, ь, э, ю, я, + }); + // clang-format on + + // clang-format off + static constexpr std::array western = std::to_array<ubyte>({ + 18, 3, 0,254, 74, 0, 5,254,254, 2, 25,254,149, 4,254, 66,148,254, 0,254,122,238, 8, 1, 20, 13,254, 35, 20, 3, 1, 0, // , + 0, 3, 0, 0, 0, 0, 0, 5, 2, 0, 86, 9, 76, 0, 0, 0,241, 0, 0, 49, 0, 0, 0, 0, 11, 2, 0, 34, 0, 1, 2, 0, // a, + 19, 0, 0, 5, 5, 0, 0, 8, 13, 5, 0, 34, 22, 0, 0, 0, 4, 0, 0, 0, 6, 1, 3, 3, 42, 37, 8, 8, 0, 67, 0, 0, // b, + 0, 0, 0, 9, 6, 1, 0, 22, 10, 1, 0, 19, 54, 1, 0, 1, 18, 3, 1, 2, 40, 7, 0, 0, 6, 0, 3, 5, 1, 
34, 0, 0, // c, + 0, 0, 0, 5, 5, 0, 0, 12, 45, 16, 1, 6, 42, 0, 13, 3, 10, 0, 2, 0, 66, 11, 5, 8, 33,104, 3, 4, 0, 19, 0, 0, // d, + 63, 5, 0, 0, 0, 0, 2, 33, 15, 1, 3, 0, 87, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 49, 1, 11, 0, 3, 0, 9, 1, 0, // e, + 0, 0, 0, 8, 8, 0, 0, 10, 2, 7, 0,162, 23, 0, 13, 0, 4, 0, 0, 0, 1, 3, 0, 0, 15, 4, 0, 0, 0, 4, 0, 0, // f, + 1, 0, 0, 14, 16, 24, 0, 29, 11, 41, 0, 13, 86, 0, 14, 9, 3, 0, 0, 0, 20, 8, 7, 7, 13, 37, 14, 0, 0, 12, 0, 0, // g, + 1, 0, 0, 0, 0, 0, 0, 47, 2, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 29, 20, 0, 0, 0, 0, 45, 0, 0, // h, + 5, 4, 0,166,120, 0, 0,144, 0, 2, 3, 88,254, 0, 0, 0, 0, 0, 0, 3, 28,107, 0,112, 8, 2, 44, 32, 0, 3, 3, 0, // i, + 0, 0, 0, 0, 0, 0, 0, 39, 9, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 16, 18, 44, 0, 0, 0, 0, 0,255, // j, + 0, 2, 0, 0, 1, 0, 0, 48, 31, 32, 1, 60, 1, 0, 4, 0, 1, 0, 0, 0, 1, 3, 0, 2, 20, 47, 0, 0, 0, 20, 0, 0, // k, + 4, 0, 0, 12, 16, 0, 0, 54, 40, 48, 0, 64, 36, 0, 39, 6, 12, 3, 0, 0, 27, 9, 3, 24, 42, 33, 2, 9, 7, 77, 0, 0, // l, + 0, 0, 0, 14, 5, 4, 0, 60, 11, 4, 3, 48, 30, 7, 28, 1, 10, 1, 0, 0, 24, 41, 3, 3, 19, 24, 1, 8, 2, 36, 0, 0, // m, + 1, 1, 0, 24, 91, 16, 0,132, 62, 73, 1, 56, 71, 33, 78, 7, 35, 2, 3, 0, 94,254, 10, 21, 33, 38, 24, 21, 1, 61, 0, 0, // n, + 0, 1, 0, 0, 0, 0,254, 6, 0, 1, 27, 0, 13, 0, 0, 84,127, 0, 0, 62, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, // o, + 0, 0, 0, 5, 2, 0, 0, 9, 15, 0, 0, 4, 34, 0, 6, 0, 6, 0, 0, 0, 20, 12, 9, 28, 10, 22, 0, 3, 0, 7, 0, 0, // p, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 33, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // q, + 0, 0, 0, 83, 62, 1, 0,198,139,125, 0,229, 94, 54,190, 38, 18, 1, 0, 0,176, 24, 16, 29,193,181, 13, 13, 2,131, 0, 0, // r, + 1, 0, 0, 41, 34, 0, 0, 41, 24, 42, 0, 68,113, 15,159, 6, 43, 19, 4, 58, 14, 18, 1, 4, 48, 42, 4, 12, 9, 20, 0, 0, // s, + 7, 1, 0, 14, 20, 8, 0, 56, 37, 31, 0,104, 67, 14,113, 3, 50, 9, 5, 0, 89, 7, 19, 22, 13, 14, 40, 12, 15, 18, 0, 0, // t, + 0, 1, 5, 1, 2, 0, 0, 30, 0, 0, 
1, 15, 2, 0, 1, 0, 1, 0, 0, 2, 4, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, // u, + 0, 2, 0, 1, 6, 0, 0, 29, 33, 13, 0, 19, 46, 0, 15, 0, 7, 0, 1, 31, 2, 2, 3, 1, 32, 27, 0, 0, 1, 1, 0, 0, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,255, // w, + 0, 0, 0, 1, 16, 0, 0, 23, 0, 0, 0, 3, 14, 0, 0, 0, 2, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, // x, + 0, 0, 0, 0, 0, 0, 0, 58, 8, 0, 0, 1, 1, 62, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 82, 0, 0, 0, 0, 0,255, // y, + 0, 0, 0, 0, 2, 0, 0, 0, 14, 0, 0, 7, 3, 0, 6, 0, 3, 5, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, // z, + 0, 29, 0, 0, 0, 15, 0, 0, 0, 11, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 4, 0, 0,255,255, 0,255, 0,255, 0, 0,255,255,255, 0, 0, 0, 8, 0,255, 0, 0, 2, 0, 0, // ß, + 6, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 1, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // š, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255, // œ, + 107, 0, 22, 16, 18, 14, 6, 24, 46, 15, 2, 0, 42, 18, 17, 0, 36, 0, 34, 4,254, 1, 2, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, // à, + 41, 0, 10, 8, 21, 34, 5, 5, 60, 18, 5, 1, 29, 42, 26, 2, 16, 0, 27, 9, 43, 28, 7, 0, 0, 1, 4, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0,255, // á, + 24, 0, 1, 2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 5, 0, 1, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 2, 0,255, 0,255, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255, 0,255, // â, + 0, 0, 0, 1, 2, 3, 0, 1, 2, 12, 0, 0, 1, 7, 29, 4, 1,255, 11, 66, 11, 0, 1, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255,127,255,255,255,255,255, 0, 0,255, 0, 
0,255,255, 0,255,255,255,255,255,255,255,255, // ã, + 134, 1, 11, 0, 25, 6, 15, 11, 61, 24,123, 95,114, 68, 53, 1, 49, 0, 60, 98,198, 0, 88, 29, 0, 6, 12, 0, 0,255, 0,255, 0, 0,118, 0,255, 0,255, 0,255, 0,255, 0,255,255, 0,255,255, 0,255, 2,255,255,255, 0, 0, 0,255, // ä, + 156, 0, 12, 14, 19, 3, 12, 47, 17, 3, 12, 5, 30, 47, 22, 0,205, 0,184, 70, 19, 0, 22, 8, 0, 6, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // å, + 26, 0, 7, 0, 4, 0, 23, 8, 15, 0, 18, 19, 56, 23, 24, 0, 9, 0, 82, 37, 24, 0, 71, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255, 0,255, // æ, + 17,112, 0, 2, 0, 15, 0, 0, 0, 35, 0, 0, 2, 0, 59, 9, 1, 0, 36, 0, 0, 8, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // ç, + 254, 0, 9, 14, 20, 0, 15, 6, 70,144, 14, 45, 47, 92, 16, 3,123, 0, 38, 23,115, 52, 22, 42, 2, 80, 19,255, 0,255, 0, 0,255,255, 0,255,255, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 1,255,255, // è, + 152, 2, 19, 24, 85, 0, 29, 23, 26, 25, 2, 9, 43, 60, 62, 1, 32, 0,122, 45,169, 15, 13, 30, 7, 4, 8, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, // é, + 5, 0, 0, 3, 7, 0, 0, 10, 2, 3, 0, 26, 6, 6, 20, 1, 2, 0, 20, 1, 11, 5, 5, 2, 0, 0, 1,255, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0,255, // ê, + 36, 2, 23, 15, 36,143, 5, 23, 52, 52, 66, 48, 92, 57,216, 10,125, 35, 89, 58,254, 9, 24, 14, 0, 0, 8,255, 0,255, 0,255,255,255, 0, 0,255, 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0, 0,255, // ë, + 12, 0, 1, 4, 6, 0, 3, 21, 10, 0, 0, 0, 18, 8, 4, 0, 1, 0, 65, 35, 8, 3, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0,255, // ì, + 40, 72, 7, 10, 16, 2, 23, 10, 34, 
0, 0, 1, 34, 15, 21, 1, 3, 0,203, 28, 58, 23, 11, 0, 10, 0, 2, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0,255,255, 0,255, 2, 0,255, // í, + 6, 5, 1, 9, 5, 0, 0, 0, 22, 0, 9, 8, 8, 6, 9, 1, 10, 0, 20, 6,182, 0, 13, 0, 0, 24, 1,255, 0,255,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255,255,255, // î, + 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, // ï, + 0,254, 0, 0, 0, 26, 0, 0, 0, 61, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ñ, + 20, 0, 56, 43, 8,162, 14, 3, 23, 19, 2,118, 31, 26, 46, 0, 20, 0, 23, 6, 24, 19, 6, 21, 5, 27, 63,255, 0,255, 0, 0,255,255,255,255,255, 3, 0,255,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, // ò, + 67, 0, 12, 15, 9, 7, 8, 66, 13,254, 3, 23, 14, 16, 16, 0, 8, 0, 29, 11, 26, 0, 5, 5, 1, 10, 13,255, 0,255,255, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0, 0,255, 0, 1, 0, 0, 0, 0,255,255,255, 0,255,255, 0,255, // ó, + 18, 3, 3, 12, 1, 0, 2, 0, 7, 0, 1, 0, 2, 2, 8, 0, 6, 0, 6, 7, 4, 0, 2, 0, 0, 0, 1,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255,255,255, // ô, + 29, 2, 0, 0, 0, 0, 0, 0, 5, 2, 22, 30, 25, 38, 19, 0, 33,255, 4, 39, 24, 0, 88, 0, 0, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 36,255,255,255,255,255, 0,255,255, 0,255, 0, 0, 6, 0,255,255,255, 0, 0, 0,255, // õ, + 44, 0, 33, 0, 25, 0,142, 5, 46, 10, 25, 32, 26, 13, 6, 0, 3, 0, 30, 8, 35, 0, 25, 5, 0, 44, 7, 0, 0,255,255, 0,255,255, 73, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, 39, 0,255,255,255, 0, 0, 0, // ö, + 52, 0, 21, 0, 57, 0,119, 12, 47, 3, 59, 33, 45, 15, 12, 0, 3, 0, 52, 82, 49, 1, 11, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 
0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ø, + 25, 0, 4, 3, 53, 0, 0, 2, 12, 72, 0, 0, 30, 0, 0,254, 0, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, // ù, + 19, 2, 1, 7, 9, 1, 12, 5, 9, 41, 1, 0, 10, 7, 9, 0, 8, 0, 12, 28, 8, 0, 0, 0, 0, 1, 0,255, 0,255,255, 0,255,255,255,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255, // ú, + 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 45, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0,255, 0, 0,255, 0, // û, + 95, 2, 19, 0, 6, 2,121, 9, 15, 1, 5, 44, 18, 26, 7, 0, 11, 2, 68, 49, 20, 0, 2, 17, 0, 0, 6, 0, 0,255, 0,255,255,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255, 0, 0, 0, 31, 0, 0, // ü, + 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // ž, + 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, 0,255, 0,255,255,255, 0, 0, 0, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, œ, à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ž, ÿ, + + }); + // clang-format on + }; + + namespace class_size { + constexpr std::size_t cyrillic_ascii = 2; + constexpr std::size_t cyrillic_non_ascii = 44; + constexpr std::size_t western_ascii = 27; + constexpr std::size_t western_non_ascii = 32; + } + + constexpr std::size_t ASCII_DIGIT = 100; + + struct ByteScore { + const Encoding encoding; + const std::array<ubyte, 128>& lower; + const std::array<ubyte, 128>& upper; + const std::span<const ubyte> probabilities; + const std::size_t 
ascii; + const std::size_t non_ascii; + + static inline constexpr std::optional<std::size_t> compute_index(std::size_t x, std::size_t y, std::size_t ascii_classes, std::size_t non_ascii_classes) { + if (x == 0 && y == 0) { + return std::nullopt; + } + + if (x < ascii_classes && y < ascii_classes) { + return std::nullopt; + } + + if (y >= ascii_classes) { + return (ascii_classes * non_ascii_classes) + (ascii_classes + non_ascii_classes) * (y - ascii_classes) + x; + } + + return y * non_ascii_classes + x - ascii_classes; + } + + inline constexpr cbyte classify(cbyte byte) const { + cbyte high = byte >> 7; + cbyte low = byte & 0x7F; + if (high == 0) { + return lower[low]; + } + + return upper[low]; + } + + inline constexpr bool is_latin_alphabetic(cbyte caseless_class) const { + return caseless_class > 0 && caseless_class < (ascii + non_ascii); + } + + inline constexpr bool is_non_latin_alphabetic(cbyte caseless_class) const { + return caseless_class > 1 && caseless_class < (ascii + non_ascii); + } + + inline constexpr int64_t score(cbyte current_class, cbyte previous_class) const { + constexpr std::size_t IMPLAUSABILITY_PENALTY = -220; + + constexpr std::size_t PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 0; + constexpr std::size_t IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE = 1; + constexpr std::size_t IMPLAUSIBLE_BEFORE_ALPHABETIC = 2; + constexpr std::size_t IMPLAUSIBLE_AFTER_ALPHABETIC = 3; + constexpr std::size_t PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE = 4; + constexpr std::size_t PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE = 5; + + std::size_t stored_boundary = ascii + non_ascii; + if (current_class < stored_boundary) { + if (previous_class < stored_boundary) { + if (auto index = compute_index(previous_class, current_class, ascii, non_ascii); index) { + ubyte b = probabilities[index.value()]; + if (b == 255) { + return IMPLAUSABILITY_PENALTY; + } + return b; + } + return 0; + } + + if (current_class == 0 || current_class == ASCII_DIGIT) { 
+ return 0; + } + + std::size_t previous_unstored = previous_class - stored_boundary; + switch (previous_unstored) { + case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_AFTER_ALPHABETIC: + return 0; + case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_BEFORE_ALPHABETIC: + return IMPLAUSABILITY_PENALTY; + case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (current_class < ascii) { + return IMPLAUSABILITY_PENALTY; + } + return 0; + case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (current_class < ascii) { + return 0; + } + return IMPLAUSABILITY_PENALTY; + default: + assert(previous_class == ASCII_DIGIT); + return 0; + } + } + + if (previous_class < stored_boundary) { + if (previous_class == 0 || previous_class == ASCII_DIGIT) { + return 0; + } + + std::size_t current_unstored = current_class - stored_boundary; + switch (current_unstored) { + case PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_BEFORE_ALPHABETIC: + return 0; + case IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: + case IMPLAUSIBLE_AFTER_ALPHABETIC: + return IMPLAUSABILITY_PENALTY; + case PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (previous_class < ascii) { + return IMPLAUSABILITY_PENALTY; + } + return 0; + case PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: + if (previous_class < ascii) { + return 0; + } + return IMPLAUSABILITY_PENALTY; + default: + assert(current_class == ASCII_DIGIT); + return 0; + } + } + + if (current_class == ASCII_DIGIT || previous_class == ASCII_DIGIT) { + return 0; + } + + return IMPLAUSABILITY_PENALTY; + } + }; + + enum class ScoreIndex { + Windows1251, + Windows1252 + }; + + static constexpr std::array byte_scores { + ByteScore { + .encoding = Encoding::Windows1251, + .lower = DetectorData::non_latin_ascii, + .upper = DetectorData::windows_1251, + .probabilities = DetectorData::cyrillic, + .ascii = class_size::cyrillic_ascii, + .non_ascii = class_size::cyrillic_non_ascii }, + 
ByteScore { + .encoding = Encoding::Windows1252, + .lower = DetectorData::latin_ascii, + .upper = DetectorData::windows_1252, + .probabilities = DetectorData::western, + .ascii = class_size::western_ascii, + .non_ascii = class_size::western_non_ascii } + }; + + constexpr const ByteScore& get_byte_score(ScoreIndex index) { + return byte_scores[static_cast<std::underlying_type_t<ScoreIndex>>(index)]; + } + + struct Utf8Canidate { + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + struct AsciiCanidate { + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + struct NonLatinCasedCanidate { + enum class CaseState { + Space, + Upper, + Lower, + UpperLower, + AllCaps, + Mix, + }; + + const ByteScore& score_data; + cbyte prev {}; + CaseState case_state = CaseState::Space; + bool prev_ascii = true; + uint64_t current_word_len {}; + uint64_t longest_word {}; + bool ibm866 = false; + bool prev_was_a0 = false; + + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + struct LatinCanidate { + enum class CaseState { + Space, + Upper, + Lower, + AllCaps, + }; + + enum class OrdinalState { + Other, + Space, + PeriodAfterN, + OrdinalExpectingSpace, + OrdinalExpectingSpaceUndoImplausibility, + OrdinalExpectingSpaceOrDigit, + OrdinalExpectingSpaceOrDigitUndoImplausibily, + UpperN, + LowerN, + FeminineAbbreviationStartLetter, + Digit, + Roman, + Copyright, + }; + + const ByteScore& score_data; + cbyte prev {}; + CaseState case_state = CaseState::Space; + uint32_t prev_non_ascii {}; + OrdinalState ordinal_state = OrdinalState::Space; // Used only when `windows1252 == true` + bool windows1252; + + constexpr LatinCanidate(const ByteScore& data) : score_data(data) { + windows1252 = data.encoding == Encoding::Windows1252; + } + + std::optional<int64_t> read(const std::span<const cbyte>& buffer); + }; + + using InnerCanidate = std::variant<NonLatinCasedCanidate, LatinCanidate, Utf8Canidate, AsciiCanidate>; + + 
template<class... Ts> + struct overloaded : Ts... { + using Ts::operator()...; + }; + + template<class... Ts> + overloaded(Ts...) -> overloaded<Ts...>; + + struct Canidate { + InnerCanidate inner; + std::optional<int64_t> score_value; + + template<typename CanidateT> + static constexpr Canidate create_canidate() { + return { + .inner = CanidateT(), + .score_value = 0 + }; + } + + template<typename CanidateT> + static constexpr Canidate create_canidate(const ByteScore& score) { + return { + .inner = CanidateT { score }, + .score_value = 0 + }; + } + + static constexpr Canidate new_utf8() { + return create_canidate<Utf8Canidate>(); + } + + static constexpr Canidate new_ascii() { + return create_canidate<AsciiCanidate>(); + } + + static constexpr Canidate new_latin(ScoreIndex index) { + return create_canidate<LatinCanidate>(get_byte_score(index)); + } + + static constexpr Canidate new_non_latin_cased(ScoreIndex index) { + return create_canidate<NonLatinCasedCanidate>(get_byte_score(index)); + } + + constexpr std::optional<int64_t> score(const std::span<const cbyte>& buffer, std::size_t encoding, bool expectation_is_valid) { + if (auto old_score = score_value) { + auto new_score = std::visit([&](auto& inner) { + return inner.read(buffer); + }, + inner); + if (new_score) { + score_value = old_score.value() + new_score.value(); + } else { + score_value = std::nullopt; + } + } + + if (auto nlcc = std::get_if<NonLatinCasedCanidate>(&inner)) { + if (nlcc->longest_word < 2) { + return std::nullopt; + } + } + return score_value; + } + + constexpr Encoding encoding() const { + return std::visit( + overloaded { + [](const Utf8Canidate& canidate) { + return Encoding::Utf8; + }, + [](const AsciiCanidate& canidate) { + return Encoding::Ascii; + }, + [](const LatinCanidate& canidate) { + return canidate.score_data.encoding; + }, + [](const NonLatinCasedCanidate& canidate) { + return canidate.score_data.encoding; + } }, + inner); + } + }; + + struct Detector { + 
std::vector<Canidate> canidates { + Canidate::new_ascii(), + Canidate::new_utf8(), + Canidate::new_latin(ScoreIndex::Windows1252), + Canidate::new_non_latin_cased(ScoreIndex::Windows1251), + }; + + Encoding default_fallback = Encoding::Unknown; + + constexpr std::pair<Encoding, bool> detect_assess(std::span<const cbyte> buffer, bool allow_utf8 = true) { + int64_t max = 0; + Encoding encoding = default_fallback; // Presumes fallback, defaults to Unknown encoding if unknown (which skips conversion) + std::size_t i = 0; + for (Canidate& canidate : canidates) { + if (!allow_utf8 && canidate.encoding() == Encoding::Utf8) { + continue; + } + + if (auto score = canidate.score(buffer, i, false)) { + switch (canidate.encoding()) { + using enum Encoding; + case Ascii: + case Utf8: + return { canidate.encoding(), true }; + default: break; + } + + auto value = score.value(); + if (value > max) { + max = value; + encoding = canidate.encoding(); + } + } + i++; + } + return { encoding, max >= 0 }; + } + + constexpr Encoding detect(std::span<const cbyte> buffer, bool allow_utf8 = true) { + return detect_assess(buffer, allow_utf8).first; + } + + template<typename BufferEncoding> + std::pair<Encoding, bool> detect_assess(const lexy::buffer<BufferEncoding, void>& buffer, bool allow_utf8 = true) { + auto span = std::span<const cbyte>(buffer.data(), buffer.size()); + return detect_assess(span); + } + + template<typename BufferEncoding> + constexpr Encoding detect(const lexy::buffer<BufferEncoding, void>& buffer, bool allow_utf8 = true) { + return detect_assess(buffer, allow_utf8).first; + } + }; +}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp deleted file mode 100644 index e9d0350..0000000 --- a/src/openvic-dataloader/detail/DetectUtf8.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include <lexy/action/match.hpp> -#include <lexy/dsl.hpp> - -#include "detail/dsl.hpp" - -namespace ovdl::detail { - namespace detect_utf8 { - - template<bool INCLUDE_ASCII> - struct DetectUtf8 { - struct not_utf8 { - static constexpr auto name = "not utf8"; - }; - - static constexpr auto rule = [] { - constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>; - - // & 0b10000000 == 0b00000000 - constexpr auto ascii_values = dsl::make_range<0b00000000, 0b01111111>(); - // & 0b11100000 == 0b11000000 - constexpr auto two_byte = dsl::make_range<0b11000000, 0b11011111>(); - // & 0b11110000 == 0b11100000 - constexpr auto three_byte = dsl::make_range<0b11100000, 0b11101111>(); - // & 0b11111000 == 0b11110000 - constexpr auto four_byte = dsl::make_range<0b11110000, 0b11110111>(); - // & 0b11000000 == 0b10000000 - constexpr auto check_bytes = dsl::make_range<0b10000000, 0b10111111>(); - - constexpr auto utf8_check = - ((four_byte >> lexy::dsl::times<3>(check_bytes)) | - (three_byte >> lexy::dsl::times<2>(check_bytes)) | - (two_byte >> lexy::dsl::times<1>(check_bytes))) >> - is_not_ascii_flag.set(); - - return is_not_ascii_flag.template create<INCLUDE_ASCII>() + - lexy::dsl::while_(utf8_check | ascii_values) + - lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>; - }(); - }; - } - - template<typename Input> - constexpr bool is_utf8_no_ascii(const Input& input) { - return lexy::match<detect_utf8::DetectUtf8<false>>(input); - } - - template<typename Input> - constexpr bool is_utf8(const Input& input) { - return lexy::match<detect_utf8::DetectUtf8<true>>(input); - } -}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Errors.hpp b/src/openvic-dataloader/detail/Errors.hpp deleted file mode 100644 index fbebcc5..0000000 --- a/src/openvic-dataloader/detail/Errors.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include <string_view> - -#include <openvic-dataloader/ParseError.hpp> - -namespace ovdl::errors { - inline const ParseError make_no_file_error(std::string_view file_path) { - std::string message; - if (file_path.empty()) { - message = "File path not specified."; - } else { - message = "File '" + std::string(file_path) + "' was not found."; - } - - return ParseError { ParseError::Type::Fatal, message, 1 }; - } -} - -namespace ovdl::v2script::errors { - -} - -namespace ovdl::ovscript::errors { -}
\ No newline at end of file
diff --git a/src/openvic-dataloader/detail/InternalConcepts.hpp b/src/openvic-dataloader/detail/InternalConcepts.hpp
new file mode 100644
index 0000000..0c7913d
--- /dev/null
+++ b/src/openvic-dataloader/detail/InternalConcepts.hpp
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <concepts>
+#include <utility>
+
+#include <openvic-dataloader/NodeLocation.hpp>
+#include <openvic-dataloader/detail/Encoding.hpp>
+#include <openvic-dataloader/detail/SymbolIntern.hpp>
+
+#include <lexy/encoding.hpp>
+#include <lexy/input/buffer.hpp>
+
+#include <fmt/core.h>
+
+#include <lexy_ext/report_error.hpp>
+
+namespace ovdl::detail {
+	// A file type: exposes a `node_type` and can store and look up the NodeLocation
+	// associated with a node pointer.
+	template<typename T>
+	concept IsFile =
+		requires(T t, const typename T::node_type* node, NodeLocation location) {
+			typename T::node_type;
+			{ t.set_location(node, location) } -> std::same_as<void>;
+			{ t.location_of(node) } -> std::same_as<NodeLocation>;
+		};
+
+	// An AST type: wraps a file type satisfying IsFile, has a root node type derived
+	// from its node type, forwards node locations, and gives const and non-const
+	// access to both the root node and the file.
+	template<typename T>
+	concept IsAst =
+		requires(
+			T t,
+			const T ct,
+			const typename T::node_type* node,
+			NodeLocation loc //
+		) {
+			requires IsFile<typename T::file_type>;
+			typename T::root_node_type;
+			typename T::node_type;
+			requires std::derived_from<typename T::root_node_type, typename T::node_type>;
+			{ t.set_location(node, loc) } -> std::same_as<void>;
+			{ t.location_of(node) } -> std::same_as<NodeLocation>;
+			{ t.root() } -> std::same_as<typename T::root_node_type*>;
+			{ ct.root() } -> std::same_as<const typename T::root_node_type*>;
+			{ t.file() } -> std::same_as<typename T::file_type&>;
+			{ ct.file() } -> std::same_as<const typename T::file_type&>;
+		};
+
+	// A diagnostic logger type. The requirements fall into four groups:
+	//  - status queries on a const logger (bool conversion, errored, warned, get_errors),
+	//  - symbol interning (intern / intern_cstr / symbol_interner),
+	//  - one Writer-returning method per severity, callable with either a
+	//    `format_str<>` or a std::string_view,
+	//  - an error callback whose sink finishes to a std::size_t, plus a generic
+	//    log(kind, format) entry point.
+	template<typename T>
+	concept IsDiagnosticLogger = requires(
+		T t,
+		const T ct,
+		const char* str,
+		std::size_t length,
+		std::string_view sv,
+		lexy_ext::diagnostic_kind diag_kind //
+	) {
+		typename T::error_range;
+		typename T::Writer;
+		{ static_cast<bool>(ct) } -> std::same_as<bool>;
+		{ ct.errored() } -> std::same_as<bool>;
+		{ ct.warned() } -> std::same_as<bool>;
+		{ ct.get_errors() } -> std::same_as<typename T::error_range>;
+		{ t.intern(str, length) } -> std::same_as<ovdl::SymbolIntern::symbol_type>;
+		{ t.intern(sv) } -> std::same_as<ovdl::SymbolIntern::symbol_type>;
+		{ t.intern_cstr(str, length) } -> std::same_as<const char*>;
+		{ t.intern_cstr(sv) } -> std::same_as<const char*>;
+		{ t.symbol_interner() } -> std::same_as<SymbolIntern::symbol_interner_type&>;
+		{ ct.symbol_interner() } -> std::same_as<const SymbolIntern::symbol_interner_type&>;
+		{ t.error(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.warning(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.note(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.info(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.debug(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.fixit(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.help(std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+		{ t.error(sv) } -> std::same_as<typename T::Writer>;
+		{ t.warning(sv) } -> std::same_as<typename T::Writer>;
+		{ t.note(sv) } -> std::same_as<typename T::Writer>;
+		{ t.info(sv) } -> std::same_as<typename T::Writer>;
+		{ t.debug(sv) } -> std::same_as<typename T::Writer>;
+		{ t.fixit(sv) } -> std::same_as<typename T::Writer>;
+		{ t.help(sv) } -> std::same_as<typename T::Writer>;
+		{ std::move(t.error_callback().sink()).finish() } -> std::same_as<std::size_t>;
+		{ t.log(diag_kind, std::declval<typename T::template format_str<>>()) } -> std::same_as<typename T::Writer>;
+	};
+
+	// A parse state that owns an AST: constructible from (file, encoding),
+	// (buffer, encoding) or (path, buffer, encoding), and exposing const and
+	// non-const access to its AST and its diagnostic logger.
+	template<typename T>
+	concept IsParseState = requires(
+		T t,
+		const T ct,
+		typename T::ast_type::file_type&& file,
+		lexy::buffer<lexy::default_encoding>&& buffer,
+		ovdl::detail::Encoding encoding,
+		const char* path //
+	) {
+		requires IsAst<typename T::ast_type>;
+		requires IsDiagnosticLogger<typename T::diagnostic_logger_type>;
+		{ T { std::move(file), encoding } } -> std::same_as<T>;
+		{ T { std::move(buffer), encoding } } -> std::same_as<T>;
+		{ T { path, std::move(buffer), encoding } } -> std::same_as<T>;
+		{ t.ast() } -> std::same_as<typename T::ast_type&>;
+		{ ct.ast() } -> std::same_as<const typename T::ast_type&>;
+		{ t.logger() } -> std::same_as<typename T::diagnostic_logger_type&>;
+		{ ct.logger() } -> std::same_as<const typename T::diagnostic_logger_type&>;
+	};
+
+	// A parse state that owns a file directly instead of an AST: the same three
+	// constructor shapes as IsParseState, with file() in place of ast().
+	template<typename T>
+	concept IsFileParseState = requires(
+		T t,
+		const T ct,
+		typename T::file_type&& file,
+		lexy::buffer<lexy::default_encoding>&& buffer,
+		ovdl::detail::Encoding encoding,
+		const char* path //
+	) {
+		requires IsFile<typename T::file_type>;
+		requires IsDiagnosticLogger<typename T::diagnostic_logger_type>;
+		{ T { std::move(file), encoding } } -> std::same_as<T>;
+		{ T { std::move(buffer), encoding } } -> std::same_as<T>;
+		{ T { path, std::move(buffer), encoding } } -> std::same_as<T>;
+		{ t.file() } -> std::same_as<typename T::file_type&>;
+		{ ct.file() } -> std::same_as<const typename T::file_type&>;
+		{ t.logger() } -> std::same_as<typename T::diagnostic_logger_type&>;
+		{ ct.logger() } -> std::same_as<const typename T::diagnostic_logger_type&>;
+	};
+
+	// Either flavour of parse state.
+	template<typename T>
+	concept IsStateType = IsParseState<T> || IsFileParseState<T>;
+}
\ No newline at end of file
diff --git a/src/openvic-dataloader/detail/ParseHandler.cpp b/src/openvic-dataloader/detail/ParseHandler.cpp
new file mode 100644
index 0000000..3818433
--- /dev/null
+++ b/src/openvic-dataloader/detail/ParseHandler.cpp
@@ -0,0 +1,347 @@
+#include "ParseHandler.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <optional>
+#include <string_view>
+#include <type_traits>
+
+#include <openvic-dataloader/detail/Encoding.hpp>
+
+using namespace ovdl::detail;
+
+#ifdef _WIN32
+#include <array>
+#include <cstdint>
+#include <utility>
+
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#undef WIN32_LEAN_AND_MEAN
+#endif
+
+// Compile-time language code usable as a non-type template parameter.
+// `N` counts the terminating '\0' of the literal; `size` is the code length without it.
+template<std::size_t N>
+struct LangCodeLiteral {
+	char value[N];
+
+	constexpr LangCodeLiteral(const char (&str)[N]) {
+		std::copy_n(str, N, value);
+	}
+
+	static constexpr std::integral_constant<std::size_t, N - 1> size = {};
+
+	constexpr const char& operator[](std::size_t index) const noexcept {
+		return value[index];
+	}
+
+	constexpr operator std::string_view() const noexcept {
+		return std::string_view(value, size());
+	}
+
+	// True when `view` begins with this code immediately followed by '_'.
+	constexpr bool operator==(const std::string_view view) const noexcept {
+		return view.size() >= size() + 1 && view.starts_with(*this) && view[size()] == '_';
+	}
+};
+
+// Runtime language code: either a bare code taken from the Windows table below
+// (for example "en") or a locale string read from the environment (for example
+// "en_US.UTF-8").
+struct LangCodeView {
+	std::string_view view;
+	// Initialised so a default-constructed view never carries an indeterminate flag.
+	bool is_valid = false;
+
+	constexpr LangCodeView() = default;
+
+	// String literals are bare language codes and are always considered valid.
+	template<std::size_t N>
+	constexpr LangCodeView(const char (&str)[N]) : view(str), is_valid(true) {}
+
+	// Locale string from the environment. std::getenv returns nullptr when the
+	// variable is unset, and constructing a std::string_view from nullptr is
+	// undefined, so a null pointer becomes an empty (invalid) view instead.
+	// The string only counts as valid when it contains a '_' separator.
+	constexpr LangCodeView(char* str) : view(str != nullptr ? std::string_view(str) : std::string_view()) {
+		is_valid = view.find('_') != std::string_view::npos;
+	}
+
+	constexpr std::size_t size() const noexcept {
+		return view.size();
+	}
+
+	constexpr const char& operator[](std::size_t index) const noexcept {
+		return view[index];
+	}
+
+	constexpr operator std::string_view() const noexcept {
+		return view;
+	}
+
+	// True when this code is exactly `literal`, or is `literal` followed by '_'.
+	// A bare prefix test is not enough: "moh" and "mni" from the Windows table
+	// would otherwise compare equal to the two-letter codes "mo" and "mn".
+	template<std::size_t N>
+	constexpr bool operator==(const LangCodeLiteral<N>& literal) const {
+		constexpr std::size_t literal_size = LangCodeLiteral<N>::size();
+		if (!is_valid || !view.starts_with(literal)) {
+			return false;
+		}
+		return view.size() == literal_size || view[literal_size] == '_';
+	}
+};
+
+// Writes an encoding into `fallback` when a language code matches.
+struct FallbackSetter {
+	std::optional<Encoding>& fallback;
+
+	// Stores `_Encoding` and returns true when `view` compares equal to `LangCode`;
+	// otherwise leaves `fallback` untouched and returns false.
+	template<Encoding _Encoding, LangCodeLiteral LangCode>
+	constexpr bool encoded(auto&& view) const {
+		if (view == LangCode) {
+			fallback = _Encoding;
+			return true;
+		}
+		return false;
+	}
+};
+
+// Determines the system language and stores the matching legacy code page
+// (Windows-1251 or Windows-1252) in `_system_fallback_encoding`. The result is
+// Encoding::Unknown when the language cannot be determined or is not listed below.
+//  - Windows: primary language id of the system default LCID, mapped to a code.
+//  - Elsewhere: the LANG environment variable.
+void ParseHandler::_detect_system_fallback_encoding() {
+	_system_fallback_encoding = Encoding::Unknown;
+	LangCodeView lang_code;
+
+#ifdef _WIN32
+	using namespace std::string_view_literals;
+
+	// Every Windows language id mapped to a language code according to https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/63d3d639-7fd2-4afb-abbe-0d5b5551eef8
+	constexpr std::array lang_id_to_lang_code = std::to_array<std::pair<std::uint8_t, LangCodeView>>({
+		{ 0x0001, "ar" },
+		{ 0x0002, "bg" },
+		{ 0x0003, "ca" },
+		{ 0x0004, "zh" },
+		{ 0x0005, "cs" },
+		{ 0x0006, "da" },
+		{ 0x0007, "de" },
+		{ 0x0008, "el" },
+		{ 0x0009, "en" },
+		{ 0x000A, "es" },
+		{ 0x000B, "fi" },
+		{ 0x000C, "fr" },
+		{ 0x000D, "he" },
+		{ 0x000E, "hu" },
+		{ 0x000F, "is" },
+		{ 0x0010, "it" },
+		{ 0x0011, "ja" },
+		{ 0x0012, "ko" },
+		{ 0x0013, "nl" },
+		{ 0x0014, "no" },
+		{ 0x0015, "pl" },
+		{ 0x0016, "pt" },
+		{ 0x0017, "rm" },
+		{ 0x0018, "ro" },
+		{ 0x0019, "ru" },
+		{ 0x001A, "hr" },
+		{ 0x001B, "sk" },
+		{ 0x001C, "sq" },
+		{ 0x001D, "sv" },
+		{ 0x001E, "th" },
+		{ 0x001F, "tr" },
+		{ 0x0020, "ur" },
+		{ 0x0021, "id" },
+		{ 0x0022, "uk" },
+		{ 0x0023, "be" },
+		{ 0x0024, "sl" },
+		{ 0x0025, "et" },
+		{ 0x0026, "lv" },
+		{ 0x0027, "lt" },
+		{ 0x0028, "tg" },
+		{ 0x0029, "fa" },
+		{ 0x002A, "vi" },
+		{ 0x002B, "hy" },
+		{ 0x002C, "az" },
+		{ 0x002D, "eu" },
+		{ 0x002E, "hsb" },
+		{ 0x002F, "mk" },
+		{ 0x0030, "st" },
+		{ 0x0031, "ts" },
+		{ 0x0032, "tn" },
+		{ 0x0033, "ve" },
+		{ 0x0034, "xh" },
+		{ 0x0035, "zu" },
+		{ 0x0036, "af" },
+		{ 0x0037, "ka" },
+		{ 0x0038, "fo" },
+		{ 0x0039, "hi" },
+		{ 0x003A, "mt" },
+		{ 0x003B, "se" },
+		{ 0x003C, "ga" },
+		{ 0x003D, "yi" },
+		{ 0x003E, "ms" },
+		{ 0x003F, "kk" },
+		{ 0x0040, "ky" },
+		{ 0x0041, "sw" },
+		{ 0x0042, "tk" },
+		{ 0x0043, "uz" },
+		{ 0x0044, "tt" },
+		{ 0x0045, "bn" },
+		{ 0x0046, "pa" },
+		{ 0x0047, "gu" },
+		{ 0x0048, "or" },
+		{ 0x0049, "ta" },
+		{ 0x004A, "te" },
+		{ 0x004B, "kn" },
+		{ 0x004C, "ml" },
+		{ 0x004D, "as" },
+		{ 0x004E, "mr" },
+		{ 0x004F, "sa" },
+		{ 0x0050, "mn" },
+		{ 0x0051, "bo" },
+		{ 0x0052, "cy" },
+		{ 0x0053, "km" },
+		{ 0x0054, "lo" },
+		{ 0x0055, "my" },
+		{ 0x0056, "gl" },
+		{ 0x0057, "kok" },
+		{ 0x0058, "mni" },
+		{ 0x0059, "sd" },
+		{ 0x005A, "syr" },
+		{ 0x005B, "si" },
+		{ 0x005C, "chr" },
+		{ 0x005D, "iu" },
+		{ 0x005E, "am" },
+		{ 0x005F, "tzm" },
+		{ 0x0060, "ks" },
+		{ 0x0061, "ne" },
+		{ 0x0062, "fy" },
+		{ 0x0063, "ps" },
+		{ 0x0064, "fil" },
+		{ 0x0065, "dv" },
+		{ 0x0066, "bin" },
+		{ 0x0067, "ff" },
+		{ 0x0068, "ha" },
+		{ 0x0069, "ibb" },
+		{ 0x006A, "yo" },
+		{ 0x006B, "quz" },
+		{ 0x006C, "nso" },
+		{ 0x006D, "ba" },
+		{ 0x006E, "lb" },
+		{ 0x006F, "kl" },
+		{ 0x0070, "ig" },
+		{ 0x0071, "kr" },
+		{ 0x0072, "om" },
+		{ 0x0073, "ti" },
+		{ 0x0074, "gn" },
+		{ 0x0075, "haw" },
+		{ 0x0076, "la" },
+		{ 0x0077, "so" },
+		{ 0x0078, "ii" },
+		{ 0x0079, "pap" },
+		{ 0x007A, "arn" },
+		{ 0x007C, "moh" },
+		{ 0x007E, "br" },
+		{ 0x0080, "ug" },
+		{ 0x0081, "mi" },
+		{ 0x0082, "oc" },
+		{ 0x0083, "co" },
+		{ 0x0084, "gsw" },
+		{ 0x0085, "sah" },
+		{ 0x0086, "qut" },
+		{ 0x0087, "rw" },
+		{ 0x0088, "wo" },
+		{ 0x008C, "prs" },
+		{ 0x0091, "gd" },
+		{ 0x0092, "ku" },
+		{ 0x0093, "quc" } //
+	});
+
+	// An LCID carries a 16-bit language identifier in its low word, and the low
+	// 10 bits of that identifier are the primary language id. The Win32 macros
+	// extract it directly; this replaces copying the LCID into a packed bit-field
+	// struct, whose size is not guaranteed to be the 4 bytes of the LCID.
+	const std::uint16_t lang_id = PRIMARYLANGID(LANGIDFROMLCID(GetSystemDefaultLCID()));
+
+	for (const auto& map : lang_id_to_lang_code) {
+		if (map.first != lang_id) continue;
+		lang_code = map.second;
+		break;
+	}
+#else
+	// May be nullptr when LANG is unset; LangCodeView turns that into an empty view.
+	lang_code = std::getenv("LANG");
+#endif
+
+	constexpr FallbackSetter setter { _system_fallback_encoding };
+
+	// No usable language code: keep the Encoding::Unknown stored above.
+	if (lang_code.size() < 2) {
+		return;
+	}
+
+#define WIN1251(LANG_CODE) \
+	if (setter.encoded<Encoding::Windows1251, #LANG_CODE>(lang_code)) return;
+
+#define WIN1252(LANG_CODE) \
+	if (setter.encoded<Encoding::Windows1252, #LANG_CODE>(lang_code)) return;
+
+	// More common, prefer
+	WIN1252(en);
+	WIN1252(es);
+	WIN1252(fr);
+	WIN1252(de);
+
+	WIN1251(ru);
+
+	WIN1252(af);
+	WIN1252(sq);
+	WIN1252(eu);
+	WIN1252(br);
+	WIN1252(co);
+	WIN1252(fo);
+	WIN1252(gl);
+	WIN1252(is);
+	WIN1252(io);
+	WIN1252(ga);
+	WIN1252(id);
+	WIN1252(in);
+	WIN1252(it);
+	WIN1252(lb);
+	WIN1252(ms);
+	WIN1252(gv);
+	WIN1252(no);
+	WIN1252(oc);
+	WIN1252(pt);
+	WIN1252(gd);
+	WIN1252(sw);
+	WIN1252(fi);
+	WIN1252(da);
+	WIN1252(et);
+	WIN1252(tn);
+	WIN1252(ca);
+	WIN1252(rm);
+	WIN1252(nl);
+	WIN1252(sl);
+	WIN1252(cy);
+	WIN1252(hu);
+
+	WIN1251(be);
+	WIN1251(uk);
+	WIN1251(bg);
+	WIN1251(kk);
+	WIN1251(tg);
+	WIN1251(sr);
+	WIN1251(ky);
+	WIN1251(mn);
+	WIN1251(mk);
+	WIN1251(mo);
+
+	// Everything below is a three-letter code.
+	if (lang_code.size() < 3) {
+		return;
+	}
+
+	WIN1251(mol);
+
+	WIN1252(ast);
+	WIN1252(jbo);
+	WIN1252(gla);
+	WIN1252(sco);
+	WIN1252(sma);
+	WIN1252(roo);
+	WIN1252(swa);
+	WIN1252(tsn);
+	WIN1252(tok);
+	// "fil" used to resolve to Windows-1252 only because it shares a prefix with
+	// "fi"; listed explicitly so it keeps that fallback under exact matching.
+	WIN1252(fil);
+
+#undef WIN1251
+#undef WIN1252
+}
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/ParseHandler.hpp b/src/openvic-dataloader/detail/ParseHandler.hpp index fbec0d7..9666a5b 100644 --- a/src/openvic-dataloader/detail/ParseHandler.hpp +++ b/src/openvic-dataloader/detail/ParseHandler.hpp @@ -1,20 +1,26 @@ #pragma once +#include <cstddef> +#include <optional> +#include <string> #include <utility> -#include <openvic-dataloader/ParseState.hpp> -#include <openvic-dataloader/detail/utility/Concepts.hpp> +#include <openvic-dataloader/detail/Concepts.hpp> #include <lexy/encoding.hpp> #include <lexy/input/buffer.hpp> #include <lexy/input/file.hpp> +#include "openvic-dataloader/detail/Encoding.hpp" +#include "openvic-dataloader/detail/Utility.hpp" + #include "detail/BufferError.hpp" +#include "detail/Detect.hpp" +#include "detail/InternalConcepts.hpp" namespace ovdl::detail { - template<typename Derived> struct ParseHandler { - std::string make_error_from(buffer_error error) { + std::string make_error_from(buffer_error error) const { switch (error) { using enum ovdl::detail::buffer_error; case buffer_is_null: @@ -30,116 +36,179 @@ namespace ovdl::detail { } } - template<typename... Args> - constexpr void _run_load_func(detail::LoadCallback<Derived, Args...> auto func, Args... 
args); - }; - - template<IsFileParseState ParseState, typename MemoryResource = void> - struct BasicFileParseHandler : ParseHandler<BasicFileParseHandler<ParseState, MemoryResource>> { - using parse_state_type = ParseState; - using encoding_type = typename parse_state_type::file_type::encoding_type; - constexpr bool is_valid() const { - if (!_parse_state) return false; - return buffer().data() != nullptr; + return is_valid_impl(); } - constexpr buffer_error load_buffer_size(const char* data, std::size_t size) { - lexy::buffer<encoding_type, MemoryResource> buffer(data, size); + buffer_error load_buffer_size(const char* data, std::size_t size, std::optional<Encoding> fallback) { + lexy::buffer<lexy::default_encoding> buffer(data, size); if (buffer.data() == nullptr) return buffer_error::buffer_is_null; - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + return load_buffer_impl(std::move(buffer), "", fallback); } - constexpr buffer_error load_buffer(const char* start, const char* end) { - lexy::buffer<encoding_type, MemoryResource> buffer(start, end); + buffer_error load_buffer(const char* start, const char* end, std::optional<Encoding> fallback) { + lexy::buffer<lexy::default_encoding> buffer(start, end); if (buffer.data() == nullptr) return buffer_error::buffer_is_null; - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? 
buffer_error::success : buffer_error::buffer_is_null; + return load_buffer_impl(std::move(buffer), "", fallback); } - buffer_error load_file(const char* path) { - lexy::read_file_result file = lexy::read_file<encoding_type, lexy::encoding_endianness::bom, MemoryResource>(path); + buffer_error load_file(const char* path, std::optional<Encoding> fallback) { + lexy::read_file_result file = lexy::read_file<lexy::default_encoding, lexy::encoding_endianness::bom>(path); + if (!file) { - _parse_state.reset(new parse_state_type { path, lexy::buffer<typename parse_state_type::file_type::encoding_type>() }); return ovdl::detail::from_underlying<buffer_error>(ovdl::detail::to_underlying(file.error())); } - _parse_state.reset(new parse_state_type { path, std::move(file).buffer() }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + + return load_buffer_impl(std::move(file).buffer(), path, fallback); } const char* path() const { + return path_impl(); + } + + static Encoding get_system_fallback() { + return _system_fallback_encoding.value_or(Encoding::Unknown); + } + + virtual ~ParseHandler() = default; + + protected: + constexpr virtual bool is_valid_impl() const = 0; + constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path = "", std::optional<Encoding> fallback = std::nullopt) = 0; + virtual const char* path_impl() const = 0; + + template<detail::IsStateType State, detail::IsEncoding BufferEncoding> + static constexpr auto generate_state = [](std::optional<State>* state, const char* path, auto&& buffer, Encoding encoding) { + if (path[0] != '\0') { + state->emplace( + path, + lexy::buffer<BufferEncoding>(std::move(buffer)), + encoding); + return; + } + state->emplace(lexy::buffer<BufferEncoding>(std::move(buffer)), encoding); + }; + + template<detail::IsStateType State> + static void create_state(std::optional<State>* state, const char* path, lexy::buffer<lexy::default_encoding>&& buffer, 
std::optional<Encoding> fallback) { + if (!_system_fallback_encoding.has_value()) { + _detect_system_fallback_encoding(); + } + bool is_bad_fallback = false; + if (fallback.has_value()) { + is_bad_fallback = fallback.value() == Encoding::Ascii || fallback.value() == Encoding::Utf8; + if (is_bad_fallback) + fallback = _system_fallback_encoding.value(); + } else { + fallback = _system_fallback_encoding.value(); + } + auto [encoding, is_alone] = encoding_detect::Detector { .default_fallback = fallback.value() }.detect_assess(buffer); + switch (encoding) { + using enum Encoding; + case Ascii: + case Utf8: { + generate_state<State, lexy::utf8_char_encoding>(state, path, std::move(buffer), encoding); + break; + } + case Unknown: + case Windows1251: + case Windows1252: { + generate_state<State, lexy::default_encoding>(state, path, std::move(buffer), encoding); + break; + } + default: + ovdl::detail::unreachable(); + } + + if (!is_alone) { + (*state)->logger().info("encoding type could not be distinguished"); + } + + if (is_bad_fallback) { + (*state)->logger().warning("fallback encoding cannot be ascii or utf8"); + } + + if (encoding == ovdl::detail::Encoding::Unknown) { + (*state)->logger().warning("could not detect encoding"); + } + } + + private: + inline static std::optional<Encoding> _system_fallback_encoding = std::nullopt; + static void _detect_system_fallback_encoding(); + }; + + template<detail::IsFileParseState ParseState> + struct BasicFileParseHandler : ParseHandler { + using parse_state_type = ParseState; + + virtual constexpr bool is_valid_impl() const { + if (!_parse_state) return false; + return _parse_state.value().file().is_valid(); + } + + constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path, std::optional<Encoding> fallback) { + if (buffer.data() == nullptr) return buffer_error::buffer_is_null; + create_state(&_parse_state, path, std::move(buffer), fallback); + return is_valid_impl() ? 
buffer_error::success : buffer_error::buffer_is_null; + } + + virtual const char* path_impl() const { if (!_parse_state) return ""; - return _parse_state->file().path(); + return _parse_state.value().file().path(); } parse_state_type& parse_state() { - return *_parse_state; + return _parse_state.value(); } const parse_state_type& parse_state() const { - return *_parse_state; + return _parse_state.value(); } + template<typename Encoding> constexpr const auto& buffer() const { - return _parse_state->file().buffer(); + return _parse_state.value().file().template get_buffer_as<Encoding>(); } protected: - std::unique_ptr<parse_state_type> _parse_state; + std::optional<parse_state_type> _parse_state; }; - template<IsParseState ParseState, typename MemoryResource = void> - struct BasicStateParseHandler : ParseHandler<BasicStateParseHandler<ParseState, MemoryResource>> { + template<detail::IsParseState ParseState> + struct BasicStateParseHandler : ParseHandler { using parse_state_type = ParseState; - using encoding_type = typename parse_state_type::ast_type::file_type::encoding_type; - constexpr bool is_valid() const { + virtual constexpr bool is_valid_impl() const { if (!_parse_state) return false; - return buffer().data() != nullptr; - } - - constexpr buffer_error load_buffer_size(const char* data, std::size_t size) { - lexy::buffer<encoding_type, MemoryResource> buffer(data, size); - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; - } - - constexpr buffer_error load_buffer(const char* start, const char* end) { - lexy::buffer<encoding_type, MemoryResource> buffer(start, end); - _parse_state.reset(new parse_state_type { std::move(buffer) }); - return is_valid() ? 
buffer_error::success : buffer_error::buffer_is_null; + return _parse_state.value().ast().file().is_valid(); } - buffer_error load_file(const char* path) { - lexy::read_file_result file = lexy::read_file<encoding_type, lexy::encoding_endianness::bom, MemoryResource>(path); - if (!file) { - _parse_state.reset(new parse_state_type { path, lexy::buffer<typename parse_state_type::ast_type::file_type::encoding_type>() }); - return ovdl::detail::from_underlying<buffer_error>(ovdl::detail::to_underlying(file.error())); - } - - _parse_state.reset(new parse_state_type { path, std::move(file).buffer() }); - return is_valid() ? buffer_error::success : buffer_error::buffer_is_null; + constexpr virtual buffer_error load_buffer_impl(lexy::buffer<lexy::default_encoding>&& buffer, const char* path, std::optional<Encoding> fallback) { + if (buffer.data() == nullptr) return buffer_error::buffer_is_null; + create_state(&_parse_state, path, std::move(buffer), fallback); + return is_valid_impl() ? buffer_error::success : buffer_error::buffer_is_null; } - const char* path() const { + virtual const char* path_impl() const { if (!_parse_state) return ""; - return _parse_state->ast().file().path(); + return _parse_state.value().ast().file().path(); } parse_state_type& parse_state() { - return *_parse_state; + return _parse_state.value(); } const parse_state_type& parse_state() const { - return *_parse_state; + return _parse_state.value(); } + template<typename Encoding> constexpr const auto& buffer() const { - return _parse_state->ast().file().buffer(); + return _parse_state.value().ast().file().template get_buffer_as<Encoding>(); } protected: - std::unique_ptr<parse_state_type> _parse_state; + std::optional<parse_state_type> _parse_state; }; }
\ No newline at end of file diff --git a/src/openvic-dataloader/detail/Warnings.hpp b/src/openvic-dataloader/detail/Warnings.hpp index ab718bc..3a0a239 100644 --- a/src/openvic-dataloader/detail/Warnings.hpp +++ b/src/openvic-dataloader/detail/Warnings.hpp @@ -1,18 +1,17 @@ #pragma once +#include <string> #include <string_view> -#include <openvic-dataloader/ParseWarning.hpp> - namespace ovdl::v2script::warnings { inline const std::string make_utf8_warning(std::string_view file_path) { - constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding."; + constexpr std::string_view message_suffix = "This may cause problems. Prefer Windows-1252 encoding:"; std::string message; if (file_path.empty()) { - message = "Buffer is a UTF-8 encoded string. " + std::string(message_suffix); + message = "Buffer is UTF-8 encoded. " + std::string(message_suffix); } else { - message = "File '" + std::string(file_path) + "' is a UTF-8 encoded file. " + std::string(message_suffix); + message = "File is UTF-8 encoded. 
" + std::string(message_suffix); } return message; diff --git a/src/openvic-dataloader/detail/dsl.hpp b/src/openvic-dataloader/detail/dsl.hpp index ccc1af6..fd8981a 100644 --- a/src/openvic-dataloader/detail/dsl.hpp +++ b/src/openvic-dataloader/detail/dsl.hpp @@ -1,16 +1,20 @@ #pragma once +#include <concepts> // IWYU pragma: keep #include <type_traits> #include <openvic-dataloader/NodeLocation.hpp> -#include <openvic-dataloader/ParseState.hpp> +#include <lexy/_detail/config.hpp> #include <lexy/callback/adapter.hpp> #include <lexy/callback/bind.hpp> #include <lexy/callback/container.hpp> #include <lexy/callback/fold.hpp> #include <lexy/dsl.hpp> +#include <lexy/dsl/literal.hpp> +#include <lexy/encoding.hpp> +#include "detail/InternalConcepts.hpp" #include "detail/StringLiteral.hpp" namespace ovdl::dsl { @@ -20,10 +24,46 @@ namespace ovdl::dsl { } template<typename Sink> - constexpr auto sink(Sink sink) { + constexpr auto bind_sink(Sink sink) { return lexy::bind_sink(sink, lexy::parse_state); } + template<typename ReturnT, typename Sink> + struct _sink_with_state { + using return_type = ReturnT; + + LEXY_EMPTY_MEMBER Sink _sink_cb; + + template<detail::IsStateType StateType, typename SinkCallback> + struct _sink_callback { + StateType& _state; + SinkCallback _sink_cb; + + using return_type = decltype(LEXY_MOV(_sink_cb).finish()); + + template<typename... Args> + constexpr void operator()(Args&&... args) { + lexy::_detail::invoke(_sink_cb, _state, LEXY_FWD(args)...); + } + + constexpr return_type finish() && { return LEXY_MOV(_sink_cb).finish(); } + }; + + template<typename... Args> + constexpr auto operator()(detail::IsStateType auto& state, Args... 
args) const -> decltype(_sink_cb(state, LEXY_FWD(args)...)) { + return _sink_cb(state, LEXY_FWD(args)...); + } + + constexpr auto sink(detail::IsStateType auto& state) const { + return _sink_callback<std::decay_t<decltype(state)>, decltype(_sink_cb.sink())> { state, _sink_cb.sink() }; + } + }; + + template<typename ReturnT, typename Sink> + constexpr auto sink(Sink&& sink) { + return bind_sink(_sink_with_state<ReturnT, Sink> { LEXY_FWD(sink) }); + } + template<typename Container, typename Callback> constexpr auto collect(Callback callback) { return sink(lexy::collect<Container>(callback)); @@ -34,49 +74,76 @@ namespace ovdl::dsl { return sink(lexy::collect(callback)); } - template<IsParseState StateType, typename T> + template<typename T> constexpr auto construct = callback<T*>( - [](StateType& state, ovdl::NodeLocation loc, auto&& arg) { - if constexpr (std::is_same_v<std::decay_t<decltype(arg)>, lexy::nullopt>) + [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&& arg) { + if constexpr (std::same_as<std::decay_t<decltype(arg)>, lexy::nullopt>) return state.ast().template create<T>(loc); else return state.ast().template create<T>(loc, DRYAD_FWD(arg)); }, - [](StateType& state, ovdl::NodeLocation loc, auto&&... args) { + [](detail::IsParseState auto& state, ovdl::NodeLocation loc, auto&&... 
args) { return state.ast().template create<T>(loc, DRYAD_FWD(args)...); }); - template<IsParseState StateType, typename T, typename ListType, bool DisableEmpty = false> + template<typename T, typename ListType, bool DisableEmpty = false> constexpr auto construct_list = callback<T*>( - [](StateType& state, const char* begin, ListType&& arg, const char* end) { + [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(arg)); }, - [](StateType& state, const char* begin, lexy::nullopt, const char* end) { + [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end)); }, - [](StateType& state, const char* begin, const char* end) { + [](detail::IsParseState auto& state, const char* begin, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end)); + }, + [](detail::IsParseState auto& state) { + return nullptr; }); - template<IsParseState StateType, typename T, typename ListType> - constexpr auto construct_list<StateType, T, ListType, true> = callback<T*>( - [](StateType& state, const char* begin, ListType&& arg, const char* end) { + template<typename T, typename ListType> + constexpr auto construct_list<T, ListType, true> = callback<T*>( + [](detail::IsParseState auto& state, const char* begin, ListType&& arg, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end), DRYAD_FWD(arg)); }, - [](StateType& state, const char* begin, lexy::nullopt, const char* end) { + [](detail::IsParseState auto& state, const char* begin, lexy::nullopt, const char* end) { return state.ast().template create<T>(NodeLocation::make_from(begin, end)); }); - template<unsigned char LOW, unsigned char HIGH> - consteval auto make_range() { - if constexpr (LOW == HIGH) { - return ::lexy::dsl::lit_c<LOW>; - } else 
if constexpr (LOW == (HIGH - 1)) { - return ::lexy::dsl::lit_c<LOW> / ::lexy::dsl::lit_c<HIGH>; - } else { - return ::lexy::dsl::lit_c<LOW> / make_range<LOW + 1, HIGH>(); + template<typename CharT, CharT LowC, CharT HighC> + struct _crange : lexyd::char_class_base<_crange<CharT, LowC, HighC>> { + static_assert(LowC >= 0, "LowC cannot be less than 0"); + static_assert(HighC - LowC > 0, "LowC must be less than HighC"); + + static constexpr auto char_class_unicode() { + return LowC <= 0x7F && HighC <= 0x7F; } - } + + static LEXY_CONSTEVAL auto char_class_name() { + return "range"; + } + + static LEXY_CONSTEVAL auto char_class_ascii() { + lexy::_detail::ascii_set result; + if constexpr (LowC <= 0x7F && HighC <= 0x7F) + for (auto c = LowC; c <= HighC; c++) + result.insert(c); + return result; + } + + static constexpr auto char_class_match_cp([[maybe_unused]] char32_t cp) { + if constexpr (LowC <= 0x7F && HighC <= 0x7F) + return std::false_type {}; + else + return LowC <= cp && cp <= HighC; + } + }; + + template<auto LowC, decltype(LowC) HighC> + constexpr auto lit_c_range = _crange<LEXY_DECAY_DECLTYPE(LowC), LowC, HighC> {}; + + template<unsigned char LowC, unsigned char HighC> + constexpr auto lit_b_range = _crange<unsigned char, LowC, HighC> {}; template<auto Open, auto Close> constexpr auto position_brackets = lexy::dsl::brackets(lexy::dsl::position(lexy::dsl::lit_c<Open>), lexy::dsl::position(lexy::dsl::lit_c<Close>)); @@ -89,14 +156,13 @@ namespace ovdl::dsl { template<typename Production> constexpr auto p = lexy::dsl::position(lexy::dsl::p<Production>); - template<IsParseState ParseType, typename ReturnType, ovdl::detail::string_literal Keyword> + template<typename ReturnType, ovdl::detail::string_literal Keyword> static constexpr auto default_kw_value = dsl::callback<ReturnType*>( - [](ParseType& state, NodeLocation loc) { + [](detail::IsParseState auto& state, NodeLocation loc) { return state.ast().template create<ReturnType>(loc, 
state.ast().intern(Keyword.data(), Keyword.size())); }); template< - IsParseState ParseType, auto Identifier, typename RuleValue, ovdl::detail::string_literal Keyword, @@ -109,18 +175,17 @@ namespace ovdl::dsl { static constexpr auto value = Value; }; static constexpr auto rule = dsl::p<rule_t> >> Production; - static constexpr auto value = construct<ParseType, RuleValue>; + static constexpr auto value = construct<RuleValue>; }; template< - IsParseState ParseType, auto Identifier, typename RuleValue, ovdl::detail::string_literal Keyword, auto Production, auto Value> - struct fkeyword_rule : keyword_rule<ParseType, Identifier, RuleValue, Keyword, Production, Value> { - using base_type = keyword_rule<ParseType, Identifier, RuleValue, Keyword, Production, Value>; + struct fkeyword_rule : keyword_rule<Identifier, RuleValue, Keyword, Production, Value> { + using base_type = keyword_rule<Identifier, RuleValue, Keyword, Production, Value>; struct context_t; struct rule_t : base_type::rule_t { static constexpr auto flag = lexy::dsl::context_flag<context_t>; @@ -139,7 +204,7 @@ namespace ovdl::dsl { static constexpr auto make_flag = rule_t::flag.create(); static constexpr auto rule = dsl::p<rule_t> >> (rule_t::must >> rule_t::flag.set()) >> Production; - static constexpr auto value = construct<ParseType, RuleValue>; + static constexpr auto value = construct<RuleValue>; }; template<typename... 
Args> @@ -147,4 +212,71 @@ namespace ovdl::dsl { static constexpr auto flags = (Args::make_flag + ...); static constexpr auto p = (lexy::dsl::p<Args> | ...); }; + + template<typename Rule, typename RuleUtf, typename Tag> + struct _peek : lexyd::branch_base { + template<typename Reader> + struct bp { + typename Reader::iterator begin; + typename Reader::marker end; + + constexpr bool try_parse(const void*, Reader reader) { + using encoding = typename Reader::encoding; + + auto parser = [&] { + if constexpr (std::same_as<encoding, lexy::default_encoding> || std::same_as<encoding, lexy::byte_encoding>) { + // We need to match the entire rule. + return lexy::token_parser_for<decltype(lexy::dsl::token(Rule {})), Reader> { reader }; + } else { + // We need to match the entire rule. + return lexy::token_parser_for<decltype(lexy::dsl::token(RuleUtf {})), Reader> { reader }; + } + }(); + + begin = reader.position(); + auto result = parser.try_parse(reader); + end = parser.end; + + return result; + } + + template<typename Context> + constexpr void cancel(Context& context) { + context.on(lexyd::_ev::backtracked {}, begin, end.position()); + } + + template<typename NextParser, typename Context, typename... Args> + LEXY_PARSER_FUNC bool finish(Context& context, Reader& reader, Args&&... args) { + context.on(lexyd::_ev::backtracked {}, begin, end.position()); + return NextParser::parse(context, reader, LEXY_FWD(args)...); + } + }; + + template<typename NextParser> + struct p { + template<typename Context, typename Reader, typename... Args> + LEXY_PARSER_FUNC static bool parse(Context& context, Reader& reader, Args&&... args) { + bp<Reader> impl {}; + if (!impl.try_parse(context.control_block, reader)) { + // Report that we've failed. 
+ using tag = lexy::_detail::type_or<Tag, lexy::peek_failure>; + auto err = lexy::error<Reader, tag>(impl.begin, impl.end.position()); + context.on(lexyd::_ev::error {}, err); + + // But recover immediately, as we wouldn't have consumed anything either way. + } + + context.on(lexyd::_ev::backtracked {}, impl.begin, impl.end); + return NextParser::parse(context, reader, LEXY_FWD(args)...); + } + }; + + template<typename Error> + static constexpr _peek<Rule, RuleUtf, Error> error = {}; + }; + + template<typename Rule, typename RuleUtf> + constexpr auto peek(Rule, RuleUtf) { + return _peek<Rule, RuleUtf, void> {}; + } }
\ No newline at end of file |