aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader/detail/DetectUtf8.hpp
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com> 2024-05-09 16:06:02 +0200
committer Spartan322 <Megacake1234@gmail.com> 2024-06-18 01:31:12 +0200
commit b0c3ba3f91926b0c95625bdbf4aab69269130b13 (patch)
tree f15ebc47d6bf370031af28e4bb4814ae30ef46e1 /src/openvic-dataloader/detail/DetectUtf8.hpp
parent 7b521d6023113372cf6b02e562828273c4040f0e (diff)
Add runtime encoding detection and conversion (branch: fix/char-detection)
Win-1251/1252 detection is a reduced C++ version of https://github.com/hsivonen/chardetng
Add manually-specified encoding fallback
Add default system encoding fallback
Add error recovery to v2script
Add unknown encoding detection warning
Remove csv::Parser templating
Fix lua files dropping data
Update lexy to foonathan/lexy@1e5d99fa3826b1c3c8628d3a11117fb4fb4cc0d0
Remove exclusive reliance on lexy::default_encoding for v2script
Move internal concepts to src/openvic-detail/InternalConcepts.hpp
Move contents of DetectUtf8.hpp to src/detail/Detect.hpp
Move openvic-dataloader/AbstractSyntaxTree.hpp to src
Move DiagnosticLogger.hpp to src
Move File.hpp to src
Move openvic-dataloader/detail/utlity files to openvic-dataloader/detail
Add ovdl::utility::type_concat
Add ovdl::utility::type_prepend
Add ovdl::utility::is_instance_of
Overhaul parse error messages
Diffstat (limited to 'src/openvic-dataloader/detail/DetectUtf8.hpp')
-rw-r--r-- src/openvic-dataloader/detail/DetectUtf8.hpp 53
1 files changed, 0 insertions, 53 deletions
diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp
deleted file mode 100644
index e9d0350..0000000
--- a/src/openvic-dataloader/detail/DetectUtf8.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include <lexy/action/match.hpp>
-#include <lexy/dsl.hpp>
-
-#include "detail/dsl.hpp"
-
-namespace ovdl::detail {
- namespace detect_utf8 {
-
- template<bool INCLUDE_ASCII>
- struct DetectUtf8 {
- struct not_utf8 {
- static constexpr auto name = "not utf8";
- };
-
- static constexpr auto rule = [] {
- constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>;
-
- // & 0b10000000 == 0b00000000
- constexpr auto ascii_values = dsl::make_range<0b00000000, 0b01111111>();
- // & 0b11100000 == 0b11000000
- constexpr auto two_byte = dsl::make_range<0b11000000, 0b11011111>();
- // & 0b11110000 == 0b11100000
- constexpr auto three_byte = dsl::make_range<0b11100000, 0b11101111>();
- // & 0b11111000 == 0b11110000
- constexpr auto four_byte = dsl::make_range<0b11110000, 0b11110111>();
- // & 0b11000000 == 0b10000000
- constexpr auto check_bytes = dsl::make_range<0b10000000, 0b10111111>();
-
- constexpr auto utf8_check =
- ((four_byte >> lexy::dsl::times<3>(check_bytes)) |
- (three_byte >> lexy::dsl::times<2>(check_bytes)) |
- (two_byte >> lexy::dsl::times<1>(check_bytes))) >>
- is_not_ascii_flag.set();
-
- return is_not_ascii_flag.template create<INCLUDE_ASCII>() +
- lexy::dsl::while_(utf8_check | ascii_values) +
- lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>;
- }();
- };
- }
-
- template<typename Input>
- constexpr bool is_utf8_no_ascii(const Input& input) {
- return lexy::match<detect_utf8::DetectUtf8<false>>(input);
- }
-
- template<typename Input>
- constexpr bool is_utf8(const Input& input) {
- return lexy::match<detect_utf8::DetectUtf8<true>>(input);
- }
-}
\ No newline at end of file