aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader/detail/DetectUtf8.hpp
diff options
context:
space:
mode:
author Spartan322 <Megacake1234@gmail.com>2023-07-28 06:52:00 +0200
committer Spartan322 <Megacake1234@gmail.com>2023-09-02 14:28:21 +0200
commit7440a5d1433eec4bf87e3723022db187e7f61b1a (patch)
tree2bb062c320fa2227b18956617b94d0e8800420d8 /src/openvic-dataloader/detail/DetectUtf8.hpp
parente941573f47fb867ff75c8a2cf78302b754ffbeee (diff)
Rework Grammar and Parser
Add proper headless binary construction: Includes basic validation Add Error and Warning structs to Parser Add FileNode pointer getter to Parser Change all `char8_t*` and `const char8_t` to `const char*` in Parser Add Parser move operators and Parser deconstructor Add BufferHandler PIMPL object to Parser Add UTF-8 file Warning to v2script Add proper Grammar value retrieval Add AbstractSyntaxTree for v2script data parser: Has compile-time embedded type information accessible at compile-time and runtime Has Tab-based print functionality Fix wrong environment reference for headless construction in SConstruct Add error retrieval Add BasicCallbackOStreamBuffer for callback streaming Add CallbackStreamBuffer for char Add CallbackWStreamBuffer for wchar_t Add BasicCallbackStream Add CallbackStream for char Add CallbackWStream for wchar_t Add grammar for events and decisions Add event_parse to Parser Add decision_parse to Parser Add .clang-format Ignore dirty lexy module Add CSV parser and grammar: Creates std::vector<csv::LineObject> for a list of lines Add BasicParser and BasicBufferHandler to reduce code reduplication
Diffstat (limited to 'src/openvic-dataloader/detail/DetectUtf8.hpp')
-rw-r--r--src/openvic-dataloader/detail/DetectUtf8.hpp53
1 files changed, 53 insertions, 0 deletions
diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp
new file mode 100644
index 0000000..2045b3c
--- /dev/null
+++ b/src/openvic-dataloader/detail/DetectUtf8.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <lexy/action/match.hpp>
+#include <lexy/dsl.hpp>
+
+#include "detail/LexyLitRange.hpp"
+
+namespace ovdl::detail {
+ namespace detect_utf8 {
+
+ template<bool INCLUDE_ASCII>
+ struct DetectUtf8 {
+ struct not_utf8 {
+ static constexpr auto name = "not utf8";
+ };
+
+ static constexpr auto rule = [] {
+ constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>;
+
+ // & 0b10000000 == 0b00000000
+ constexpr auto ascii_values = lexydsl::make_range<0b00000000, 0b01111111>();
+ // & 0b11100000 == 0b11000000
+ constexpr auto two_byte = lexydsl::make_range<0b11000000, 0b11011111>();
+ // & 0b11110000 == 0b11100000
+ constexpr auto three_byte = lexydsl::make_range<0b11100000, 0b11101111>();
+ // & 0b11111000 == 0b11110000
+ constexpr auto four_byte = lexydsl::make_range<0b11110000, 0b11110111>();
+ // & 0b11000000 == 0b10000000
+ constexpr auto check_bytes = lexydsl::make_range<0b10000000, 0b10111111>();
+
+ constexpr auto utf8_check =
+ ((four_byte >> lexy::dsl::times<3>(check_bytes)) |
+ (three_byte >> lexy::dsl::times<2>(check_bytes)) |
+ (two_byte >> lexy::dsl::times<1>(check_bytes))) >>
+ is_not_ascii_flag.set();
+
+ return is_not_ascii_flag.template create<INCLUDE_ASCII>() +
+ lexy::dsl::while_(utf8_check | ascii_values) +
+ lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>;
+ }();
+ };
+ }
+
+ template<typename Input>
+ constexpr bool is_utf8_no_ascii(const Input& input) {
+ return lexy::match<detect_utf8::DetectUtf8<false>>(input);
+ }
+
+ template<typename Input>
+ constexpr bool is_utf8(const Input& input) {
+ return lexy::match<detect_utf8::DetectUtf8<true>>(input);
+ }
+} \ No newline at end of file