diff options
author | Spartan322 <Megacake1234@gmail.com> | 2023-07-28 06:52:00 +0200 |
---|---|---|
committer | Spartan322 <Megacake1234@gmail.com> | 2023-09-02 14:28:21 +0200 |
commit | 7440a5d1433eec4bf87e3723022db187e7f61b1a (patch) | |
tree | 2bb062c320fa2227b18956617b94d0e8800420d8 /src/openvic-dataloader/detail/DetectUtf8.hpp | |
parent | e941573f47fb867ff75c8a2cf78302b754ffbeee (diff) |
Rework Grammar and Parser
Add proper headless binary construction:
Includes basic validation
Add Error and Warning structs to Parser
Add FileNode pointer getter to Parser
Change all `char8_t*` and `const char8_t` to `const char*` in Parser
Add Parser move operators and Parser destructor
Add BufferHandler PIMPL object to Parser
Add UTF-8 file Warning to v2script
Add proper Grammar value retrieval
Add AbstractSyntaxTree for v2script data parser:
Has compile-time embedded type information accessible at compile-time and runtime
Has Tab-based print functionality
Fix wrong environment reference for headless construction in SConstruct
Add error retrieval
Add BasicCallbackOStreamBuffer for callback streaming
Add CallbackStreamBuffer for char
Add CallbackWStreamBuffer for wchar_t
Add BasicCallbackStream
Add CallbackStream for char
Add CallbackWStream for wchar_t
Add grammar for events and decisions
Add event_parse to Parser
Add decision_parse to Parser
Add .clang-format
Ignore dirty lexy module
Add CSV parser and grammar:
Creates std::vector<csv::LineObject> for a list of lines
Add BasicParser and BasicBufferHandler to reduce code duplication
Diffstat (limited to 'src/openvic-dataloader/detail/DetectUtf8.hpp')
-rw-r--r-- | src/openvic-dataloader/detail/DetectUtf8.hpp | 53 |
1 files changed, 53 insertions, 0 deletions
diff --git a/src/openvic-dataloader/detail/DetectUtf8.hpp b/src/openvic-dataloader/detail/DetectUtf8.hpp new file mode 100644 index 0000000..2045b3c --- /dev/null +++ b/src/openvic-dataloader/detail/DetectUtf8.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include <lexy/action/match.hpp> +#include <lexy/dsl.hpp> + +#include "detail/LexyLitRange.hpp" + +namespace ovdl::detail { + namespace detect_utf8 { + + template<bool INCLUDE_ASCII> + struct DetectUtf8 { + struct not_utf8 { + static constexpr auto name = "not utf8"; + }; + + static constexpr auto rule = [] { + constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>; + + // & 0b10000000 == 0b00000000 + constexpr auto ascii_values = lexydsl::make_range<0b00000000, 0b01111111>(); + // & 0b11100000 == 0b11000000 + constexpr auto two_byte = lexydsl::make_range<0b11000000, 0b11011111>(); + // & 0b11110000 == 0b11100000 + constexpr auto three_byte = lexydsl::make_range<0b11100000, 0b11101111>(); + // & 0b11111000 == 0b11110000 + constexpr auto four_byte = lexydsl::make_range<0b11110000, 0b11110111>(); + // & 0b11000000 == 0b10000000 + constexpr auto check_bytes = lexydsl::make_range<0b10000000, 0b10111111>(); + + constexpr auto utf8_check = + ((four_byte >> lexy::dsl::times<3>(check_bytes)) | + (three_byte >> lexy::dsl::times<2>(check_bytes)) | + (two_byte >> lexy::dsl::times<1>(check_bytes))) >> + is_not_ascii_flag.set(); + + return is_not_ascii_flag.template create<INCLUDE_ASCII>() + + lexy::dsl::while_(utf8_check | ascii_values) + + lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>; + }(); + }; + } + + template<typename Input> + constexpr bool is_utf8_no_ascii(const Input& input) { + return lexy::match<detect_utf8::DetectUtf8<false>>(input); + } + + template<typename Input> + constexpr bool is_utf8(const Input& input) { + return lexy::match<detect_utf8::DetectUtf8<true>>(input); + } +}
\ No newline at end of file |