aboutsummaryrefslogtreecommitdiff
path: root/src/openvic-dataloader/detail/DetectUtf8.hpp
blob: e9d0350a16cbaf505b8e9324651fa9dcdbd6820f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#pragma once

#include <lexy/action/match.hpp>
#include <lexy/dsl.hpp>

#include "detail/dsl.hpp"

namespace ovdl::detail {
   namespace detect_utf8 {

      template<bool INCLUDE_ASCII>
      struct DetectUtf8 {
         struct not_utf8 {
            static constexpr auto name = "not utf8";
         };

         static constexpr auto rule = [] {
            constexpr auto is_not_ascii_flag = lexy::dsl::context_flag<DetectUtf8>;

            // & 0b10000000 == 0b00000000
            constexpr auto ascii_values = dsl::make_range<0b00000000, 0b01111111>();
            // & 0b11100000 == 0b11000000
            constexpr auto two_byte = dsl::make_range<0b11000000, 0b11011111>();
            // & 0b11110000 == 0b11100000
            constexpr auto three_byte = dsl::make_range<0b11100000, 0b11101111>();
            // & 0b11111000 == 0b11110000
            constexpr auto four_byte = dsl::make_range<0b11110000, 0b11110111>();
            // & 0b11000000 == 0b10000000
            constexpr auto check_bytes = dsl::make_range<0b10000000, 0b10111111>();

            constexpr auto utf8_check =
               ((four_byte >> lexy::dsl::times<3>(check_bytes)) |
                  (three_byte >> lexy::dsl::times<2>(check_bytes)) |
                  (two_byte >> lexy::dsl::times<1>(check_bytes))) >>
               is_not_ascii_flag.set();

            return is_not_ascii_flag.template create<INCLUDE_ASCII>() +
                  lexy::dsl::while_(utf8_check | ascii_values) +
                  lexy::dsl::must(is_not_ascii_flag.is_set()).template error<not_utf8>;
         }();
      };
   }

   template<typename Input>
   constexpr bool is_utf8_no_ascii(const Input& input) {
      return lexy::match<detect_utf8::DetectUtf8<false>>(input);
   }

   template<typename Input>
   constexpr bool is_utf8(const Input& input) {
      return lexy::match<detect_utf8::DetectUtf8<true>>(input);
   }
}