1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
#include <stdint.h>
#include <string.h>
#include "utf8.h"
// Range test on a single byte: true exactly when lo <= x <= hi (both ends
// inclusive). An empty range (lo > hi) therefore never matches.
static bool in(uint8_t lo, uint8_t x, uint8_t hi) {
    if (x < lo) {
        return false;
    }
    return !(hi < x);
}
// Bit 7 of each of the eight bytes in a 64-bit word. ANDing a word with this
// mask yields zero exactly when every byte in the word is <= 0x7F.
static const uint64_t TOP_BIT_MASK_64 = UINT64_C(0x8080808080808080);
// 1-byte: U+000000 -- U+00007F: 00 -- 7F
// 2-byte: U+000080 -- U+0007FF: C2 80 -- DF BF
// 3-byte: U+000800 -- U+00FFFF: E0 A0 80 -- EF BF BF
// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the
// 3-byte range.
//
// Resulting ranges:
// 1-byte : U+000000 -- U+00007F: 00 -- 7F
// 2-byte : U+000080 -- U+0007FF: C2 80 -- DF BF
// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80 -- ED 9F BF
// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80 -- EF BF BF
// 4-byte : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
// Reports whether the `length` bytes starting at `buf_` are well-formed UTF-8.
//
// Parameters:
//   buf_   - bytes to check; it is not dereferenced when `length` is 0.
//   length - number of bytes to check.
//
// Returns true when every byte belongs to a complete, valid sequence, and
// false at the first violation. Empty input is valid. Rejected inputs include:
//   - stray continuation bytes and the lead bytes C0, C1 and F5..FF,
//   - overlong forms (E0 followed by 80..9F, F0 followed by 80..8F),
//   - surrogate code points U+D800..U+DFFF (ED followed by A0..BF),
//   - code points above U+10FFFF (F4 followed by 90..BF),
//   - sequences cut short by the end of the buffer.
bool validate_utf8(const char *buf_, size_t length) {
    const uint8_t *buf = (const uint8_t *)buf_;
    size_t i = 0;

    while (i < length) {
        const uint8_t a = buf[i];

        // 1-byte codepoints (ASCII): consume this byte and the rest of the
        // ASCII run that follows it.
        if (a <= 0x7F) {
            i++;

            // Fast path: examine eight bytes per step. The word is loaded
            // with memcpy so the read is valid for any alignment and does
            // not access the byte buffer through a uint64_t lvalue.
            // `length - i` cannot wrap because i <= length here.
            while (length - i >= sizeof(uint64_t)) {
                uint64_t word;
                memcpy(&word, buf + i, sizeof word);
                if ((word & TOP_BIT_MASK_64) != 0) {
                    break;
                }
                i += sizeof word;
            }

            // Byte-wise tail: finishes the run up to the first non-ASCII
            // byte or the end of the buffer. 0x7F (DEL) is ASCII.
            while (i < length && buf[i] <= 0x7F) {
                i++;
            }
            continue;
        }

        // Bytes available from the lead byte onwards (always >= 1 here).
        const size_t remaining = length - i;

        // 2-byte: C2..DF followed by one continuation byte.
        if (remaining < 2) {
            return false;
        }
        const uint8_t b = buf[i + 1];
        if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) {
            i += 2;
            continue;
        }

        // 3-byte: the allowed range of the second byte depends on the lead
        // byte, which excludes overlong forms (E0) and surrogates (ED).
        if (remaining < 3) {
            return false;
        }
        const uint8_t c = buf[i + 2];
        if (!in(0x80, c, 0xBF)) {
            return false;
        }
        if ((a == 0xE0 && in(0xA0, b, 0xBF)) ||
            (in(0xE1, a, 0xEC) && in(0x80, b, 0xBF)) ||
            (a == 0xED && in(0x80, b, 0x9F)) ||
            (in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) {
            i += 3;
            continue;
        }

        // 4-byte: the second byte is restricted to exclude overlong forms
        // (F0) and code points above U+10FFFF (F4).
        if (remaining < 4) {
            return false;
        }
        const uint8_t d = buf[i + 3];
        if (!in(0x80, d, 0xBF)) {
            return false;
        }
        if ((a == 0xF0 && in(0x90, b, 0xBF)) ||
            (in(0xF1, a, 0xF3) && in(0x80, b, 0xBF)) ||
            (a == 0xF4 && in(0x80, b, 0x8F))) {
            i += 4;
            continue;
        }

        // No valid sequence starts with this lead byte / second byte pair.
        return false;
    }

    return true;
}
|