aboutsummaryrefslogtreecommitdiff
path: root/utf8.c
blob: 0a0600230d47f597a54f359932c68a3af3a6eb5e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include <stdint.h>
#include <string.h>
#include "utf8.h"


// Inclusive range test on a byte: true when lo <= x <= hi.
static bool in(uint8_t lo, uint8_t x, uint8_t hi) {
	return lo <= x && x <= hi;
}

// Selects the most significant bit of every byte in a 64-bit word. A word of
// input ANDed with this mask is zero exactly when all 8 bytes are <= 0x7F,
// independent of byte order.
static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL;

// Returns the index of the first byte at or after `i` that is not ASCII
// (i.e. > 0x7F), or `length` if every remaining byte is ASCII.
// Requires i <= length.
static size_t skip_ascii(const uint8_t *buf, size_t i, size_t length) {
	// Examine 8 bytes at a time. The word is loaded with memcpy instead of
	// dereferencing a uint64_t* cast of the byte pointer: the cast violates the
	// effective-type (strict aliasing) rule and may be misaligned, both of
	// which are undefined behaviour. `length - i` cannot wrap because
	// i <= length.
	while (length - i >= sizeof(uint64_t)) {
		uint64_t word;
		memcpy(&word, buf + i, sizeof word);
		if ((word & TOP_BIT_MASK_64) != 0) {
			break;
		}
		i += sizeof word;
	}

	// Byte-wise tail: either fewer than 8 bytes remain, or the last word held
	// a non-ASCII byte whose exact position still has to be found.
	while (i < length && buf[i] <= 0x7F) {
		i++;
	}

	return i;
}


// 1-byte: U+000000 -- U+00007F: 00 -- 7F
// 2-byte: U+000080 -- U+0007FF: C2 80 -- DF BF
// 3-byte: U+000800 -- U+00FFFF: E0 A0 80 -- EF BF BF
// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the
// 3-byte range.
//
// Resulting ranges:
// 1-byte  : U+000000 -- U+00007F: 00 -- 7F
// 2-byte  : U+000080 -- U+0007FF: C2 80 -- DF BF
// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80 -- ED 9F BF
// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80 -- EF BF BF
// 4-byte  : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Checks whether the `length` bytes starting at `buf_` form well-formed UTF-8
// according to the ranges above (overlong encodings, surrogates and values
// above U+10FFFF are rejected, as are truncated sequences).
//
// buf_:   input bytes; not dereferenced when length is 0.
// length: number of bytes to examine. No NUL terminator is needed, and a NUL
//         byte inside the range is treated as the ordinary codepoint U+0000.
//
// Returns true if the whole range is valid (an empty range is valid),
// false otherwise.
bool validate_utf8(const char *buf_, size_t length) {
	const uint8_t *buf = (const uint8_t*)buf_;

	size_t i = 0;
	while (i < length) {
		const uint8_t a = buf[i];

		// 1-byte codepoints: consume the whole ASCII run in one go.
		if (a <= 0x7F) {
			i = skip_ascii(buf, i + 1, length);
			continue;
		}

		// Bytes available from the lead byte onwards; >= 1 because i < length.
		const size_t remaining = length - i;

		// 2-byte
		if (remaining < 2) return false;
		const uint8_t b = buf[i+1];
		if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) {
			i += 2;
			continue;
		}

		// 3-byte
		// Every sequence still possible here is at least 3 bytes long and has
		// a plain continuation byte in third position, so that is checked
		// once for both the 3-byte and the 4-byte forms.
		if (remaining < 3) return false;
		const uint8_t c = buf[i+2];
		if (!in(0x80, c, 0xBF)) return false;
		if ((a == 0xE0 && in(0xA0, b, 0xBF)) ||           // no overlong forms
			(in(0xE1, a, 0xEC) && in(0x80, b, 0xBF)) ||
			(a == 0xED && in(0x80, b, 0x9F)) ||           // no surrogates
			(in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) {
			i += 3;
			continue;
		}

		// 4-byte
		if (remaining < 4) return false;
		const uint8_t d = buf[i+3];
		if (!in(0x80, d, 0xBF)) return false;
		if ((a == 0xF0 && in(0x90, b, 0xBF)) ||           // no overlong forms
			(in(0xF1, a, 0xF3) && in(0x80, b, 0xBF)) ||
			(a == 0xF4 && in(0x80, b, 0x8F))) {           // <= U+10FFFF
			i += 4;
			continue;
		}

		// Stray continuation byte, C0/C1, F5..FF, or a bad second byte.
		return false;
	}

	return true;
}