aboutsummaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 91
1 files changed, 91 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..0a06002
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,91 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "utf8.h"
+
+
// Returns true when lo <= x <= hi (both bounds inclusive).
static bool in(uint8_t lo, uint8_t x, uint8_t hi) {
    return lo <= x && x <= hi;
}

// Bit 7 of every byte of a 64-bit word. A byte is ASCII iff its bit 7 is
// clear, so (word & TOP_BIT_MASK_64) == 0 means all 8 bytes are ASCII.
static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL;

// Returns true when the 8 bytes starting at p are all ASCII (<= 0x7F).
//
// The word is loaded with memcpy rather than through a casted uint64_t
// pointer: that keeps the access valid for any alignment of p and avoids
// reading the byte buffer through an incompatible lvalue type.
static bool is_ascii_word(const uint8_t *p) {
    uint64_t word;
    memcpy(&word, p, sizeof word);
    return (word & TOP_BIT_MASK_64) == 0;
}

// Checks whether the `length` bytes starting at `buf_` are well-formed UTF-8.
//
// Returns true when every byte belongs to a valid sequence (an empty buffer
// is valid) and false as soon as a malformed, overlong, surrogate,
// out-of-range or truncated sequence is found. The buffer does not need to
// be NUL-terminated; a 0x00 byte is treated as ordinary ASCII.
//
// Encodings by length:
// 1-byte: U+000000 -- U+00007F: 00 -- 7F
// 2-byte: U+000080 -- U+0007FF: C2 80 -- DF BF
// 3-byte: U+000800 -- U+00FFFF: E0 A0 80 -- EF BF BF
// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so the 3-byte range
// has to be split.
//
// Resulting ranges:
// 1-byte  : U+000000 -- U+00007F: 00 -- 7F
// 2-byte  : U+000080 -- U+0007FF: C2 80 -- DF BF
// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80 -- ED 9F BF
// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80 -- EF BF BF
// 4-byte  : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
bool validate_utf8(const char *buf_, size_t length) {
    const uint8_t *buf = (const uint8_t *)buf_;

    size_t i = 0;
    while (i < length) {
        const uint8_t a = buf[i];

        // 1-byte codepoints, i.e. ASCII values.
        if (a <= 0x7F) {
            i++;

            // Fast path: skip whole 8-byte words that contain only ASCII.
            // `length - i` cannot wrap because i <= length here.
            while (length - i >= 8 && is_ascii_word(buf + i)) {
                i += 8;
            }

            // Finish the run one byte at a time (tail of the buffer, or the
            // ASCII bytes in front of the first non-ASCII byte of a word).
            while (i < length && buf[i] <= 0x7F) {
                i++;
            }
            continue;
        }

        // Multi-byte sequence: the lead byte fixes the sequence length and
        // the range allowed for the second byte. The second-byte range is
        // narrowed for the leads where the full 80..BF range would admit
        // overlong forms (E0, F0), surrogates (ED) or values above U+10FFFF
        // (F4).
        size_t seq_len;
        uint8_t second_lo = 0x80;
        uint8_t second_hi = 0xBF;

        if (in(0xC2, a, 0xDF)) {
            seq_len = 2;
        } else if (in(0xE0, a, 0xEF)) {
            seq_len = 3;
            if (a == 0xE0) {
                second_lo = 0xA0;
            } else if (a == 0xED) {
                second_hi = 0x9F;
            }
        } else if (in(0xF0, a, 0xF4)) {
            seq_len = 4;
            if (a == 0xF0) {
                second_lo = 0x90;
            } else if (a == 0xF4) {
                second_hi = 0x8F;
            }
        } else {
            // Stray continuation byte (80..BF), overlong lead (C0, C1) or a
            // lead beyond the Unicode range (F5..FF).
            return false;
        }

        // Truncated sequence at the end of the buffer.
        if (length - i < seq_len) {
            return false;
        }

        if (!in(second_lo, buf[i + 1], second_hi)) {
            return false;
        }

        // Any remaining bytes must be plain continuation bytes.
        for (size_t k = 2; k < seq_len; k++) {
            if (!in(0x80, buf[i + k], 0xBF)) {
                return false;
            }
        }

        i += seq_len;
    }

    return true;
}