diff options
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 91 |
1 files changed, 91 insertions, 0 deletions
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// utf8.h carries the public prototype of validate_utf8(). It is included
// whenever the preprocessor can find it; the guard only exists so that this
// translation unit can also be compiled on its own (e.g. by a test harness
// that does not have the project headers on its include path).
#ifdef __has_include
#  if __has_include("utf8.h")
#    include "utf8.h"
#  endif
#else
#  include "utf8.h"
#endif


// Returns true when lo <= x <= hi (both bounds inclusive).
static bool in(uint8_t lo, uint8_t x, uint8_t hi) {
    return lo <= x && x <= hi;
}

// Bit 7 of each of the eight bytes of a 64-bit word. A word ANDed with this
// mask is zero exactly when all eight bytes are <= 0x7F, i.e. ASCII.
static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL;

// Skips over a run of ASCII bytes (00 -- 7F) starting at buf[i].
//
// Returns the index of the first byte >= 0x80 at or after i, or `length` when
// every remaining byte is ASCII. Requires i <= length.
static size_t skip_ascii(const uint8_t *buf, size_t i, size_t length) {
    // Go byte by byte until buf + i sits on an 8-byte boundary, so that the
    // word-sized reads below start at aligned addresses.
    while (i < length && ((uintptr_t)(buf + i) & 7) != 0) {
        if (buf[i] > 0x7F) return i;
        i++;
    }

    // Test eight bytes at a time. The word is fetched with memcpy rather than
    // by dereferencing a casted uint64_t pointer: the buffer is a char array,
    // and reading it through a uint64_t lvalue would violate the aliasing
    // rules. `length - i` cannot wrap because i <= length holds here.
    while (length - i >= 8) {
        uint64_t word;
        memcpy(&word, buf + i, sizeof word);
        if ((word & TOP_BIT_MASK_64) != 0) break;
        i += 8;
    }

    // Finish the tail, or locate the exact non-ASCII byte inside the word
    // that stopped the loop above.
    while (i < length && buf[i] <= 0x7F) {
        i++;
    }

    return i;
}


// 1-byte: U+000000 -- U+00007F: 00          -- 7F
// 2-byte: U+000080 -- U+0007FF: C2 80       -- DF BF
// 3-byte: U+000800 -- U+00FFFF: E0 A0 80    -- EF BF BF
// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the
// 3-byte range.
//
// Resulting ranges:
// 1-byte  : U+000000 -- U+00007F: 00          -- 7F
// 2-byte  : U+000080 -- U+0007FF: C2 80       -- DF BF
// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80    -- ED 9F BF
// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80    -- EF BF BF
// 4-byte  : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Checks whether the `length` bytes starting at `buf_` form a sequence of
// encodings drawn only from the ranges listed above.
//
// buf_:   bytes to check; never dereferenced when length is 0.
// length: number of bytes to check. No terminator is looked for, and a 00
//         byte is treated like any other 1-byte codepoint.
//
// Returns true if the whole buffer is valid (an empty buffer is valid), and
// false at the first byte sequence that falls outside the ranges above or
// that is cut off by the end of the buffer.
bool validate_utf8(const char *buf_, size_t length) {
    const uint8_t *buf = (const uint8_t *)buf_;

    size_t i = 0;
    while (i < length) {
        const uint8_t a = buf[i];

        // 1-byte: consume this byte and the whole ASCII run that follows it.
        if (a <= 0x7F) {
            i = skip_ascii(buf, i + 1, length);
            continue;
        }

        // Every remaining valid sequence has at least one trailing byte.
        // (`length - i` is >= 1 here, so the subtraction cannot wrap.)

        // 2-byte
        if (length - i < 2) return false;
        const uint8_t b = buf[i + 1];
        if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) {
            i += 2;
            continue;
        }

        // 3-byte; the third byte is a plain continuation byte for both the
        // 3-byte and the 4-byte forms, so it is checked once up front.
        if (length - i < 3) return false;
        const uint8_t c = buf[i + 2];
        if (!in(0x80, c, 0xBF)) return false;
        if ((a == 0xE0 && in(0xA0, b, 0xBF)) ||         // lower bound: U+0800
            (in(0xE1, a, 0xEC) && in(0x80, b, 0xBF)) ||
            (a == 0xED && in(0x80, b, 0x9F)) ||         // stop before U+D800
            (in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) {
            i += 3;
            continue;
        }

        // 4-byte
        if (length - i < 4) return false;
        const uint8_t d = buf[i + 3];
        if (!in(0x80, d, 0xBF)) return false;
        if ((a == 0xF0 && in(0x90, b, 0xBF)) ||         // lower bound: U+10000
            (in(0xF1, a, 0xF3) && in(0x80, b, 0xBF)) ||
            (a == 0xF4 && in(0x80, b, 0x8F))) {         // upper bound: U+10FFFF
            i += 4;
            continue;
        }

        // Stray continuation byte, C0/C1, F5..FF, or a bad second byte.
        return false;
    }

    return true;
}