#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// The project header is included whenever it is reachable.  The guard only
// exists so that this translation unit can also be compiled on its own (for
// example by a test or fuzzing harness); every standard header the code needs
// is included directly above.
#if defined(__has_include)
#  if __has_include("utf8.h")
#    include "utf8.h"
#  endif
#else
#  include "utf8.h"
#endif

// Returns true when lo <= x <= hi (both bounds inclusive).
static bool in(uint8_t lo, uint8_t x, uint8_t hi)
{
    return lo <= x && x <= hi;
}

// The top bit of each of the 8 bytes in a 64-bit word.  A word ANDed with this
// mask is zero exactly when all 8 bytes are ASCII (<= 0x7F).
static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL;

// 1-byte: U+000000 -- U+00007F: 00          -- 7F
// 2-byte: U+000080 -- U+0007FF: C2 80       -- DF BF
// 3-byte: U+000800 -- U+00FFFF: E0 A0 80    -- EF BF BF
// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
//
// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the
// 3-byte range.
//
// Resulting ranges:
// 1-byte  : U+000000 -- U+00007F: 00          -- 7F
// 2-byte  : U+000080 -- U+0007FF: C2 80       -- DF BF
// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80    -- ED 9F BF
// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80    -- EF BF BF
// 4-byte  : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF

// Advances past a run of ASCII bytes.
//
// Starting at index i (which must be <= length), returns the index of the
// first byte greater than 0x7F, or length if every remaining byte is ASCII.
static size_t skip_ascii(const uint8_t *buf, size_t i, size_t length)
{
    // Byte-wise until the read position is 8-byte aligned, so that the word
    // loop below operates on aligned addresses.
    while (i < length && ((uintptr_t)(buf + i) & 7u) != 0) {
        if (buf[i] > 0x7F) {
            return i;
        }
        i++;
    }

    // Eight bytes at a time.  The word is fetched with memcpy rather than by
    // dereferencing a casted pointer: reading a char buffer through a
    // uint64_t lvalue violates the effective-type (strict aliasing) rules.
    // Compilers turn this fixed-size memcpy into a single load.
    while (length - i >= sizeof(uint64_t)) {
        uint64_t word;
        memcpy(&word, buf + i, sizeof word);
        if ((word & TOP_BIT_MASK_64) != 0) {
            break;
        }
        i += sizeof word;
    }

    // Tail, and the exact position of a non-ASCII byte inside the last word.
    while (i < length && buf[i] <= 0x7F) {
        i++;
    }
    return i;
}

// Checks whether the first `length` bytes at `buf_` are well-formed UTF-8.
//
// Overlong encodings, encoded surrogates (U+D800 -- U+DFFF), code points
// above U+10FFFF, stray continuation bytes and truncated sequences are all
// rejected.  Embedded NUL bytes are ordinary ASCII and are accepted.
//
// Returns true for valid input (including length == 0), false otherwise.
bool validate_utf8(const char *buf_, size_t length)
{
    const uint8_t *buf = (const uint8_t *)buf_;
    size_t i = 0;

    while (i < length) {
        const uint8_t a = buf[i];

        // 1-byte codepoints, i.e. ASCII values: skip the whole run at once.
        if (a <= 0x7F) {
            i = skip_ascii(buf, i + 1, length);
            continue;
        }

        // Number of bytes available from the lead byte onwards (>= 1).
        const size_t remaining = length - i;

        // 2-byte
        if (remaining < 2) {
            return false;
        }
        const uint8_t b = buf[i + 1];
        if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) {
            i += 2;
            continue;
        }

        // 3-byte
        if (remaining < 3) {
            return false;
        }
        const uint8_t c = buf[i + 2];
        if (!in(0x80, c, 0xBF)) {
            return false;
        }
        if ((a == 0xE0 && in(0xA0, b, 0xBF))            // no overlongs
            || (in(0xE1, a, 0xEC) && in(0x80, b, 0xBF))
            || (a == 0xED && in(0x80, b, 0x9F))         // no surrogates
            || (in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) {
            i += 3;
            continue;
        }

        // 4-byte
        if (remaining < 4) {
            return false;
        }
        const uint8_t d = buf[i + 3];
        if (!in(0x80, d, 0xBF)) {
            return false;
        }
        if ((a == 0xF0 && in(0x90, b, 0xBF))            // no overlongs
            || (in(0xF1, a, 0xF3) && in(0x80, b, 0xBF))
            || (a == 0xF4 && in(0x80, b, 0x8F))) {      // <= U+10FFFF
            i += 4;
            continue;
        }

        // Invalid lead byte (80..C1, F5..FF) or invalid second byte.
        return false;
    }

    return true;
}