From 831af1d49c9bb7d17794d259c99f92b2513496c5 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sun, 16 May 2021 19:13:05 +0200 Subject: server: WIP utf8 validation implementation --- test/utf8.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 test/utf8.c (limited to 'test/utf8.c') diff --git a/test/utf8.c b/test/utf8.c new file mode 100644 index 0000000..afc7383 --- /dev/null +++ b/test/utf8.c @@ -0,0 +1,219 @@ +#include +#include +#include +#include "test_framework.h" +#include "../global.h" +#include "../utf8.h" + + +// Returns the number of bytes in the utf8 unit, or -1 on invalid input. +// If the parse is successful, puts the bits that are part of the unit being +// parsed in *unit. +static int parse_utf8_prefix_byte(uint8_t b, int64_t *unit) { + if ((b & 0b10000000) == 0b00000000) {*unit = b & 0b01111111; return 1;} + if ((b & 0b11100000) == 0b11000000) {*unit = b & 0b00011111; return 2;} + if ((b & 0b11110000) == 0b11100000) {*unit = b & 0b00001111; return 3;} + if ((b & 0b11111000) == 0b11110000) {*unit = b & 0b00000111; return 4;} + return -1; +} + +// Returns length of the parsed utf8 unit, and puts the parsed value in *unitp. +// No range checking is done. +static int parse_utf8_unit(const uint8_t *buf, size_t length, int64_t *unitp, bool debug) { + if (length == 0) { + if (debug) fprintf(stderr, "[utf8ref] unit at EOS\n"); + return -1; + } + + int64_t unit; + const int num_bytes = parse_utf8_prefix_byte(buf[0], &unit); + assert(num_bytes == -1 || (1 <= num_bytes && num_bytes <= 4)); + if (num_bytes == -1) { + if (debug) fprintf(stderr, "[utf8ref] invalid prefix byte %x\n", (unsigned)buf[0]); + return -1; + } + assert(unit >= 0); + if (length < (size_t)num_bytes) { + if (debug) fprintf(stderr, "[utf8ref] prefix byte %x specifies length %d, but EOS\n", (unsigned)buf[0], num_bytes); + return -1; + } + + for (int i = 1; i < num_bytes; i++) { + if ((buf[i] & 0b11000000) != 0b10000000) { + if (debug) fprintf(stderr, "[utf8ref] invalid continuation byte %x\n", (unsigned)buf[i]); + return -1; + } + unit = (unit << 6) | (buf[i] & 0b00111111); + } + + // check for overlong encodings + if ((num_bytes >= 2 && unit <= 0x7F) || + (num_bytes >= 3 && unit <= 0x7FF) || + (num_bytes >= 4 && unit <= 0xFFFF)) { + if (debug) fprintf(stderr, "[utf8ref] overlong encoding with prefix byte %x\n", (unsigned)buf[0]); + return -1; + } + + *unitp = unit; + return num_bytes; +} + +static bool validate_utf8_reference(const char *buf_, size_t length, bool debug) { + const uint8_t *buf = (const uint8_t*)buf_; + + size_t cursor = 0; + while (cursor < length) { + int64_t unit; + int len = parse_utf8_unit(buf + cursor, length - cursor, &unit, debug); + assert(len == -1 || (1 <= len && len <= 4)); + if (len == -1) return false; + assert(unit >= 0); + // fprintf(stderr, "unit = 0x%lx\n", unit); + + // Surrogate code point + if (0xD800 <= unit && unit <= 0xDFFF) { + if (debug) fprintf(stderr, "[utf8ref] surrogate code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]); + return false; + } + // Maximal unicode value + if (unit > 0x10FFFF) { + if (debug) fprintf(stderr, "[utf8ref] out of range code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]); + return false; + } + + cursor += len; + } + + return true; +} + +// Requires that the buffer has space for at least 4 bytes. +// Returns the number of bytes written. +static int utf8_serialise(char *buf_, int64_t unit) { + uint8_t *buf = (uint8_t*)buf_; + +#define PLACE_CONTINUATION_BYTE(idx_) \ + {buf[(idx_)] = 0x80 | (unit & 0x3F); unit >>= 6;} + + if (unit <= 0x7F) { + buf[0] = unit; + return 1; + } + if (unit <= 0x7FF) { + PLACE_CONTINUATION_BYTE(1); + buf[0] = 0xC0 | (unit & 0x1F); + return 2; + } + if (unit <= 0xFFFF) { + PLACE_CONTINUATION_BYTE(2); + PLACE_CONTINUATION_BYTE(1); + buf[0] = 0xE0 | (unit & 0x0F); + return 3; + } + if (unit <= 0x10FFFF) { + PLACE_CONTINUATION_BYTE(3); + PLACE_CONTINUATION_BYTE(2); + PLACE_CONTINUATION_BYTE(1); + buf[0] = 0xF0 | (unit & 0x07); + return 4; + } + assert(false && "Invalid unit in utf8_serialise"); + +#undef PLACE_CONTINUATION_BYTE +} + +static void fill_random_buffer(char *buf, size_t length) { + size_t i = 0; + while (i + sizeof(long) < length) { + *(long*)&buf[i] = random(); + i += sizeof(long); + } + while (i < length) buf[i++] = random(); +} + +DEFINE_TEST(utf8_unit1) { + EXPECT(validate_utf8("hello", 5)); + EXPECT(validate_utf8_reference("hello", 5, true)); + const char *str = "hello 🧀🇳🇱"; + EXPECT(validate_utf8(str, strlen(str))); + EXPECT(validate_utf8_reference(str, strlen(str), true)); + EXPECT(validate_utf8("\xe0\xad\xbc`j", 5)); + EXPECT(validate_utf8_reference("\xe0\xad\xbc`j", 5, true)); + EXPECT(validate_utf8("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12)); + EXPECT(validate_utf8("\xc7\xaa_", 3)); + EXPECT(validate_utf8_reference("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12, true)); + EXPECT(!validate_utf8("\xf2\x98\xbcx", 4)); + EXPECT(!validate_utf8_reference("\xf2\x98\xbcx", 4, false)); + return 0; +} + +DEFINE_TEST(utf8_random) { + const int max_length = 100; + const int num_tests = 10000000; + + char *buffer = malloc(max_length + 1, char); + for (int test = 0; test < num_tests; test++) { + // fprintf(stderr, "== test = %d\n", test); + const int length = random() % max_length; + fill_random_buffer(buffer, length); + const bool ret_ref = validate_utf8_reference(buffer, length, false); + const bool ret_impl = validate_utf8(buffer, length); + if (ret_ref != ret_impl) { + fprintf(stderr, "buffer: "); + print_buffer(stderr, buffer, length); + fprintf(stderr, "\n"); + fprintf(stderr, "reference -> %d, implementation -> %d\n", ret_ref, ret_impl); + EXPECTRET(1, false && "validate_utf8_reference == validate_utf8"); + } + } + free(buffer); + + return 0; +} + +DEFINE_TEST(utf8_random_valid) { + const int max_length = 100; + const int num_tests = 3000000; + + char *buffer = malloc(max_length + 1, char); + for (int test = 0; test < num_tests; test++) { + int length = random() % max_length; + + int cursor = 0; + while (cursor + 4 <= length) { + const int64_t unit = random() % 0x110000; + if (0xD800 <= unit && unit <= 0xDFFF) continue; // surrogate + cursor += utf8_serialise(buffer + cursor, unit); + } + length = cursor; + + const bool ret_ref = validate_utf8_reference(buffer, length, true); + if (!ret_ref) { + fprintf(stderr, "buffer: "); + print_buffer(stderr, buffer, length); + fprintf(stderr, "\n"); + EXPECTRET(1, false && "validate_utf8_reference on valid string"); + } + + const bool ret_impl = validate_utf8(buffer, length); + if (!ret_impl) { + fprintf(stderr, "buffer: "); + print_buffer(stderr, buffer, length); + fprintf(stderr, "\n"); + EXPECTRET(1, false && "validate_utf8 on valid string"); + } + } + free(buffer); + + return 0; +} + +DEFINE_TEST(utf8_exhaustive_1) { + for (int64_t number = 0; number < 0x100000000LL; number++) { + EXPECT( + validate_utf8_reference((const char*)&number, 4, false) + == validate_utf8((const char*)&number, 4) + ); + } + return 0; +} -- cgit v1.2.3-54-g00ecf