diff options
| author | Tom Smeding <tom@tomsmeding.com> | 2021-05-16 19:13:05 +0200 | 
|---|---|---|
| committer | Tom Smeding <tom@tomsmeding.com> | 2021-05-16 19:13:05 +0200 | 
| commit | 831af1d49c9bb7d17794d259c99f92b2513496c5 (patch) | |
| tree | 1a7674bb6d4fbcecf1e6b83ff1834b238b06f2a5 | |
| parent | 6aebd10ba045085ed1d0ca7c0ffbf69196c3308c (diff) | |
server: WIP utf8 validation implementation
| -rw-r--r-- | test/hashtable.c | 5 | ||||
| -rw-r--r-- | test/main.c | 32 | ||||
| -rw-r--r-- | test/test_framework.c | 20 | ||||
| -rw-r--r-- | test/test_framework.h | 4 | ||||
| -rw-r--r-- | test/utf8.c | 219 | ||||
| -rw-r--r-- | utf8.c | 91 | ||||
| -rw-r--r-- | utf8.h | 7 | 
7 files changed, 374 insertions, 4 deletions
diff --git a/test/hashtable.c b/test/hashtable.c index ea6217c..a147d61 100644 --- a/test/hashtable.c +++ b/test/hashtable.c @@ -2,12 +2,11 @@  #include <string.h>  #include <assert.h>  #include "test_framework.h" -#include "../global.h"  #include "../hashtable.h" -static const int NUM_RUNS = 1000; -static const int NUM_OPERATIONS_PER_RUN = 100000; +static const int NUM_RUNS = 100; +static const int NUM_OPERATIONS_PER_RUN = 10000;  struct pair { diff --git a/test/main.c b/test/main.c index 5c0656c..8c2fffe 100644 --- a/test/main.c +++ b/test/main.c @@ -1,4 +1,5 @@  #include <stdio.h> +#include <stdlib.h>  #include <stdatomic.h>  #include <time.h>  #include "test_framework.h" @@ -13,7 +14,7 @@ atomic_flag test_framework_assertion_failed = ATOMIC_FLAG_INIT;  static void report(const char *clr, const char *label, const char *name, clock_t taken) { -	printf("\x1B[%sm[%s] %s (%lfs)\n", +	printf("\x1B[%sm[%s] %s (%lfs)\x1B[0m\n",  			clr, label, name, (double)taken / CLOCKS_PER_SEC);  } @@ -41,10 +42,39 @@ static void report_failure(const char *name, clock_t taken) {  static int run_tests(void) {  	RUN_TEST(hashtable_unit1);  	RUN_TEST(hashtable); +	RUN_TEST(utf8_unit1); +	RUN_TEST(utf8_random); +	RUN_TEST(utf8_random_valid); +	RUN_TEST(utf8_exhaustive_1);  	return 0;  } +static unsigned int random_seed_from_device(void) { +	FILE *f = fopen("/dev/urandom", "r"); +	if (!f) { +		fprintf(stderr, "Cannot open /dev/urandom\n"); +		exit(1); +	} +	unsigned int seed; +	int nread = fread(&seed, 1, sizeof seed, f); +	if (nread < (int)sizeof seed) { +		fprintf(stderr, "Cannot read from /dev/urandom\n"); +		exit(1); +	} +	fclose(f); +	return seed; +}  int main(void) { +	unsigned int seed; +	const char *seed_env_var = getenv("SEED"); +	if (seed_env_var && seed_env_var[0]) { +		seed = strtol(seed_env_var, NULL, 10); +	} else { +		seed = random_seed_from_device(); +	} +	fprintf(stderr, "seed = %u\n", seed); +	srandom(seed); +  	return run_tests();  } diff --git a/test/test_framework.c b/test/test_framework.c index 8cd53ac..d566944 100644 --- a/test/test_framework.c +++ b/test/test_framework.c @@ -1,5 +1,6 @@  #include <stdio.h>  #include <stdatomic.h> +#include <stdint.h>  #include "test_framework.h" @@ -12,3 +13,22 @@ void test_report_error(  			fname, lineno, type, condition);  	atomic_flag_test_and_set(&test_framework_assertion_failed);  } + +void print_buffer(FILE *stream, const void *buffer_, size_t length) { +	const uint8_t *buffer = (const uint8_t*)buffer_; +	fputc('"', stream); + +	for (size_t i = 0; i < length; i++) { +		const uint8_t b = buffer[i]; +		if (b == '"') fprintf(stream, "\\\""); +		else if (b == '\\') fprintf(stream, "\\\\"); +		else if (32 <= b && b < 127) fputc(b, stream); +		else { +			fprintf(stream, "\\x%c%c", +					"0123456789abcdef"[b >> 4], +					"0123456789abcdef"[b & 0xf]); +		} +	} + +	fprintf(stream, "\"[%zu]", length); +} diff --git a/test/test_framework.h b/test/test_framework.h index 327de00..3323190 100644 --- a/test/test_framework.h +++ b/test/test_framework.h @@ -1,5 +1,7 @@  #pragma once +#include <stdio.h> +  #define TEST(name) testfn__ ## name @@ -23,3 +25,5 @@ void test_report_error(  			return (ret_); \  		} \  	} while (0) + +void print_buffer(FILE *stream, const void *buffer, size_t length); diff --git a/test/utf8.c b/test/utf8.c new file mode 100644 index 0000000..afc7383 --- /dev/null +++ b/test/utf8.c @@ -0,0 +1,219 @@ +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "test_framework.h" +#include "../global.h" +#include "../utf8.h" + + +// Returns the number of bytes in the utf8 unit, or -1 on invalid input. +// If the parse is successful, puts the bits that are part of the unit being +// parsed in *unit. +static int parse_utf8_prefix_byte(uint8_t b, int64_t *unit) { +	if ((b & 0b10000000) == 0b00000000) {*unit = b & 0b01111111; return 1;} +	if ((b & 0b11100000) == 0b11000000) {*unit = b & 0b00011111; return 2;} +	if ((b & 0b11110000) == 0b11100000) {*unit = b & 0b00001111; return 3;} +	if ((b & 0b11111000) == 0b11110000) {*unit = b & 0b00000111; return 4;} +	return -1; +} + +// Returns length of the parsed utf8 unit, and puts the parsed value in *unitp. +// No range checking is done. +static int parse_utf8_unit(const uint8_t *buf, size_t length, int64_t *unitp, bool debug) { +	if (length == 0) { +		if (debug) fprintf(stderr, "[utf8ref] unit at EOS\n"); +		return -1; +	} + +	int64_t unit; +	const int num_bytes = parse_utf8_prefix_byte(buf[0], &unit); +	assert(num_bytes == -1 || (1 <= num_bytes && num_bytes <= 4)); +	if (num_bytes == -1) { +		if (debug) fprintf(stderr, "[utf8ref] invalid prefix byte %x\n", (unsigned)buf[0]); +		return -1; +	} +	assert(unit >= 0); +	if (length < (size_t)num_bytes) { +		if (debug) fprintf(stderr, "[utf8ref] prefix byte %x specifies length %d, but EOS\n", (unsigned)buf[0], num_bytes); +		return -1; +	} + +	for (int i = 1; i < num_bytes; i++) { +		if ((buf[i] & 0b11000000) != 0b10000000) { +		if (debug) fprintf(stderr, "[utf8ref] invalid continuation byte %x\n", (unsigned)buf[i]); +			return -1; +		} +		unit = (unit << 6) | (buf[i] & 0b00111111); +	} + +	// check for overlong encodings +	if ((num_bytes >= 2 && unit <= 0x7F) || +			(num_bytes >= 3 && unit <= 0x7FF) || +			(num_bytes >= 4 && unit <= 0xFFFF)) { +		if (debug) fprintf(stderr, "[utf8ref] overlong encoding with prefix byte %x\n", (unsigned)buf[0]); +		return -1; +	} + +	*unitp = unit; +	return num_bytes; +} + +static bool validate_utf8_reference(const char *buf_, size_t length, bool debug) { +	const uint8_t *buf = (const uint8_t*)buf_; + +	size_t cursor = 0; +	while (cursor < length) { +		int64_t unit; +		int len = parse_utf8_unit(buf + cursor, length - cursor, &unit, debug); +		assert(len == -1 || (1 <= len && len <= 4)); +		if (len == -1) return false; +		assert(unit >= 0); +		// fprintf(stderr, "unit = 0x%lx\n", unit); + +		// Surrogate code point +		if (0xD800 <= unit && unit <= 0xDFFF) { +			if (debug) fprintf(stderr, "[utf8ref] surrogate code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]); +			return false; +		} +		// Maximal unicode value +		if (unit > 0x10FFFF) { +			if (debug) fprintf(stderr, "[utf8ref] out of range code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]); +			return false; +		} + +		cursor += len; +	} + +	return true; +} + +// Requires that the buffer has space for at least 4 bytes. +// Returns the number of bytes written. +static int utf8_serialise(char *buf_, int64_t unit) { +	uint8_t *buf = (uint8_t*)buf_; + +#define PLACE_CONTINUATION_BYTE(idx_) \ +		{buf[(idx_)] = 0x80 | (unit & 0x3F); unit >>= 6;} + +	if (unit <= 0x7F) { +		buf[0] = unit; +		return 1; +	} +	if (unit <= 0x7FF) { +		PLACE_CONTINUATION_BYTE(1); +		buf[0] = 0xC0 | (unit & 0x1F); +		return 2; +	} +	if (unit <= 0xFFFF) { +		PLACE_CONTINUATION_BYTE(2); +		PLACE_CONTINUATION_BYTE(1); +		buf[0] = 0xE0 | (unit & 0x0F); +		return 3; +	} +	if (unit <= 0x10FFFF) { +		PLACE_CONTINUATION_BYTE(3); +		PLACE_CONTINUATION_BYTE(2); +		PLACE_CONTINUATION_BYTE(1); +		buf[0] = 0xF0 | (unit & 0x07); +		return 4; +	} +	assert(false && "Invalid unit in utf8_serialise"); + +#undef PLACE_CONTINUATION_BYTE +} + +static void fill_random_buffer(char *buf, size_t length) { +	size_t i = 0; +	while (i + sizeof(long) < length) { +		*(long*)&buf[i] = random(); +		i += sizeof(long); +	} +	while (i < length) buf[i++] = random(); +} + +DEFINE_TEST(utf8_unit1) { +	EXPECT(validate_utf8("hello", 5)); +	EXPECT(validate_utf8_reference("hello", 5, true)); +	const char *str = "hello 🧀🇳🇱"; +	EXPECT(validate_utf8(str, strlen(str))); +	EXPECT(validate_utf8_reference(str, strlen(str), true)); +	EXPECT(validate_utf8("\xe0\xad\xbc`j", 5)); +	EXPECT(validate_utf8_reference("\xe0\xad\xbc`j", 5, true)); +	EXPECT(validate_utf8("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12)); +	EXPECT(validate_utf8("\xc7\xaa_", 3)); +	EXPECT(validate_utf8_reference("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12, true)); +	EXPECT(!validate_utf8("\xf2\x98\xbcx", 4)); +	EXPECT(!validate_utf8_reference("\xf2\x98\xbcx", 4, false)); +	return 0; +} + +DEFINE_TEST(utf8_random) { +	const int max_length = 100; +	const int num_tests = 10000000; + +	char *buffer = malloc(max_length + 1, char); +	for (int test = 0; test < num_tests; test++) { +		// fprintf(stderr, "== test = %d\n", test); +		const int length = random() % max_length; +		fill_random_buffer(buffer, length); +		const bool ret_ref = validate_utf8_reference(buffer, length, false); +		const bool ret_impl = validate_utf8(buffer, length); +		if (ret_ref != ret_impl) { +			fprintf(stderr, "buffer: "); +			print_buffer(stderr, buffer, length); +			fprintf(stderr, "\n"); +			fprintf(stderr, "reference -> %d, implementation -> %d\n", ret_ref, ret_impl); +			EXPECTRET(1, false && "validate_utf8_reference == validate_utf8"); +		} +	} +	free(buffer); + +	return 0; +} + +DEFINE_TEST(utf8_random_valid) { +	const int max_length = 100; +	const int num_tests = 3000000; + +	char *buffer = malloc(max_length + 1, char); +	for (int test = 0; test < num_tests; test++) { +		int length = random() % max_length; + +		int cursor = 0; +		while (cursor + 4 <= length) { +			const int64_t unit = random() % 0x110000; +			if (0xD800 <= unit && unit <= 0xDFFF) continue;  // surrogate +			cursor += utf8_serialise(buffer + cursor, unit); +		} +		length = cursor; + +		const bool ret_ref = validate_utf8_reference(buffer, length, true); +		if (!ret_ref) { +			fprintf(stderr, "buffer: "); +			print_buffer(stderr, buffer, length); +			fprintf(stderr, "\n"); +			EXPECTRET(1, false && "validate_utf8_reference on valid string"); +		} + +		const bool ret_impl = validate_utf8(buffer, length); +		if (!ret_impl) { +			fprintf(stderr, "buffer: "); +			print_buffer(stderr, buffer, length); +			fprintf(stderr, "\n"); +			EXPECTRET(1, false && "validate_utf8 on valid string"); +		} +	} +	free(buffer); + +	return 0; +} + +DEFINE_TEST(utf8_exhaustive_1) { +	for (int64_t number = 0; number < 0x100000000LL; number++) { +		EXPECT( +			validate_utf8_reference((const char*)&number, 4, false) +			== validate_utf8((const char*)&number, 4) +		); +	} +	return 0; +} @@ -0,0 +1,91 @@ +#include <stdint.h> +#include "utf8.h" + + +static bool in(uint8_t lo, uint8_t x, uint8_t hi) { +	return lo <= x && x <= hi; +} + +static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL; + + +// 1-byte: U+000000 -- U+00007F: 00 -- 7F +// 2-byte: U+000080 -- U+0007FF: C2 80 -- DF BF +// 3-byte: U+000800 -- U+00FFFF: E0 A0 80 -- EF BF BF +// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF +// +// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the +// 3-byte range. +// +// Resulting ranges: +// 1-byte  : U+000000 -- U+00007F: 00 -- 7F +// 2-byte  : U+000080 -- U+0007FF: C2 80 -- DF BF +// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80 -- ED 9F BF +// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80 -- EF BF BF +// 4-byte  : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF +bool validate_utf8(const char *buf_, size_t length) { +	const uint8_t *buf = (const uint8_t*)buf_; + +	size_t i = 0; +	while (i < length) { +		// skip 1-byte codepoints, i.e. ASCII values +		const uint8_t a = buf[i]; +		if (a <= 0x7F) { +			i++; + +			const size_t aligned_index = ((uintptr_t)(buf + i) | 7) + 1 - (uintptr_t)buf; + +			while (i != aligned_index) { +				if (i == length) return true; +				if (buf[i] >= 0x7F) goto non_ascii_found; +				i++; +			} + +			while (i + 8 <= length && +					(*(const uint64_t*)&buf[i] & TOP_BIT_MASK_64) == 0) { +				i += 8; +			} + +			while (true) { +				if (i == length) return true; +				if (buf[i] >= 0x7F) break; +				i++; +			} + +		non_ascii_found: +			continue; +		} + +		// 2-byte +		if (i + 1 >= length) return false; +		const uint8_t b = buf[i+1]; +		if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) { +			i += 2; continue; +		} + +		// 3-byte +		if (i + 2 >= length) return false; +		const uint8_t c = buf[i+2]; +		if (!in(0x80, c, 0xBF)) return false; +		if ((a == 0xE0 && in(0xA0, b, 0xBF)) || +			(in(0xE1, a, 0xEC) && in(0x80, b, 0xBF)) || +			(a == 0xED && in(0x80, b, 0x9F)) || +			(in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) { +			i += 3; continue; +		} + +		// 4-byte +		if (i + 3 >= length) return false; +		const uint8_t d = buf[i+3]; +		if (!in(0x80, d, 0xBF)) return false; +		if ((a == 0xF0 && in(0x90, b, 0xBF)) || +			(in(0xF1, a, 0xF3) && in(0x80, b, 0xBF)) || +			(a == 0xF4 && in(0x80, b, 0x8F))) { +			i += 4; continue; +		} + +		return false; +	} + +	return true; +} @@ -0,0 +1,7 @@ +#pragma once + +#include <stddef.h> +#include <stdbool.h> + + +bool validate_utf8(const char *buf, size_t length);  | 
