From 831af1d49c9bb7d17794d259c99f92b2513496c5 Mon Sep 17 00:00:00 2001
From: Tom Smeding <tom@tomsmeding.com>
Date: Sun, 16 May 2021 19:13:05 +0200
Subject: server: WIP utf8 validation implementation

---
 test/hashtable.c      |   5 +-
 test/main.c           |  32 +++++++-
 test/test_framework.c |  20 +++++
 test/test_framework.h |   4 +
 test/utf8.c           | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++
 utf8.c                |  91 +++++++++++++++++++++
 utf8.h                |   7 ++
 7 files changed, 374 insertions(+), 4 deletions(-)
 create mode 100644 test/utf8.c
 create mode 100644 utf8.c
 create mode 100644 utf8.h

diff --git a/test/hashtable.c b/test/hashtable.c
index ea6217c..a147d61 100644
--- a/test/hashtable.c
+++ b/test/hashtable.c
@@ -2,12 +2,11 @@
 #include <string.h>
 #include <assert.h>
 #include "test_framework.h"
-#include "../global.h"
 #include "../hashtable.h"
 
 
-static const int NUM_RUNS = 1000;
-static const int NUM_OPERATIONS_PER_RUN = 100000;
+static const int NUM_RUNS = 100;
+static const int NUM_OPERATIONS_PER_RUN = 10000;
 
 
 struct pair {
diff --git a/test/main.c b/test/main.c
index 5c0656c..8c2fffe 100644
--- a/test/main.c
+++ b/test/main.c
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include <stdlib.h>
 #include <stdatomic.h>
 #include <time.h>
 #include "test_framework.h"
@@ -13,7 +14,7 @@ atomic_flag test_framework_assertion_failed = ATOMIC_FLAG_INIT;
 
 
 static void report(const char *clr, const char *label, const char *name, clock_t taken) {
-	printf("\x1B[%sm[%s] %s (%lfs)\n",
+	printf("\x1B[%sm[%s] %s (%lfs)\x1B[0m\n",
 			clr, label, name, (double)taken / CLOCKS_PER_SEC);
 }
 
@@ -41,10 +42,39 @@ static void report_failure(const char *name, clock_t taken) {
 static int run_tests(void) {
 	RUN_TEST(hashtable_unit1);
 	RUN_TEST(hashtable);
+	RUN_TEST(utf8_unit1);
+	RUN_TEST(utf8_random);
+	RUN_TEST(utf8_random_valid);
+	RUN_TEST(utf8_exhaustive_1);
 	return 0;
 }
 
+static unsigned int random_seed_from_device(void) {
+	FILE *f = fopen("/dev/urandom", "r");
+	if (!f) {
+		fprintf(stderr, "Cannot open /dev/urandom\n");
+		exit(1);
+	}
+	unsigned int seed;
+	int nread = fread(&seed, 1, sizeof seed, f);
+	if (nread < (int)sizeof seed) {
+		fprintf(stderr, "Cannot read from /dev/urandom\n");
+		exit(1);
+	}
+	fclose(f);
+	return seed;
+}
 
 int main(void) {
+	unsigned int seed;
+	const char *seed_env_var = getenv("SEED");
+	if (seed_env_var && seed_env_var[0]) {
+		seed = strtol(seed_env_var, NULL, 10);
+	} else {
+		seed = random_seed_from_device();
+	}
+	fprintf(stderr, "seed = %u\n", seed);
+	srandom(seed);
+
 	return run_tests();
 }
diff --git a/test/test_framework.c b/test/test_framework.c
index 8cd53ac..d566944 100644
--- a/test/test_framework.c
+++ b/test/test_framework.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include <stdatomic.h>
+#include <stdint.h>
 #include "test_framework.h"
 
 
@@ -12,3 +13,22 @@ void test_report_error(
 			fname, lineno, type, condition);
 	atomic_flag_test_and_set(&test_framework_assertion_failed);
 }
+
+void print_buffer(FILE *stream, const void *buffer_, size_t length) {
+	const uint8_t *buffer = (const uint8_t*)buffer_;
+	fputc('"', stream);
+
+	for (size_t i = 0; i < length; i++) {
+		const uint8_t b = buffer[i];
+		if (b == '"') fprintf(stream, "\\\"");
+		else if (b == '\\') fprintf(stream, "\\\\");
+		else if (32 <= b && b < 127) fputc(b, stream);
+		else {
+			fprintf(stream, "\\x%c%c",
+					"0123456789abcdef"[b >> 4],
+					"0123456789abcdef"[b & 0xf]);
+		}
+	}
+
+	fprintf(stream, "\"[%zu]", length);
+}
diff --git a/test/test_framework.h b/test/test_framework.h
index 327de00..3323190 100644
--- a/test/test_framework.h
+++ b/test/test_framework.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <stdio.h>
+
 
 #define TEST(name) testfn__ ## name
 
@@ -23,3 +25,5 @@ void test_report_error(
 			return (ret_); \
 		} \
 	} while (0)
+
+void print_buffer(FILE *stream, const void *buffer, size_t length);
diff --git a/test/utf8.c b/test/utf8.c
new file mode 100644
index 0000000..afc7383
--- /dev/null
+++ b/test/utf8.c
@@ -0,0 +1,219 @@
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "test_framework.h"
+#include "../global.h"
+#include "../utf8.h"
+
+
+// Returns the number of bytes in the utf8 unit, or -1 on invalid input.
+// If the parse is successful, puts the bits that are part of the unit being
+// parsed in *unit.
+static int parse_utf8_prefix_byte(uint8_t b, int64_t *unit) {
+	if ((b & 0b10000000) == 0b00000000) {*unit = b & 0b01111111; return 1;}
+	if ((b & 0b11100000) == 0b11000000) {*unit = b & 0b00011111; return 2;}
+	if ((b & 0b11110000) == 0b11100000) {*unit = b & 0b00001111; return 3;}
+	if ((b & 0b11111000) == 0b11110000) {*unit = b & 0b00000111; return 4;}
+	return -1;
+}
+
+// Returns length of the parsed utf8 unit, and puts the parsed value in *unitp.
+// No range checking is done.
+static int parse_utf8_unit(const uint8_t *buf, size_t length, int64_t *unitp, bool debug) {
+	if (length == 0) {
+		if (debug) fprintf(stderr, "[utf8ref] unit at EOS\n");
+		return -1;
+	}
+
+	int64_t unit;
+	const int num_bytes = parse_utf8_prefix_byte(buf[0], &unit);
+	assert(num_bytes == -1 || (1 <= num_bytes && num_bytes <= 4));
+	if (num_bytes == -1) {
+		if (debug) fprintf(stderr, "[utf8ref] invalid prefix byte %x\n", (unsigned)buf[0]);
+		return -1;
+	}
+	assert(unit >= 0);
+	if (length < (size_t)num_bytes) {
+		if (debug) fprintf(stderr, "[utf8ref] prefix byte %x specifies length %d, but EOS\n", (unsigned)buf[0], num_bytes);
+		return -1;
+	}
+
+	for (int i = 1; i < num_bytes; i++) {
+		if ((buf[i] & 0b11000000) != 0b10000000) {
+		if (debug) fprintf(stderr, "[utf8ref] invalid continuation byte %x\n", (unsigned)buf[i]);
+			return -1;
+		}
+		unit = (unit << 6) | (buf[i] & 0b00111111);
+	}
+
+	// check for overlong encodings
+	if ((num_bytes >= 2 && unit <= 0x7F) ||
+			(num_bytes >= 3 && unit <= 0x7FF) ||
+			(num_bytes >= 4 && unit <= 0xFFFF)) {
+		if (debug) fprintf(stderr, "[utf8ref] overlong encoding with prefix byte %x\n", (unsigned)buf[0]);
+		return -1;
+	}
+
+	*unitp = unit;
+	return num_bytes;
+}
+
+static bool validate_utf8_reference(const char *buf_, size_t length, bool debug) {
+	const uint8_t *buf = (const uint8_t*)buf_;
+
+	size_t cursor = 0;
+	while (cursor < length) {
+		int64_t unit;
+		int len = parse_utf8_unit(buf + cursor, length - cursor, &unit, debug);
+		assert(len == -1 || (1 <= len && len <= 4));
+		if (len == -1) return false;
+		assert(unit >= 0);
+		// fprintf(stderr, "unit = 0x%lx\n", unit);
+
+		// Surrogate code point
+		if (0xD800 <= unit && unit <= 0xDFFF) {
+			if (debug) fprintf(stderr, "[utf8ref] surrogate code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]);
+			return false;
+		}
+		// Maximal unicode value
+		if (unit > 0x10FFFF) {
+			if (debug) fprintf(stderr, "[utf8ref] out of range code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]);
+			return false;
+		}
+
+		cursor += len;
+	}
+
+	return true;
+}
+
+// Requires that the buffer has space for at least 4 bytes.
+// Returns the number of bytes written.
+static int utf8_serialise(char *buf_, int64_t unit) {
+	uint8_t *buf = (uint8_t*)buf_;
+
+#define PLACE_CONTINUATION_BYTE(idx_) \
+		{buf[(idx_)] = 0x80 | (unit & 0x3F); unit >>= 6;}
+
+	if (unit <= 0x7F) {
+		buf[0] = unit;
+		return 1;
+	}
+	if (unit <= 0x7FF) {
+		PLACE_CONTINUATION_BYTE(1);
+		buf[0] = 0xC0 | (unit & 0x1F);
+		return 2;
+	}
+	if (unit <= 0xFFFF) {
+		PLACE_CONTINUATION_BYTE(2);
+		PLACE_CONTINUATION_BYTE(1);
+		buf[0] = 0xE0 | (unit & 0x0F);
+		return 3;
+	}
+	if (unit <= 0x10FFFF) {
+		PLACE_CONTINUATION_BYTE(3);
+		PLACE_CONTINUATION_BYTE(2);
+		PLACE_CONTINUATION_BYTE(1);
+		buf[0] = 0xF0 | (unit & 0x07);
+		return 4;
+	}
+	assert(false && "Invalid unit in utf8_serialise");
+
+#undef PLACE_CONTINUATION_BYTE
+}
+
+static void fill_random_buffer(char *buf, size_t length) {
+	size_t i = 0;
+	while (i + sizeof(long) < length) {
+		*(long*)&buf[i] = random();
+		i += sizeof(long);
+	}
+	while (i < length) buf[i++] = random();
+}
+
+DEFINE_TEST(utf8_unit1) {
+	EXPECT(validate_utf8("hello", 5));
+	EXPECT(validate_utf8_reference("hello", 5, true));
+	const char *str = "hello 🧀🇳🇱";
+	EXPECT(validate_utf8(str, strlen(str)));
+	EXPECT(validate_utf8_reference(str, strlen(str), true));
+	EXPECT(validate_utf8("\xe0\xad\xbc`j", 5));
+	EXPECT(validate_utf8_reference("\xe0\xad\xbc`j", 5, true));
+	EXPECT(validate_utf8("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12));
+	EXPECT(validate_utf8("\xc7\xaa_", 3));
+	EXPECT(validate_utf8_reference("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12, true));
+	EXPECT(!validate_utf8("\xf2\x98\xbcx", 4));
+	EXPECT(!validate_utf8_reference("\xf2\x98\xbcx", 4, false));
+	return 0;
+}
+
+DEFINE_TEST(utf8_random) {
+	const int max_length = 100;
+	const int num_tests = 10000000;
+
+	char *buffer = malloc(max_length + 1, char);
+	for (int test = 0; test < num_tests; test++) {
+		// fprintf(stderr, "== test = %d\n", test);
+		const int length = random() % max_length;
+		fill_random_buffer(buffer, length);
+		const bool ret_ref = validate_utf8_reference(buffer, length, false);
+		const bool ret_impl = validate_utf8(buffer, length);
+		if (ret_ref != ret_impl) {
+			fprintf(stderr, "buffer: ");
+			print_buffer(stderr, buffer, length);
+			fprintf(stderr, "\n");
+			fprintf(stderr, "reference -> %d, implementation -> %d\n", ret_ref, ret_impl);
+			EXPECTRET(1, false && "validate_utf8_reference == validate_utf8");
+		}
+	}
+	free(buffer);
+
+	return 0;
+}
+
+DEFINE_TEST(utf8_random_valid) {
+	const int max_length = 100;
+	const int num_tests = 3000000;
+
+	char *buffer = malloc(max_length + 1, char);
+	for (int test = 0; test < num_tests; test++) {
+		int length = random() % max_length;
+
+		int cursor = 0;
+		while (cursor + 4 <= length) {
+			const int64_t unit = random() % 0x110000;
+			if (0xD800 <= unit && unit <= 0xDFFF) continue;  // surrogate
+			cursor += utf8_serialise(buffer + cursor, unit);
+		}
+		length = cursor;
+
+		const bool ret_ref = validate_utf8_reference(buffer, length, true);
+		if (!ret_ref) {
+			fprintf(stderr, "buffer: ");
+			print_buffer(stderr, buffer, length);
+			fprintf(stderr, "\n");
+			EXPECTRET(1, false && "validate_utf8_reference on valid string");
+		}
+
+		const bool ret_impl = validate_utf8(buffer, length);
+		if (!ret_impl) {
+			fprintf(stderr, "buffer: ");
+			print_buffer(stderr, buffer, length);
+			fprintf(stderr, "\n");
+			EXPECTRET(1, false && "validate_utf8 on valid string");
+		}
+	}
+	free(buffer);
+
+	return 0;
+}
+
+DEFINE_TEST(utf8_exhaustive_1) {
+	for (int64_t number = 0; number < 0x100000000LL; number++) {
+		EXPECT(
+			validate_utf8_reference((const char*)&number, 4, false)
+			== validate_utf8((const char*)&number, 4)
+		);
+	}
+	return 0;
+}
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..0a06002
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,91 @@
+#include <stdint.h>
+#include "utf8.h"
+
+
+static bool in(uint8_t lo, uint8_t x, uint8_t hi) {
+	return lo <= x && x <= hi;
+}
+
+static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL;
+
+
+// 1-byte: U+000000 -- U+00007F: 00 -- 7F
+// 2-byte: U+000080 -- U+0007FF: C2 80 -- DF BF
+// 3-byte: U+000800 -- U+00FFFF: E0 A0 80 -- EF BF BF
+// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
+//
+// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the
+// 3-byte range.
+//
+// Resulting ranges:
+// 1-byte  : U+000000 -- U+00007F: 00 -- 7F
+// 2-byte  : U+000080 -- U+0007FF: C2 80 -- DF BF
+// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80 -- ED 9F BF
+// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80 -- EF BF BF
+// 4-byte  : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
+bool validate_utf8(const char *buf_, size_t length) {
+	const uint8_t *buf = (const uint8_t*)buf_;
+
+	size_t i = 0;
+	while (i < length) {
+		// skip 1-byte codepoints, i.e. ASCII values
+		const uint8_t a = buf[i];
+		if (a <= 0x7F) {
+			i++;
+
+			const size_t aligned_index = ((uintptr_t)(buf + i) | 7) + 1 - (uintptr_t)buf;
+
+			while (i != aligned_index) {
+				if (i == length) return true;
+				if (buf[i] >= 0x7F) goto non_ascii_found;
+				i++;
+			}
+
+			while (i + 8 <= length &&
+					(*(const uint64_t*)&buf[i] & TOP_BIT_MASK_64) == 0) {
+				i += 8;
+			}
+
+			while (true) {
+				if (i == length) return true;
+				if (buf[i] >= 0x7F) break;
+				i++;
+			}
+
+		non_ascii_found:
+			continue;
+		}
+
+		// 2-byte
+		if (i + 1 >= length) return false;
+		const uint8_t b = buf[i+1];
+		if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) {
+			i += 2; continue;
+		}
+
+		// 3-byte
+		if (i + 2 >= length) return false;
+		const uint8_t c = buf[i+2];
+		if (!in(0x80, c, 0xBF)) return false;
+		if ((a == 0xE0 && in(0xA0, b, 0xBF)) ||
+			(in(0xE1, a, 0xEC) && in(0x80, b, 0xBF)) ||
+			(a == 0xED && in(0x80, b, 0x9F)) ||
+			(in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) {
+			i += 3; continue;
+		}
+
+		// 4-byte
+		if (i + 3 >= length) return false;
+		const uint8_t d = buf[i+3];
+		if (!in(0x80, d, 0xBF)) return false;
+		if ((a == 0xF0 && in(0x90, b, 0xBF)) ||
+			(in(0xF1, a, 0xF3) && in(0x80, b, 0xBF)) ||
+			(a == 0xF4 && in(0x80, b, 0x8F))) {
+			i += 4; continue;
+		}
+
+		return false;
+	}
+
+	return true;
+}
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..fea5e50
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+
+bool validate_utf8(const char *buf, size_t length);
-- 
cgit v1.3.1