aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Smeding <tom@tomsmeding.com>2021-05-16 19:13:05 +0200
committerTom Smeding <tom@tomsmeding.com>2021-05-16 19:13:05 +0200
commit831af1d49c9bb7d17794d259c99f92b2513496c5 (patch)
tree1a7674bb6d4fbcecf1e6b83ff1834b238b06f2a5
parent6aebd10ba045085ed1d0ca7c0ffbf69196c3308c (diff)
server: WIP utf8 validation implementation
-rw-r--r--test/hashtable.c5
-rw-r--r--test/main.c32
-rw-r--r--test/test_framework.c20
-rw-r--r--test/test_framework.h4
-rw-r--r--test/utf8.c219
-rw-r--r--utf8.c91
-rw-r--r--utf8.h7
7 files changed, 374 insertions, 4 deletions
diff --git a/test/hashtable.c b/test/hashtable.c
index ea6217c..a147d61 100644
--- a/test/hashtable.c
+++ b/test/hashtable.c
@@ -2,12 +2,11 @@
#include <string.h>
#include <assert.h>
#include "test_framework.h"
-#include "../global.h"
#include "../hashtable.h"
-static const int NUM_RUNS = 1000;
-static const int NUM_OPERATIONS_PER_RUN = 100000;
+static const int NUM_RUNS = 100;
+static const int NUM_OPERATIONS_PER_RUN = 10000;
struct pair {
diff --git a/test/main.c b/test/main.c
index 5c0656c..8c2fffe 100644
--- a/test/main.c
+++ b/test/main.c
@@ -1,4 +1,5 @@
#include <stdio.h>
+#include <stdlib.h>
#include <stdatomic.h>
#include <time.h>
#include "test_framework.h"
@@ -13,7 +14,7 @@ atomic_flag test_framework_assertion_failed = ATOMIC_FLAG_INIT;
static void report(const char *clr, const char *label, const char *name, clock_t taken) {
- printf("\x1B[%sm[%s] %s (%lfs)\n",
+ printf("\x1B[%sm[%s] %s (%lfs)\x1B[0m\n",
clr, label, name, (double)taken / CLOCKS_PER_SEC);
}
@@ -41,10 +42,39 @@ static void report_failure(const char *name, clock_t taken) {
static int run_tests(void) {
RUN_TEST(hashtable_unit1);
RUN_TEST(hashtable);
+ RUN_TEST(utf8_unit1);
+ RUN_TEST(utf8_random);
+ RUN_TEST(utf8_random_valid);
+ RUN_TEST(utf8_exhaustive_1);
return 0;
}
+static unsigned int random_seed_from_device(void) {
+ FILE *f = fopen("/dev/urandom", "r");
+ if (!f) {
+ fprintf(stderr, "Cannot open /dev/urandom\n");
+ exit(1);
+ }
+ unsigned int seed;
+ int nread = fread(&seed, 1, sizeof seed, f);
+ if (nread < (int)sizeof seed) {
+ fprintf(stderr, "Cannot read from /dev/urandom\n");
+ exit(1);
+ }
+ fclose(f);
+ return seed;
+}
int main(void) {
+ unsigned int seed;
+ const char *seed_env_var = getenv("SEED");
+ if (seed_env_var && seed_env_var[0]) {
+ seed = strtol(seed_env_var, NULL, 10);
+ } else {
+ seed = random_seed_from_device();
+ }
+ fprintf(stderr, "seed = %u\n", seed);
+ srandom(seed);
+
return run_tests();
}
diff --git a/test/test_framework.c b/test/test_framework.c
index 8cd53ac..d566944 100644
--- a/test/test_framework.c
+++ b/test/test_framework.c
@@ -1,5 +1,6 @@
#include <stdio.h>
#include <stdatomic.h>
+#include <stdint.h>
#include "test_framework.h"
@@ -12,3 +13,22 @@ void test_report_error(
fname, lineno, type, condition);
atomic_flag_test_and_set(&test_framework_assertion_failed);
}
+
+void print_buffer(FILE *stream, const void *buffer_, size_t length) {
+ const uint8_t *buffer = (const uint8_t*)buffer_;
+ fputc('"', stream);
+
+ for (size_t i = 0; i < length; i++) {
+ const uint8_t b = buffer[i];
+ if (b == '"') fprintf(stream, "\\\"");
+ else if (b == '\\') fprintf(stream, "\\\\");
+ else if (32 <= b && b < 127) fputc(b, stream);
+ else {
+ fprintf(stream, "\\x%c%c",
+ "0123456789abcdef"[b >> 4],
+ "0123456789abcdef"[b & 0xf]);
+ }
+ }
+
+ fprintf(stream, "\"[%zu]", length);
+}
diff --git a/test/test_framework.h b/test/test_framework.h
index 327de00..3323190 100644
--- a/test/test_framework.h
+++ b/test/test_framework.h
@@ -1,5 +1,7 @@
#pragma once
+#include <stdio.h>
+
#define TEST(name) testfn__ ## name
@@ -23,3 +25,5 @@ void test_report_error(
return (ret_); \
} \
} while (0)
+
+void print_buffer(FILE *stream, const void *buffer, size_t length);
diff --git a/test/utf8.c b/test/utf8.c
new file mode 100644
index 0000000..afc7383
--- /dev/null
+++ b/test/utf8.c
@@ -0,0 +1,219 @@
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "test_framework.h"
+#include "../global.h"
+#include "../utf8.h"
+
+
+// Returns the number of bytes in the utf8 unit, or -1 on invalid input.
+// If the parse is successful, puts the bits that are part of the unit being
+// parsed in *unit.
+static int parse_utf8_prefix_byte(uint8_t b, int64_t *unit) {
+ if ((b & 0b10000000) == 0b00000000) {*unit = b & 0b01111111; return 1;}
+ if ((b & 0b11100000) == 0b11000000) {*unit = b & 0b00011111; return 2;}
+ if ((b & 0b11110000) == 0b11100000) {*unit = b & 0b00001111; return 3;}
+ if ((b & 0b11111000) == 0b11110000) {*unit = b & 0b00000111; return 4;}
+ return -1;
+}
+
+// Returns length of the parsed utf8 unit, and puts the parsed value in *unitp.
+// No range checking is done.
+static int parse_utf8_unit(const uint8_t *buf, size_t length, int64_t *unitp, bool debug) {
+ if (length == 0) {
+ if (debug) fprintf(stderr, "[utf8ref] unit at EOS\n");
+ return -1;
+ }
+
+ int64_t unit;
+ const int num_bytes = parse_utf8_prefix_byte(buf[0], &unit);
+ assert(num_bytes == -1 || (1 <= num_bytes && num_bytes <= 4));
+ if (num_bytes == -1) {
+ if (debug) fprintf(stderr, "[utf8ref] invalid prefix byte %x\n", (unsigned)buf[0]);
+ return -1;
+ }
+ assert(unit >= 0);
+ if (length < (size_t)num_bytes) {
+ if (debug) fprintf(stderr, "[utf8ref] prefix byte %x specifies length %d, but EOS\n", (unsigned)buf[0], num_bytes);
+ return -1;
+ }
+
+ for (int i = 1; i < num_bytes; i++) {
+ if ((buf[i] & 0b11000000) != 0b10000000) {
+ if (debug) fprintf(stderr, "[utf8ref] invalid continuation byte %x\n", (unsigned)buf[i]);
+ return -1;
+ }
+ unit = (unit << 6) | (buf[i] & 0b00111111);
+ }
+
+ // check for overlong encodings
+ if ((num_bytes >= 2 && unit <= 0x7F) ||
+ (num_bytes >= 3 && unit <= 0x7FF) ||
+ (num_bytes >= 4 && unit <= 0xFFFF)) {
+ if (debug) fprintf(stderr, "[utf8ref] overlong encoding with prefix byte %x\n", (unsigned)buf[0]);
+ return -1;
+ }
+
+ *unitp = unit;
+ return num_bytes;
+}
+
+static bool validate_utf8_reference(const char *buf_, size_t length, bool debug) {
+ const uint8_t *buf = (const uint8_t*)buf_;
+
+ size_t cursor = 0;
+ while (cursor < length) {
+ int64_t unit;
+ int len = parse_utf8_unit(buf + cursor, length - cursor, &unit, debug);
+ assert(len == -1 || (1 <= len && len <= 4));
+ if (len == -1) return false;
+ assert(unit >= 0);
+ // fprintf(stderr, "unit = 0x%lx\n", unit);
+
+ // Surrogate code point
+ if (0xD800 <= unit && unit <= 0xDFFF) {
+ if (debug) fprintf(stderr, "[utf8ref] surrogate code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]);
+ return false;
+ }
+ // Maximal unicode value
+ if (unit > 0x10FFFF) {
+ if (debug) fprintf(stderr, "[utf8ref] out of range code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]);
+ return false;
+ }
+
+ cursor += len;
+ }
+
+ return true;
+}
+
+// Requires that the buffer has space for at least 4 bytes.
+// Returns the number of bytes written.
+static int utf8_serialise(char *buf_, int64_t unit) {
+ uint8_t *buf = (uint8_t*)buf_;
+
+#define PLACE_CONTINUATION_BYTE(idx_) \
+ {buf[(idx_)] = 0x80 | (unit & 0x3F); unit >>= 6;}
+
+ if (unit <= 0x7F) {
+ buf[0] = unit;
+ return 1;
+ }
+ if (unit <= 0x7FF) {
+ PLACE_CONTINUATION_BYTE(1);
+ buf[0] = 0xC0 | (unit & 0x1F);
+ return 2;
+ }
+ if (unit <= 0xFFFF) {
+ PLACE_CONTINUATION_BYTE(2);
+ PLACE_CONTINUATION_BYTE(1);
+ buf[0] = 0xE0 | (unit & 0x0F);
+ return 3;
+ }
+ if (unit <= 0x10FFFF) {
+ PLACE_CONTINUATION_BYTE(3);
+ PLACE_CONTINUATION_BYTE(2);
+ PLACE_CONTINUATION_BYTE(1);
+ buf[0] = 0xF0 | (unit & 0x07);
+ return 4;
+ }
+ assert(false && "Invalid unit in utf8_serialise");
+
+#undef PLACE_CONTINUATION_BYTE
+}
+
+static void fill_random_buffer(char *buf, size_t length) {
+ size_t i = 0;
+ while (i + sizeof(long) < length) {
+ *(long*)&buf[i] = random();
+ i += sizeof(long);
+ }
+ while (i < length) buf[i++] = random();
+}
+
+DEFINE_TEST(utf8_unit1) {
+ EXPECT(validate_utf8("hello", 5));
+ EXPECT(validate_utf8_reference("hello", 5, true));
+ const char *str = "hello 🧀🇳🇱";
+ EXPECT(validate_utf8(str, strlen(str)));
+ EXPECT(validate_utf8_reference(str, strlen(str), true));
+ EXPECT(validate_utf8("\xe0\xad\xbc`j", 5));
+ EXPECT(validate_utf8_reference("\xe0\xad\xbc`j", 5, true));
+ EXPECT(validate_utf8("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12));
+ EXPECT(validate_utf8("\xc7\xaa_", 3));
+ EXPECT(validate_utf8_reference("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12, true));
+ EXPECT(!validate_utf8("\xf2\x98\xbcx", 4));
+ EXPECT(!validate_utf8_reference("\xf2\x98\xbcx", 4, false));
+ return 0;
+}
+
+DEFINE_TEST(utf8_random) {
+ const int max_length = 100;
+ const int num_tests = 10000000;
+
+ char *buffer = malloc(max_length + 1, char);
+ for (int test = 0; test < num_tests; test++) {
+ // fprintf(stderr, "== test = %d\n", test);
+ const int length = random() % max_length;
+ fill_random_buffer(buffer, length);
+ const bool ret_ref = validate_utf8_reference(buffer, length, false);
+ const bool ret_impl = validate_utf8(buffer, length);
+ if (ret_ref != ret_impl) {
+ fprintf(stderr, "buffer: ");
+ print_buffer(stderr, buffer, length);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "reference -> %d, implementation -> %d\n", ret_ref, ret_impl);
+ EXPECTRET(1, false && "validate_utf8_reference == validate_utf8");
+ }
+ }
+ free(buffer);
+
+ return 0;
+}
+
+DEFINE_TEST(utf8_random_valid) {
+ const int max_length = 100;
+ const int num_tests = 3000000;
+
+ char *buffer = malloc(max_length + 1, char);
+ for (int test = 0; test < num_tests; test++) {
+ int length = random() % max_length;
+
+ int cursor = 0;
+ while (cursor + 4 <= length) {
+ const int64_t unit = random() % 0x110000;
+ if (0xD800 <= unit && unit <= 0xDFFF) continue; // surrogate
+ cursor += utf8_serialise(buffer + cursor, unit);
+ }
+ length = cursor;
+
+ const bool ret_ref = validate_utf8_reference(buffer, length, true);
+ if (!ret_ref) {
+ fprintf(stderr, "buffer: ");
+ print_buffer(stderr, buffer, length);
+ fprintf(stderr, "\n");
+ EXPECTRET(1, false && "validate_utf8_reference on valid string");
+ }
+
+ const bool ret_impl = validate_utf8(buffer, length);
+ if (!ret_impl) {
+ fprintf(stderr, "buffer: ");
+ print_buffer(stderr, buffer, length);
+ fprintf(stderr, "\n");
+ EXPECTRET(1, false && "validate_utf8 on valid string");
+ }
+ }
+ free(buffer);
+
+ return 0;
+}
+
+DEFINE_TEST(utf8_exhaustive_1) {
+ for (int64_t number = 0; number < 0x100000000LL; number++) {
+ EXPECT(
+ validate_utf8_reference((const char*)&number, 4, false)
+ == validate_utf8((const char*)&number, 4)
+ );
+ }
+ return 0;
+}
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..0a06002
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,91 @@
+#include <stdint.h>
+#include "utf8.h"
+
+
+static bool in(uint8_t lo, uint8_t x, uint8_t hi) {
+ return lo <= x && x <= hi;
+}
+
+static const uint64_t TOP_BIT_MASK_64 = 0x8080808080808080ULL;
+
+
+// 1-byte: U+000000 -- U+00007F: 00 -- 7F
+// 2-byte: U+000080 -- U+0007FF: C2 80 -- DF BF
+// 3-byte: U+000800 -- U+00FFFF: E0 A0 80 -- EF BF BF
+// 4-byte: U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
+//
+// Surrogate codepoints (U+D800 -- U+DFFF) are invalid, so we have to split the
+// 3-byte range.
+//
+// Resulting ranges:
+// 1-byte : U+000000 -- U+00007F: 00 -- 7F
+// 2-byte : U+000080 -- U+0007FF: C2 80 -- DF BF
+// 3-byte A: U+000800 -- U+00D7FF: E0 A0 80 -- ED 9F BF
+// 3-byte B: U+00E000 -- U+00FFFF: EE 80 80 -- EF BF BF
+// 4-byte : U+010000 -- U+10FFFF: F0 90 80 80 -- F4 8F BF BF
+bool validate_utf8(const char *buf_, size_t length) {
+ const uint8_t *buf = (const uint8_t*)buf_;
+
+ size_t i = 0;
+ while (i < length) {
+ // skip 1-byte codepoints, i.e. ASCII values
+ const uint8_t a = buf[i];
+ if (a <= 0x7F) {
+ i++;
+
+ const size_t aligned_index = ((uintptr_t)(buf + i) | 7) + 1 - (uintptr_t)buf;
+
+ while (i != aligned_index) {
+ if (i == length) return true;
+ if (buf[i] >= 0x7F) goto non_ascii_found;
+ i++;
+ }
+
+ while (i + 8 <= length &&
+ (*(const uint64_t*)&buf[i] & TOP_BIT_MASK_64) == 0) {
+ i += 8;
+ }
+
+ while (true) {
+ if (i == length) return true;
+ if (buf[i] >= 0x7F) break;
+ i++;
+ }
+
+ non_ascii_found:
+ continue;
+ }
+
+ // 2-byte
+ if (i + 1 >= length) return false;
+ const uint8_t b = buf[i+1];
+ if (in(0xC2, a, 0xDF) && in(0x80, b, 0xBF)) {
+ i += 2; continue;
+ }
+
+ // 3-byte
+ if (i + 2 >= length) return false;
+ const uint8_t c = buf[i+2];
+ if (!in(0x80, c, 0xBF)) return false;
+ if ((a == 0xE0 && in(0xA0, b, 0xBF)) ||
+ (in(0xE1, a, 0xEC) && in(0x80, b, 0xBF)) ||
+ (a == 0xED && in(0x80, b, 0x9F)) ||
+ (in(0xEE, a, 0xEF) && in(0x80, b, 0xBF))) {
+ i += 3; continue;
+ }
+
+ // 4-byte
+ if (i + 3 >= length) return false;
+ const uint8_t d = buf[i+3];
+ if (!in(0x80, d, 0xBF)) return false;
+ if ((a == 0xF0 && in(0x90, b, 0xBF)) ||
+ (in(0xF1, a, 0xF3) && in(0x80, b, 0xBF)) ||
+ (a == 0xF4 && in(0x80, b, 0x8F))) {
+ i += 4; continue;
+ }
+
+ return false;
+ }
+
+ return true;
+}
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..fea5e50
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+
+bool validate_utf8(const char *buf, size_t length);