aboutsummaryrefslogtreecommitdiff
path: root/test/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'test/utf8.c')
-rw-r--r--test/utf8.c219
1 files changed, 219 insertions, 0 deletions
diff --git a/test/utf8.c b/test/utf8.c
new file mode 100644
index 0000000..afc7383
--- /dev/null
+++ b/test/utf8.c
@@ -0,0 +1,219 @@
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include "test_framework.h"
#include "../global.h"
#include "../utf8.h"
+
+
// Classifies the first byte of a UTF-8 sequence.
// On success, returns the total length in bytes (1..4) of the sequence this
// byte introduces and stores the payload bits carried by the byte itself in
// *unit. Returns -1, leaving *unit untouched, when the byte cannot start a
// sequence (a continuation byte, or a prefix announcing more than 4 bytes).
static int parse_utf8_prefix_byte(uint8_t b, int64_t *unit) {
    // Entry [len - 1] describes a sequence of `len` bytes: which low bits of
    // the prefix byte are payload, and what the remaining (tag) bits must be.
    static const uint8_t payload_mask[4] = {0x7F, 0x1F, 0x0F, 0x07};
    static const uint8_t tag_bits[4]     = {0x00, 0xC0, 0xE0, 0xF0};

    for (int len = 1; len <= 4; len++) {
        const uint8_t payload = payload_mask[len - 1];
        const uint8_t tag_mask = (uint8_t)~payload;
        if ((b & tag_mask) == tag_bits[len - 1]) {
            *unit = b & payload;
            return len;
        }
    }
    return -1;
}
+
// Decodes one UTF-8 sequence from the start of buf, of which `length` bytes
// are readable.
// Returns the number of bytes the sequence occupies (1..4) and stores the
// decoded value in *unitp. Returns -1 without writing *unitp when the buffer
// is empty, the first byte is not a valid prefix, the sequence is truncated,
// a continuation byte is malformed, or the encoding is overlong.
// The value is not range-checked here: surrogates and values above 0x10FFFF
// are passed through. When debug is set, the failure reason goes to stderr.
static int parse_utf8_unit(const uint8_t *buf, size_t length, int64_t *unitp, bool debug) {
    // min_value[n] is the smallest value that genuinely needs n bytes; a
    // decoded value below it should have used a shorter encoding.
    static const int64_t min_value[5] = {0, 0x00, 0x80, 0x800, 0x10000};

    if (length == 0) {
        if (debug) fprintf(stderr, "[utf8ref] unit at EOS\n");
        return -1;
    }

    const uint8_t lead = buf[0];
    int64_t value;
    const int seq_len = parse_utf8_prefix_byte(lead, &value);
    assert(seq_len == -1 || (1 <= seq_len && seq_len <= 4));
    if (seq_len == -1) {
        if (debug) fprintf(stderr, "[utf8ref] invalid prefix byte %x\n", (unsigned)lead);
        return -1;
    }
    assert(value >= 0);

    if (length < (size_t)seq_len) {
        if (debug) fprintf(stderr, "[utf8ref] prefix byte %x specifies length %d, but EOS\n", (unsigned)lead, seq_len);
        return -1;
    }

    // Fold in the continuation bytes, six payload bits each.
    const uint8_t *const end = buf + seq_len;
    for (const uint8_t *p = buf + 1; p != end; p++) {
        const uint8_t byte = *p;
        if ((byte >> 6) != 0x2) {  // top two bits must be 10
            if (debug) fprintf(stderr, "[utf8ref] invalid continuation byte %x\n", (unsigned)byte);
            return -1;
        }
        value = (value << 6) | (byte & 0x3F);
    }

    // check for overlong encodings
    if (value < min_value[seq_len]) {
        if (debug) fprintf(stderr, "[utf8ref] overlong encoding with prefix byte %x\n", (unsigned)lead);
        return -1;
    }

    *unitp = value;
    return seq_len;
}
+
// Straightforward reference UTF-8 validator used to cross-check validate_utf8.
// Walks the first `length` bytes of buf_ one sequence at a time and returns
// true only if every sequence decodes (see parse_utf8_unit) to a value that is
// neither a surrogate (0xD800..0xDFFF) nor above 0x10FFFF. An empty buffer is
// valid. When debug is set, the reason for rejecting the input is written to
// stderr.
static bool validate_utf8_reference(const char *buf_, size_t length, bool debug) {
    const uint8_t *buf = (const uint8_t*)buf_;

    size_t cursor = 0;
    while (cursor < length) {
        int64_t unit;
        const int len = parse_utf8_unit(buf + cursor, length - cursor, &unit, debug);
        assert(len == -1 || (1 <= len && len <= 4));
        if (len == -1) return false;
        assert(unit >= 0);

        // Surrogate code point
        if (0xD800 <= unit && unit <= 0xDFFF) {
            // PRIx64 rather than %lx: int64_t is not `long` on every platform.
            if (debug) fprintf(stderr, "[utf8ref] surrogate code point %" PRIx64 " (prefix byte %x)\n", (uint64_t)unit, (unsigned)buf[cursor]);
            return false;
        }
        // Maximal unicode value
        if (unit > 0x10FFFF) {
            if (debug) fprintf(stderr, "[utf8ref] out of range code point %" PRIx64 " (prefix byte %x)\n", (uint64_t)unit, (unsigned)buf[cursor]);
            return false;
        }

        cursor += (size_t)len;
    }

    return true;
}
+
// Encodes the value `unit` as a UTF-8 sequence at the start of buf_.
// Requires that the buffer has space for at least 4 bytes.
// Returns the number of bytes written (1..4). Surrogate values are not
// rejected here; callers that need valid UTF-8 must filter them out.
// A unit outside 0..0x10FFFF trips an assertion; when assertions are compiled
// out, nothing is written and 0 is returned.
static int utf8_serialise(char *buf_, int64_t unit) {
    uint8_t *buf = (uint8_t*)buf_;

    if (unit < 0 || unit > 0x10FFFF) {
        assert(false && "Invalid unit in utf8_serialise");
        return 0;
    }

    if (unit <= 0x7F) {
        buf[0] = (uint8_t)unit;
        return 1;
    }

    int num_bytes;
    uint8_t prefix;
    if (unit <= 0x7FF) {
        num_bytes = 2;
        prefix = 0xC0;
    } else if (unit <= 0xFFFF) {
        num_bytes = 3;
        prefix = 0xE0;
    } else {
        num_bytes = 4;
        prefix = 0xF0;
    }

    // Continuation bytes are filled back to front, peeling six bits off the
    // value each time; what remains afterwards fits in the prefix payload.
    for (int i = num_bytes - 1; i >= 1; i--) {
        buf[i] = (uint8_t)(0x80 | (unit & 0x3F));
        unit >>= 6;
    }
    buf[0] = (uint8_t)(prefix | unit);
    return num_bytes;
}
+
// Fills buf[0 .. length) with pseudo-random bytes drawn from random().
// Bytes at index `length` and beyond are left untouched.
//
// random() only yields 31 random bits per call, so storing its result as a
// whole `long` would leave the upper bytes of every chunk zero on platforms
// with a 64-bit long, and would also require buf to be suitably aligned.
// Instead, three full bytes are taken from each draw and written one at a
// time, so every byte of the buffer can take any of the 256 values.
static void fill_random_buffer(char *buf, size_t length) {
    size_t i = 0;
    while (i < length) {
        unsigned long bits = (unsigned long)random();
        for (int k = 0; k < 3 && i < length; k++) {
            buf[i++] = (char)(bits & 0xFF);
            bits >>= 8;
        }
    }
}
+
+DEFINE_TEST(utf8_unit1) {
+ EXPECT(validate_utf8("hello", 5));
+ EXPECT(validate_utf8_reference("hello", 5, true));
+ const char *str = "hello 🧀🇳🇱";
+ EXPECT(validate_utf8(str, strlen(str)));
+ EXPECT(validate_utf8_reference(str, strlen(str), true));
+ EXPECT(validate_utf8("\xe0\xad\xbc`j", 5));
+ EXPECT(validate_utf8_reference("\xe0\xad\xbc`j", 5, true));
+ EXPECT(validate_utf8("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12));
+ EXPECT(validate_utf8("\xc7\xaa_", 3));
+ EXPECT(validate_utf8_reference("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12, true));
+ EXPECT(!validate_utf8("\xf2\x98\xbcx", 4));
+ EXPECT(!validate_utf8_reference("\xf2\x98\xbcx", 4, false));
+ return 0;
+}
+
+DEFINE_TEST(utf8_random) {
+ const int max_length = 100;
+ const int num_tests = 10000000;
+
+ char *buffer = malloc(max_length + 1, char);
+ for (int test = 0; test < num_tests; test++) {
+ // fprintf(stderr, "== test = %d\n", test);
+ const int length = random() % max_length;
+ fill_random_buffer(buffer, length);
+ const bool ret_ref = validate_utf8_reference(buffer, length, false);
+ const bool ret_impl = validate_utf8(buffer, length);
+ if (ret_ref != ret_impl) {
+ fprintf(stderr, "buffer: ");
+ print_buffer(stderr, buffer, length);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "reference -> %d, implementation -> %d\n", ret_ref, ret_impl);
+ EXPECTRET(1, false && "validate_utf8_reference == validate_utf8");
+ }
+ }
+ free(buffer);
+
+ return 0;
+}
+
+DEFINE_TEST(utf8_random_valid) {
+ const int max_length = 100;
+ const int num_tests = 3000000;
+
+ char *buffer = malloc(max_length + 1, char);
+ for (int test = 0; test < num_tests; test++) {
+ int length = random() % max_length;
+
+ int cursor = 0;
+ while (cursor + 4 <= length) {
+ const int64_t unit = random() % 0x110000;
+ if (0xD800 <= unit && unit <= 0xDFFF) continue; // surrogate
+ cursor += utf8_serialise(buffer + cursor, unit);
+ }
+ length = cursor;
+
+ const bool ret_ref = validate_utf8_reference(buffer, length, true);
+ if (!ret_ref) {
+ fprintf(stderr, "buffer: ");
+ print_buffer(stderr, buffer, length);
+ fprintf(stderr, "\n");
+ EXPECTRET(1, false && "validate_utf8_reference on valid string");
+ }
+
+ const bool ret_impl = validate_utf8(buffer, length);
+ if (!ret_impl) {
+ fprintf(stderr, "buffer: ");
+ print_buffer(stderr, buffer, length);
+ fprintf(stderr, "\n");
+ EXPECTRET(1, false && "validate_utf8 on valid string");
+ }
+ }
+ free(buffer);
+
+ return 0;
+}
+
// Exhaustive differential test over every possible 4-byte buffer: validate_utf8
// must agree with the reference validator on all 2^32 of them.
DEFINE_TEST(utf8_exhaustive_1) {
    // The counter is 64-bit so that the loop bound 2^32 is representable.
    for (int64_t number = 0; number < 0x100000000LL; number++) {
        // Spell out the four bytes (least significant first) instead of
        // reinterpreting the counter's storage: on a big-endian machine the
        // first four bytes of the int64_t would always be zero.
        const uint8_t bytes[4] = {
            (uint8_t)(number & 0xFF),
            (uint8_t)((number >> 8) & 0xFF),
            (uint8_t)((number >> 16) & 0xFF),
            (uint8_t)((number >> 24) & 0xFF),
        };
        EXPECT(
            validate_utf8_reference((const char*)bytes, 4, false)
                == validate_utf8((const char*)bytes, 4)
        );
    }
    return 0;
}