#include #include #include #include "test_framework.h" #include "../global.h" #include "../utf8.h" // Returns the number of bytes in the utf8 unit, or -1 on invalid input. // If the parse is successful, puts the bits that are part of the unit being // parsed in *unit. static int parse_utf8_prefix_byte(uint8_t b, int64_t *unit) { if ((b & 0b10000000) == 0b00000000) {*unit = b & 0b01111111; return 1;} if ((b & 0b11100000) == 0b11000000) {*unit = b & 0b00011111; return 2;} if ((b & 0b11110000) == 0b11100000) {*unit = b & 0b00001111; return 3;} if ((b & 0b11111000) == 0b11110000) {*unit = b & 0b00000111; return 4;} return -1; } // Returns length of the parsed utf8 unit, and puts the parsed value in *unitp. // No range checking is done. static int parse_utf8_unit(const uint8_t *buf, size_t length, int64_t *unitp, bool debug) { if (length == 0) { if (debug) fprintf(stderr, "[utf8ref] unit at EOS\n"); return -1; } int64_t unit; const int num_bytes = parse_utf8_prefix_byte(buf[0], &unit); assert(num_bytes == -1 || (1 <= num_bytes && num_bytes <= 4)); if (num_bytes == -1) { if (debug) fprintf(stderr, "[utf8ref] invalid prefix byte %x\n", (unsigned)buf[0]); return -1; } assert(unit >= 0); if (length < (size_t)num_bytes) { if (debug) fprintf(stderr, "[utf8ref] prefix byte %x specifies length %d, but EOS\n", (unsigned)buf[0], num_bytes); return -1; } for (int i = 1; i < num_bytes; i++) { if ((buf[i] & 0b11000000) != 0b10000000) { if (debug) fprintf(stderr, "[utf8ref] invalid continuation byte %x\n", (unsigned)buf[i]); return -1; } unit = (unit << 6) | (buf[i] & 0b00111111); } // check for overlong encodings if ((num_bytes >= 2 && unit <= 0x7F) || (num_bytes >= 3 && unit <= 0x7FF) || (num_bytes >= 4 && unit <= 0xFFFF)) { if (debug) fprintf(stderr, "[utf8ref] overlong encoding with prefix byte %x\n", (unsigned)buf[0]); return -1; } *unitp = unit; return num_bytes; } static bool validate_utf8_reference(const char *buf_, size_t length, bool debug) { const uint8_t *buf = (const uint8_t*)buf_; size_t cursor = 0; while (cursor < length) { int64_t unit; int len = parse_utf8_unit(buf + cursor, length - cursor, &unit, debug); assert(len == -1 || (1 <= len && len <= 4)); if (len == -1) return false; assert(unit >= 0); // fprintf(stderr, "unit = 0x%lx\n", unit); // Surrogate code point if (0xD800 <= unit && unit <= 0xDFFF) { if (debug) fprintf(stderr, "[utf8ref] surrogate code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]); return false; } // Maximal unicode value if (unit > 0x10FFFF) { if (debug) fprintf(stderr, "[utf8ref] out of range code point %lx (prefix byte %x)\n", unit, (unsigned)buf[cursor]); return false; } cursor += len; } return true; } // Requires that the buffer has space for at least 4 bytes. // Returns the number of bytes written. static int utf8_serialise(char *buf_, int64_t unit) { uint8_t *buf = (uint8_t*)buf_; #define PLACE_CONTINUATION_BYTE(idx_) \ {buf[(idx_)] = 0x80 | (unit & 0x3F); unit >>= 6;} if (unit <= 0x7F) { buf[0] = unit; return 1; } if (unit <= 0x7FF) { PLACE_CONTINUATION_BYTE(1); buf[0] = 0xC0 | (unit & 0x1F); return 2; } if (unit <= 0xFFFF) { PLACE_CONTINUATION_BYTE(2); PLACE_CONTINUATION_BYTE(1); buf[0] = 0xE0 | (unit & 0x0F); return 3; } if (unit <= 0x10FFFF) { PLACE_CONTINUATION_BYTE(3); PLACE_CONTINUATION_BYTE(2); PLACE_CONTINUATION_BYTE(1); buf[0] = 0xF0 | (unit & 0x07); return 4; } assert(false && "Invalid unit in utf8_serialise"); #undef PLACE_CONTINUATION_BYTE } static void fill_random_buffer(char *buf, size_t length) { size_t i = 0; while (i + sizeof(long) < length) { *(long*)&buf[i] = random(); i += sizeof(long); } while (i < length) buf[i++] = random(); } DEFINE_TEST(utf8_unit1) { EXPECT(validate_utf8("hello", 5)); EXPECT(validate_utf8_reference("hello", 5, true)); const char *str = "hello 🧀🇳🇱"; EXPECT(validate_utf8(str, strlen(str))); EXPECT(validate_utf8_reference(str, strlen(str), true)); EXPECT(validate_utf8("\xe0\xad\xbc`j", 5)); EXPECT(validate_utf8_reference("\xe0\xad\xbc`j", 5, true)); EXPECT(validate_utf8("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12)); EXPECT(validate_utf8("\xc7\xaa_", 3)); EXPECT(validate_utf8_reference("\xd3\xb0\\i\x00\x00\x00\x001\xc7\xaa_", 12, true)); EXPECT(!validate_utf8("\xf2\x98\xbcx", 4)); EXPECT(!validate_utf8_reference("\xf2\x98\xbcx", 4, false)); return 0; } DEFINE_TEST(utf8_random) { const int max_length = 100; const int num_tests = 10000000; char *buffer = malloc(max_length + 1, char); for (int test = 0; test < num_tests; test++) { // fprintf(stderr, "== test = %d\n", test); const int length = random() % max_length; fill_random_buffer(buffer, length); const bool ret_ref = validate_utf8_reference(buffer, length, false); const bool ret_impl = validate_utf8(buffer, length); if (ret_ref != ret_impl) { fprintf(stderr, "buffer: "); print_buffer(stderr, buffer, length); fprintf(stderr, "\n"); fprintf(stderr, "reference -> %d, implementation -> %d\n", ret_ref, ret_impl); EXPECTRET(1, false && "validate_utf8_reference == validate_utf8"); } } free(buffer); return 0; } DEFINE_TEST(utf8_random_valid) { const int max_length = 100; const int num_tests = 3000000; char *buffer = malloc(max_length + 1, char); for (int test = 0; test < num_tests; test++) { int length = random() % max_length; int cursor = 0; while (cursor + 4 <= length) { const int64_t unit = random() % 0x110000; if (0xD800 <= unit && unit <= 0xDFFF) continue; // surrogate cursor += utf8_serialise(buffer + cursor, unit); } length = cursor; const bool ret_ref = validate_utf8_reference(buffer, length, true); if (!ret_ref) { fprintf(stderr, "buffer: "); print_buffer(stderr, buffer, length); fprintf(stderr, "\n"); EXPECTRET(1, false && "validate_utf8_reference on valid string"); } const bool ret_impl = validate_utf8(buffer, length); if (!ret_impl) { fprintf(stderr, "buffer: "); print_buffer(stderr, buffer, length); fprintf(stderr, "\n"); EXPECTRET(1, false && "validate_utf8 on valid string"); } } free(buffer); return 0; } DEFINE_TEST(utf8_exhaustive_1) { for (int64_t number = 0; number < 0x100000000LL; number++) { EXPECT( validate_utf8_reference((const char*)&number, 4, false) == validate_utf8((const char*)&number, 4) ); } return 0; }