// istuf8 - what are the odds of random bytes being valid UTF-8? #include #include #include #include // doas pkg_add libutf8proc #include double byteaudit(size_t len, size_t trials) { size_t pass = 0; assert(len > 0); utf8proc_int32_t ref; utf8proc_uint8_t *in = malloc(len); if (!in) abort(); for (size_t i = 0; i < trials; i++) { // PORTABILITY this may need libbsd or something else arc4random_buf(in, len); utf8proc_uint8_t *ip = in; utf8proc_ssize_t left = (utf8proc_ssize_t) len; while (1) { utf8proc_ssize_t ret = utf8proc_iterate(ip, left, &ref); if (ret < 1) break; ip += ret; left -= ret; if (left == 0) { pass++; break; } } } free(in); return (double) pass / (double) trials * 100.0; } int main(void) { for (size_t i = 1; i <= 8; i++) printf("%lu %.2lf\n", i, byteaudit(i, 10000000U)); }