💾 Archived View for thrig.me › blog › 2023 › 08 › 15 › isutf8.c captured on 2024-12-17 at 11:20:35.
⬅️ Previous capture (2023-09-08)
-=-=-=-=-=-=-
// istuf8 - what are the odds of random bytes being valid UTF-8? #include <assert.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> // doas pkg_add libutf8proc #include <libutf8proc/utf8proc.h> double byteaudit(size_t len, size_t trials) { size_t pass = 0; assert(len > 0); utf8proc_int32_t ref; utf8proc_uint8_t *in = malloc(len); if (!in) abort(); for (size_t i = 0; i < trials; i++) { // PORTABILITY this may need libbsd or something else arc4random_buf(in, len); utf8proc_uint8_t *ip = in; utf8proc_ssize_t left = (utf8proc_ssize_t) len; while (1) { utf8proc_ssize_t ret = utf8proc_iterate(ip, left, &ref); if (ret < 1) break; ip += ret; left -= ret; if (left == 0) { pass++; break; } } } free(in); return (double) pass / (double) trials * 100.0; } int main(void) { for (size_t i = 1; i <= 8; i++) printf("%lu %.2lf\n", i, byteaudit(i, 10000000U)); }