💾 Archived View for thrig.me › blog › 2023 › 08 › 15 › isutf82.c captured on 2024-12-17 at 11:20:38.
⬅️ Previous capture (2023-09-08)
-=-=-=-=-=-=-
// istuf8 II - what are the odds of random bytes being valid UTF-8? #include <assert.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> // doas pkg_add libutf8proc #include <libutf8proc/utf8proc.h> double byteaudit(size_t len, size_t trials) { size_t pass = 0; assert(len > 0); utf8proc_int32_t ref; switch (len) { case 1: { // if we do not get 50% from this, something is wrong utf8proc_uint8_t i = 0; trials = 256; do { if (utf8proc_iterate(&i, 1, &ref) == 1) pass++; } while (++i != 0); break; } case 2: { utf8proc_uint8_t ij[2] = {0}; uint16_t *n = (uint16_t *) &ij; trials = 65536; do { utf8proc_ssize_t left = 2; utf8proc_uint8_t *ip = ij; while (1) { utf8proc_ssize_t ret = utf8proc_iterate(ip, left, &ref); if (ret < 1) break; ip += ret; left -= ret; if (left == 0) { pass++; break; } } } while (++*n != 0); break; } default: { utf8proc_uint8_t *in = malloc(len); if (!in) abort(); for (size_t i = 0; i < trials; i++) { // PORTABILITY this may need libbsd or something else arc4random_buf(in, len); utf8proc_uint8_t *ip = in; utf8proc_ssize_t left = (utf8proc_ssize_t) len; while (1) { utf8proc_ssize_t ret = utf8proc_iterate(ip, left, &ref); if (ret < 1) break; ip += ret; left -= ret; if (left == 0) { pass++; break; } } } free(in); } } return (double) pass / (double) trials * 100.0; } int main(void) { for (int j = 0; j < 30; j++) { for (size_t i = 1; i <= 8; i++) printf("%.2lf ", byteaudit(i, 10000U)); putchar('\n'); } }