💾 Archived View for thrig.me › blog › 2023 › 08 › 15 › isutf8.c captured on 2024-12-17 at 11:20:35.

View Raw

More Information

⬅️ Previous capture (2023-09-08)

-=-=-=-=-=-=-

// isutf8 - what are the odds of random bytes being valid UTF-8?
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// doas pkg_add libutf8proc
#include <libutf8proc/utf8proc.h>

double
byteaudit(size_t len, size_t trials)
{
	size_t pass = 0;
	assert(len > 0);
	utf8proc_int32_t ref;
	utf8proc_uint8_t *in = malloc(len);
	if (!in) abort();
	for (size_t i = 0; i < trials; i++) {
		// PORTABILITY this may need libbsd or something else
		arc4random_buf(in, len);
		utf8proc_uint8_t *ip  = in;
		utf8proc_ssize_t left = (utf8proc_ssize_t) len;
		while (1) {
			utf8proc_ssize_t ret = utf8proc_iterate(ip, left, &ref);
			if (ret < 1) break;
			ip += ret;
			left -= ret;
			if (left == 0) {
				pass++;
				break;
			}
		}
	}
	free(in);
	return (double) pass / (double) trials * 100.0;
}

/*
 * Prints, for each buffer length from 1 to 8 bytes, the length followed by
 * the percentage of random buffers of that length that byteaudit() reports
 * as passing, using ten million trials per length.
 */
int
main(void)
{
	for (size_t i = 1; i <= 8; i++) {
		// %zu is the matching conversion for size_t; %lu is only correct
		// where size_t happens to be unsigned long
		printf("%zu %.2lf\n", i, byteaudit(i, 10000000U));
	}
	return 0;
}