💾 Archived View for thrig.me › blog › 2023 › 08 › 15 › isutf82.c captured on 2024-05-26 at 15:44:48.

View Raw

More Information

⬅️ Previous capture (2023-09-08)

-=-=-=-=-=-=-

// isutf8 II - what are the odds of random bytes being valid UTF-8?
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// doas pkg_add libutf8proc
#include <libutf8proc/utf8proc.h>

double
byteaudit(size_t len, size_t trials)
{
	size_t pass = 0;
	assert(len > 0);
	utf8proc_int32_t ref;
	switch (len) {
	case 1: { // if we do not get 50% from this, something is wrong
		utf8proc_uint8_t i = 0;
		trials             = 256;
		do {
			if (utf8proc_iterate(&i, 1, &ref) == 1) pass++;
		} while (++i != 0);
		break;
	}
	case 2: {
		utf8proc_uint8_t ij[2] = {0};
		uint16_t *n            = (uint16_t *) &ij;
		trials                 = 65536;
		do {
			utf8proc_ssize_t left = 2;
			utf8proc_uint8_t *ip  = ij;
			while (1) {
				utf8proc_ssize_t ret =
				  utf8proc_iterate(ip, left, &ref);
				if (ret < 1) break;
				ip += ret;
				left -= ret;
				if (left == 0) {
					pass++;
					break;
				}
			}
		} while (++*n != 0);
		break;
	}
	default: {
		utf8proc_uint8_t *in = malloc(len);
		if (!in) abort();
		for (size_t i = 0; i < trials; i++) {
			// PORTABILITY this may need libbsd or something else
			arc4random_buf(in, len);
			utf8proc_uint8_t *ip  = in;
			utf8proc_ssize_t left = (utf8proc_ssize_t) len;
			while (1) {
				utf8proc_ssize_t ret =
				  utf8proc_iterate(ip, left, &ref);
				if (ret < 1) break;
				ip += ret;
				left -= ret;
				if (left == 0) {
					pass++;
					break;
				}
			}
		}
		free(in);
	}
	}
	return (double) pass / (double) trials * 100.0;
}

// Prints 30 lines of output. Each line lists the byteaudit() percentage
// for buffer lengths 1 through 8, requesting 10000 trials per call, with
// every value followed by a space.
int
main(void)
{
	const int rows      = 30;
	const size_t maxlen = 8;
	const size_t trials = 10000U;

	int row = 0;
	while (row < rows) {
		size_t len = 1;
		while (len <= maxlen) {
			double pct = byteaudit(len, trials);
			printf("%.2lf ", pct);
			len++;
		}
		putchar('\n');
		row++;
	}
	return 0;
}