UTF-8
Algorithm for encoding a single Unicode code point as a UTF-8 byte sequence
/// Encodes one Unicode code point as a UTF-8 byte sequence.
///
/// @param uni Code point to encode.
/// @return The 1- to 4-byte UTF-8 encoding of `uni`. Inputs that are not
///         Unicode scalar values (negative numbers, the surrogate range
///         U+D800..U+DFFF, or anything above U+10FFFF) cannot be represented
///         in well-formed UTF-8 and are encoded as U+FFFD REPLACEMENT
///         CHARACTER (bytes EF BF BD) instead.
std::string utf8_unicode(int32_t uni)
{
    constexpr int32_t kReplacement = 0xFFFD;      // substituted for invalid input
    constexpr int32_t kLow6 = 0b11'1111;          // payload bits of a continuation byte
    constexpr int32_t kContinuation = 0b1000'0000; // marker bits 10xxxxxx

    // Validate up front so every branch below only ever sees a scalar value;
    // otherwise a negative input would slip into the single-byte branch.
    const bool is_surrogate = uni >= 0xD800 && uni <= 0xDFFF;
    if (uni < 0 || uni > 0x10FFFF || is_surrogate) {
        uni = kReplacement;
    }

    // Every value passed here is already in 0..255; go through unsigned char
    // so the conversion to a (possibly signed) char is explicit.
    const auto to_byte = [](int32_t value) {
        return static_cast<char>(static_cast<unsigned char>(value));
    };
    const auto continuation = [&](int shift) {
        return to_byte(kContinuation + ((uni >> shift) & kLow6));
    };

    if (uni < 0x80) {
        // 0xxxxxxx
        return std::string(1, to_byte(uni));
    }
    if (uni < 0x800) {
        // 110xxxxx 10xxxxxx
        return std::string{to_byte(0b1100'0000 + (uni >> 6)),
                           continuation(0)};
    }
    if (uni < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return std::string{to_byte(0b1110'0000 + (uni >> 12)),
                           continuation(6),
                           continuation(0)};
    }
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return std::string{to_byte(0b1111'0000 + (uni >> 18)),
                       continuation(12),
                       continuation(6),
                       continuation(0)};
}
SEE ALSO
Unicode
EXITS
Wikipedia: UTF-8 Encoding