💾 Archived View for blinkyshark.chickenkiller.com › utf8.gmi captured on 2023-06-16 at 16:20:04. Gemini links have been rewritten to link to archived content

View Raw

More Information

-=-=-=-=-=-=-

UTF-8

Algorithm for converting a single Unicode code point to its UTF-8 byte sequence

// Encode one Unicode code point as a UTF-8 byte string.
//
// uni: the code point to encode.
//
// Returns the 1- to 4-byte UTF-8 encoding of `uni`:
//   U+0000   .. U+007F    -> 0xxxxxxx
//   U+0080   .. U+07FF    -> 110xxxxx 10xxxxxx
//   U+0800   .. U+FFFF    -> 1110xxxx 10xxxxxx 10xxxxxx
//   U+10000  .. U+10FFFF  -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// Values that are not Unicode scalar values (negative numbers, the UTF-16
// surrogates U+D800..U+DFFF, and anything above U+10FFFF) cannot be
// represented in well-formed UTF-8. For those the encoding of U+FFFD
// REPLACEMENT CHARACTER (EF BF BD) is returned, so the result is always
// valid UTF-8 regardless of build mode (no reliance on assert/NDEBUG).
std::string utf8_unicode(int32_t uni)
{
        constexpr int32_t k6 = 0b11'1111;      // six payload bits per continuation byte
        constexpr int32_t ext = 0b1000'0000;   // continuation-byte marker (10xxxxxx)

        const bool is_surrogate = uni >= 0xD800 && uni <= 0xDFFF;
        if(uni < 0 || uni > 0x10FFFF || is_surrogate) {
                return "\xEF\xBF\xBD";
        }

        // Continuation byte carrying the six bits of `uni` that start at `shift`.
        const auto tail = [&](int shift) {
                return static_cast<char>(ext | ((uni >> shift) & k6));
        };

        if(uni < 0x80) {
                // ASCII is encoded as itself; this includes U+0000 (a single NUL byte).
                return std::string(1, static_cast<char>(uni));
        }
        if(uni < 0x0800) {
                return std::string{static_cast<char>(0b1100'0000 | (uni >> 6)),
                                   tail(0)};
        }
        if(uni < 0x10000) {
                return std::string{static_cast<char>(0b1110'0000 | (uni >> 12)),
                                   tail(6),
                                   tail(0)};
        }
        return std::string{static_cast<char>(0b1111'0000 | (uni >> 18)),
                           tail(12),
                           tail(6),
                           tail(0)};
}

SEE ALSO

Unicode

EXITS

Wikipedia: UTF-8 Encoding