學習讓生命更豐富: 關於 UTF8 與 Unicode

UTF-8 是 1 ~ 4 bytes 的變動長度字元編碼; Unicode 碼位 (code point) 若以 UTF-32 表示, 則是固定 4 bytes:

#include "stdio.h"
// Unsigned byte type; toHex() walks the UTF-8 input through a pointer to it.
typedef unsigned char u8int;
// Read-only, NUL-terminated C string.
typedef const char*   c_str;

/*
 * Print the code points of a NUL-terminated UTF-8 string in hexadecimal.
 *
 * ASCII bytes are printed at once as "%2x, ". A multi-byte sequence is
 * accumulated in `wc` and printed as "%5x, " when the next lead or ASCII
 * byte is seen, or as "%5x" (no trailing separator) at the end of the string.
 *
 * A NULL pointer prints nothing. The input is not validated: the number of
 * continuation bytes following a lead byte is never checked.
 */
void toHex(u8int *utf8) {
    if (!utf8) return;

    int wc = 0; // code point being accumulated; 0 when nothing is pending
    for (; *utf8 != 0; utf8++) {
        if ((*utf8 & 0xc0) == 0x80) { // 10xxxxxx: continuation byte
            wc <<= 6;                 // make room for 6 more payload bits
            wc |= *utf8 & 0x3f;
            continue;
        }

        // A new character starts here, so flush the previous multi-byte one.
        // The smallest multi-byte code point is U+0080, hence ">=" rather
        // than ">" (with ">" U+0080 itself was never printed).
        if (wc >= 0x80) printf("%5x, ", wc);

        if (*utf8 < 0x80) {           // 0xxxxxxx: 1-byte sequence
            printf("%2x, ", *utf8);   // decode immediately
            wc = 0;                   // nothing pending
        } else if (*utf8 < 0xe0) {
            wc = *utf8 & 0x1f;        // 2 bytes: 5 + 6 = 11 bits
        } else if (*utf8 < 0xf0) {
            wc = *utf8 & 0xf;         // 3 bytes: 4 + 12 = 16 bits
        } else {
            wc = *utf8 & 0x7;         // 4 bytes: 3 + 18 = 21 bits
        }
    }

    // Flush a multi-byte character that ends the string.
    if (wc >= 0x80) printf("%5x", wc);
}

/*
 * Write the UTF-8 encoding of the code point `wc` to stdout
 * (a putchar for wide characters).
 *
 *   wc < 0x80     -> 1 byte : 0xxxxxxx
 *   wc < 0x800    -> 2 bytes: 110xxxxx 10xxxxxx                   (5 + 6 bits)
 *   wc < 0x10000  -> 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx          (4 + 6 + 6 bits)
 *   otherwise     -> 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3 + 6 + 6 + 6 bits)
 *
 * No range validation is done; in the 4-byte form only the low 21 bits are used.
 */
void putwc(int wc){
    if (wc < 0x80) {
        printf("%c", wc);
    } else if (wc < 0x800) {
        // The trail byte carries the low 6 bits, so the lead byte takes the
        // bits above them: shift by 6 (the previous shift by 7 lost a bit).
        printf("%c%c",
            0xc0 | (wc >> 6),
            0x80 | (wc & 0x3f)
        );
    } else if (wc < 0x10000) {
        printf("%c%c%c",
            0xe0 | (wc >> 12),
            0x80 | ((wc >> 6) & 0x3f),
            0x80 | (wc        & 0x3f)
        );
    } else {
        printf("%c%c%c%c",
            0xf0 | ((wc >> 18) & 7),
            0x80 | ((wc >> 12) & 0x3f),
            0x80 | ((wc >> 6) & 0x3f),
            0x80 | (wc        & 0x3f)
        );
    }
}

// Convenience overload: view the C string as unsigned bytes and forward to
// the u8int* version of toHex.
void toHex(c_str utf8) {
    u8int *bytes = (u8int *)utf8;
    toHex(bytes);
}
int main(){
   c_str src = "Taiwan,台灣,🀄,0123456789,\U0001F004";
   toHex(src);
   printf("\n%s\n", src);
   putwc(0x1f004);
   printf("\n");
   return 0;
}

編譯並執行 g++ a.c && ./a.out

54, 61, 69, 77, 61, 6e, 2c, 53f0, 7063, 2c, 1f004, 2c, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 2c, 1f004
Taiwan,台灣,🀄,0123456789,🀄
🀄

學習讓生命更豐富

2021年8月1日星期日

關於 UTF8 與 Unicode

沒有留言:

張貼留言

使用 python 簡單實現多 cpu 平行處理

檢舉濫用情形

2021年8月1日 星期日

關於 UTF8 與 Unicode

沒有留言:

張貼留言

使用 python 簡單實現多 cpu 平行處理

2021年8月1日星期日