UTF-8 是每個字元 1 ~ 4 bytes 的變動長度編碼; Unicode 碼位 (code point) 若以 UTF-32 表示, 則固定為 4 bytes. 以下程式示範兩者之間的轉換:
#include "stdio.h"
typedef unsigned char u8int; // raw byte; unsigned so that values >= 0x80 compare correctly
typedef const char* c_str;   // read-only NUL-terminated string

// Print the Unicode code points of a NUL-terminated UTF-8 string to stdout,
// in hexadecimal.
//
// utf8: string to decode; a null pointer is accepted and prints nothing.
//
// Output format: every ASCII byte is printed immediately as "%2x, ". A
// multi-byte character is printed as "%5x, " once the byte that follows it
// is seen, or as "%5x" (no separator) when it is the last character.
//
// The input is not validated: stray or missing continuation bytes are not
// detected and simply produce a meaningless value.
void toHex(u8int *utf8) {
    if (!utf8) {
        return;
    }

    // Code point being assembled. Unsigned so that the repeated shift below
    // cannot overflow a signed int on malformed input, and so that it matches
    // what the %x conversion expects.
    unsigned wc = 0;

    for (; *utf8 != 0; utf8++) {
        if ((*utf8 & 0xc0) == 0x80) {
            // 10xxxxxx: continuation byte, append its 6 payload bits.
            wc = (wc << 6) | (*utf8 & 0x3fu);
            continue;
        }

        // A non-continuation byte ends the previous multi-byte character.
        // The test must be >= 0x80: U+0080 is itself a two-byte character.
        if (wc >= 0x80) {
            printf("%5x, ", wc);
        }

        if (*utf8 < 0x80) {
            // 0xxxxxxx: single-byte character, print right away.
            printf("%2x, ", (unsigned)*utf8);
            wc = 0; // nothing pending
        } else if (*utf8 < 0xe0) {
            wc = *utf8 & 0x1fu; // 110xxxxx: 2-byte lead, 5 + 6 = 11 bits
        } else if (*utf8 < 0xf0) {
            wc = *utf8 & 0x0fu; // 1110xxxx: 3-byte lead, 4 + 12 = 16 bits
        } else {
            wc = *utf8 & 0x07u; // 11110xxx: 4-byte lead, 3 + 18 = 21 bits
        }
    }

    // Flush a multi-byte character that ended the string.
    if (wc >= 0x80) {
        printf("%5x", wc);
    }
}
// Write one Unicode code point to stdout, encoded as UTF-8 (a putchar for
// wide characters).
//
// wc: code point to encode. No validation is done: surrogates are encoded
//     like any other value, anything below 0x80 (negative values included)
//     is written as a single byte, and only the low 21 bits of values at or
//     above 0x10000 are used.
//
// NOTE: this name is also a standard library function declared in <wchar.h>;
// including that header in this file would clash with this definition.
void putwc(int wc){
    if (wc < 0x80) {
        // 7 bits: 0xxxxxxx
        printf("%c", wc);
    } else if (wc < 0x800) {
        // 11 bits: 110xxxxx 10xxxxxx (5 + 6). The lead byte carries the bits
        // above the low six, so the shift is 6.
        printf("%c%c",
            0xc0 | (wc >> 6),
            0x80 | (wc & 0x3f));
    } else if (wc < 0x10000) {
        // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx (4 + 6 + 6)
        printf("%c%c%c",
            0xe0 | (wc >> 12),
            0x80 | ((wc >> 6) & 0x3f),
            0x80 | (wc & 0x3f));
    } else {
        // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3 + 6 + 6 + 6)
        printf("%c%c%c%c",
            0xf0 | ((wc >> 18) & 7),
            0x80 | ((wc >> 12) & 0x3f),
            0x80 | ((wc >> 6) & 0x3f),
            0x80 | (wc & 0x3f));
    }
}
// Convenience overload for ordinary C strings: reinterprets the characters
// as unsigned bytes and forwards to toHex(u8int *).
void toHex(c_str utf8) {
    toHex(reinterpret_cast<u8int *>(const_cast<char *>(utf8)));
}
int main(){
c_str src = "Taiwan,台灣,🀄,0123456789,\U0001F004";
toHex(src);
printf("\n%s\n", src);
putwc(0x1f004);
printf("\n");
return 0;
}
編譯並執行 (toHex 使用了函式多載, 因此必須以 C++ 編譯器 g++ 編譯): g++ a.c && ./a.out
54, 61, 69, 77, 61, 6e, 2c, 53f0, 7063, 2c, 1f004, 2c, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 2c, 1f004
Taiwan,台灣,🀄,0123456789,🀄
🀄
沒有留言:
張貼留言