場景:
1. 分析數據時, 獲取到的數據是字符串, 但它有可能不是正確、完整的 utf8 字符串, 打印出來或輸出到文件時就會顯示為亂碼.
這時候就需要過濾掉非法字節, 使 utf8 字符串能正確顯示, 比如把非法字節替換為 #.
代碼:
1. 這個函數的特性是逐個字節進行判斷, 適合任意長度、任意構造的(可能無效的) utf8 字符串.
// Returns the byte length (1-4) of the well-formed sequence that starts at
// `p`, or 0 when `p[0]` does not start one that fits entirely inside [p, end).
// Precondition: p < end.
//
// Accepted single bytes are TAB, LF, CR and the printable ASCII range
// 0x20-0x7E; every other ASCII byte (control characters, DEL) is rejected.
// Multi-byte sequences are accepted only when they are well-formed UTF-8:
// no overlong encodings, no UTF-16 surrogates, nothing above U+10FFFF.
static int FilterUtf8SequenceLength(const unsigned char * p, const unsigned char * end)
{
    const unsigned char lead = p[0];
    if(lead <= 0x7F)
    {
        const bool keep = lead == 0x09 || lead == 0x0A || lead == 0x0D ||
                          (0x20 <= lead && lead <= 0x7E);
        return keep ? 1 : 0;
    }

    // Number of continuation bytes required by the lead byte, and the range
    // the FIRST continuation byte must fall in. The range is narrower than
    // 0x80-0xBF for a few lead bytes; that is what rules out overlong forms,
    // surrogates and code points beyond U+10FFFF.
    int trailing = 0;
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    if(0xC2 <= lead && lead <= 0xDF)
    {
        // 110X XXXX (0xC0 and 0xC1 would only encode overlong ASCII)
        trailing = 1;
    }
    else if(0xE0 <= lead && lead <= 0xEF)
    {
        // 1110 XXXX
        trailing = 2;
        if(lead == 0xE0)
        {
            second_min = 0xA0; // below 0xA0 is an overlong encoding
        }
        else if(lead == 0xED)
        {
            second_max = 0x9F; // above 0x9F is a UTF-16 surrogate
        }
    }
    else if(0xF0 <= lead && lead <= 0xF4)
    {
        // 1111 0XXX
        trailing = 3;
        if(lead == 0xF0)
        {
            second_min = 0x90; // below 0x90 is an overlong encoding
        }
        else if(lead == 0xF4)
        {
            second_max = 0x8F; // above 0x8F exceeds U+10FFFF
        }
    }
    else
    {
        // Stray continuation byte (0x80-0xBF), 0xC0/0xC1, or 0xF5-0xFF.
        return 0;
    }

    // The sequence is truncated by the end of the buffer.
    if(end - p <= trailing)
    {
        return 0;
    }
    if(p[1] < second_min || second_max < p[1])
    {
        return 0;
    }
    for(int i = 2; i <= trailing; ++i)
    {
        // Remaining bytes must be plain continuation bytes: 10XX XXXX
        if((p[i] & 0xC0) != 0x80)
        {
            return 0;
        }
    }
    return trailing + 1;
}

// Sanitizes a byte buffer in place so that it contains only well-formed UTF-8
// made of displayable characters.
//
// Every byte that is not part of a well-formed UTF-8 sequence is overwritten
// with '#'; this includes stray continuation bytes, each byte of a truncated
// or malformed multi-byte sequence, and ASCII bytes other than TAB, LF, CR
// and 0x20-0x7E. The buffer length never changes.
//
// string: buffer to sanitize; it is modified in place.
// length: number of bytes in `string`.
// Returns false when `string` is null or `length` is negative (nothing is
// written in that case), true otherwise.
bool IREUtil::FilterUtf8(unsigned char * string,int length)
{
    if(!string || length < 0)
    {
        return false;
    }
    unsigned char * bytes = string;
    unsigned char * const end = bytes + length;
    while(bytes != end)
    {
        const int sequence_length = FilterUtf8SequenceLength(bytes, end);
        if(sequence_length == 0)
        {
            // Only this byte is replaced; the bytes that follow are
            // re-examined on their own, so a valid sequence that starts
            // right after a broken one is still preserved.
            bytes[0] = '#';
            bytes += 1;
        }
        else
        {
            // sequence_length never exceeds end - bytes, so this cannot
            // step past `end`.
            bytes += sequence_length;
        }
    }
    return true;
}
2. 下面這個判斷字符串是否為合法 utf8 的函數來自網絡, 原文地址沒留下:
// Checks whether a byte buffer consists solely of well-formed UTF-8 made of
// displayable characters.
//
// Accepted single bytes are TAB (0x09), LF (0x0A), CR (0x0D) and printable
// ASCII 0x20-0x7E; other ASCII control characters and DEL are rejected (test
// `lead <= 0x7F` instead to allow them). Multi-byte sequences must be
// well-formed: no overlong encodings, no UTF-16 surrogates, and nothing above
// U+10FFFF (plane 16).
//
// string: buffer to inspect; it is never read outside [string, string+length).
// length: number of bytes in `string`.
// Returns true when every byte belongs to an accepted sequence (an empty
// buffer is accepted); false when `string` is null, `length` is negative, or
// any sequence is invalid or cut off by the end of the buffer.
bool IREUtil::is_utf8(const unsigned char * string,int length)
{
    if(!string || length < 0)
    {
        return false;
    }
    const unsigned char * bytes = string;
    const unsigned char * const end = bytes + length;
    while(bytes != end)
    {
        const unsigned char lead = bytes[0];

        // ASCII
        if(lead == 0x09 || lead == 0x0A || lead == 0x0D ||
           (0x20 <= lead && lead <= 0x7E))
        {
            bytes += 1;
            continue;
        }

        // Number of continuation bytes the lead byte requires, and the range
        // allowed for the first of them (narrower than 0x80-0xBF for the
        // lead bytes that could otherwise produce invalid code points).
        int trailing = 0;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        if(0xC2 <= lead && lead <= 0xDF)
        {
            // non-overlong 2-byte
            trailing = 1;
        }
        else if(0xE0 <= lead && lead <= 0xEF)
        {
            trailing = 2;
            if(lead == 0xE0)
            {
                second_min = 0xA0; // excluding overlongs
            }
            else if(lead == 0xED)
            {
                second_max = 0x9F; // excluding surrogates
            }
        }
        else if(0xF0 <= lead && lead <= 0xF4)
        {
            trailing = 3;
            if(lead == 0xF0)
            {
                second_min = 0x90; // planes 1-3, excluding overlongs
            }
            else if(lead == 0xF4)
            {
                second_max = 0x8F; // plane 16, nothing above U+10FFFF
            }
        }
        else
        {
            // Rejected ASCII, stray continuation byte, 0xC0/0xC1, 0xF5-0xFF.
            return false;
        }

        // Make sure all trailing bytes are inside the buffer before reading
        // them; a sequence cut off by the end of the buffer is invalid.
        if(end - bytes <= trailing)
        {
            return false;
        }
        if(bytes[1] < second_min || second_max < bytes[1])
        {
            return false;
        }
        for(int i = 2; i <= trailing; ++i)
        {
            if((bytes[i] & 0xC0) != 0x80)
            {
                return false;
            }
        }
        bytes += trailing + 1;
    }
    return true;
}
歡迎指正!