// 探测输入字符串是否为UTF8编码
// 作者:互联网
#include <windows.h>
#include <crtdbg.h>
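/*
UTF-8 encoding rules (B = payload bit)
1 byte  0BBBBBBB
2 bytes 110BBBBB 10BBBBBB
3 bytes 1110BBBB 10BBBBBB 10BBBBBB
4 bytes 11110BBB 10BBBBBB 10BBBBBB 10BBBBBB
5 bytes 111110BB 10BBBBBB 10BBBBBB 10BBBBBB 10BBBBBB
6 bytes 1111110B 10BBBBBB 10BBBBBB 10BBBBBB 10BBBBBB 10BBBBBB
Note: the 5- and 6-byte forms come from the original UTF-8 definition
(RFC 2279). RFC 3629 restricts UTF-8 to at most 4 bytes per code point
(U+0000..U+10FFFF), so the 5- and 6-byte forms are obsolete.
*/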
/**
 * Checks whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * Validation follows RFC 3629 / the Unicode "well-formed UTF-8 byte
 * sequences" table, so in addition to the basic
 * "lead byte + 10xxxxxx continuation bytes" shape it rejects:
 *   - overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F),
 *   - UTF-16 surrogates U+D800..U+DFFF (ED A0..BF),
 *   - code points above U+10FFFF (F4 90.. and lead bytes F5..F7),
 *   - the obsolete 5- and 6-byte forms (lead bytes F8..FD),
 *   - stray continuation bytes and sequences cut short by the terminator.
 *
 * @param pString  NUL-terminated string to test; may be NULL.
 * @return true if every byte up to the terminator belongs to a well-formed
 *         UTF-8 sequence. NULL and the empty string yield true. Pure ASCII
 *         text is valid UTF-8 and therefore also yields true.
 */
bool IsUtf8String(const char *pString)
{
    /* Work on unsigned bytes so range comparisons behave the same whether
       plain char is signed or unsigned. */
    const unsigned char *p = (const unsigned char *)pString;

    if (p == NULL)
    {
        return true; /* a NULL pointer is treated like an empty string */
    }

    while (*p != 0)
    {
        unsigned char lead = *p++;
        /* Allowed range for the NEXT continuation byte. Only the first
           continuation byte after certain lead bytes is narrower than the
           generic 80..BF range. */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        int trail;

        if (lead < 0x80)
        {
            continue; /* 1-byte sequence (ASCII) */
        }
        else if (lead >= 0xC2 && lead <= 0xDF)
        {
            trail = 1; /* C0/C1 are excluded: they could only be overlong */
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trail = 2;
            if (lead == 0xE0)
            {
                lo = 0xA0; /* below A0 would be an overlong 3-byte form */
            }
            else if (lead == 0xED)
            {
                hi = 0x9F; /* above 9F would encode a UTF-16 surrogate */
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trail = 3;
            if (lead == 0xF0)
            {
                lo = 0x90; /* below 90 would be an overlong 4-byte form */
            }
            else if (lead == 0xF4)
            {
                hi = 0x8F; /* above 8F would exceed U+10FFFF */
            }
        }
        else
        {
            /* Stray continuation byte (80..BF), C0/C1, or F5..FF. */
            return false;
        }

        for (; trail > 0; trail--)
        {
            unsigned char c = *p;
            /* The terminator (0x00) is below lo, so a truncated sequence
               fails here and nothing past the NUL is ever read. */
            if (c < lo || c > hi)
            {
                return false;
            }
            p++;
            /* Every continuation byte after the first uses the full range. */
            lo = 0x80;
            hi = 0xBF;
        }
    }

    return true;
}
// Self-check: converts a wide-character sample to UTF-8 and asserts that
// IsUtf8String accepts the converted bytes but rejects the raw wide-character
// bytes reinterpreted as a char string.
int _tmain(int argc, _TCHAR* argv[])
{
    // Wide-character sample text and a zero-filled buffer for its UTF-8 form.
    LPCWSTR wideSample = L"中国";
    CHAR utf8Sample[16] = {0};

    // A length of -1 tells the API the input is NUL-terminated.
    WideCharToMultiByte(CP_UTF8, 0, wideSample, -1,
                        utf8Sample, sizeof(utf8Sample), NULL, NULL);

    // The wide-character bytes viewed as chars must not pass as UTF-8.
    _ASSERT(!IsUtf8String((CHAR*)wideSample));
    // The converted buffer must pass.
    _ASSERT(IsUtf8String(utf8Sample));

    return 0;
}
// 标签:编码,iStrLen,字节,UTF8,10BBBBBB,pString,字符串,bRet 来源: https://blog.csdn.net/zgl7903/article/details/119908116