程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> C語言 >> C++ >> C++入門知識 >> C++ 敏感字符過濾

C++ 敏感字符過濾

編輯:C++入門知識

C++ 敏感字符過濾


WordNode.h
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__


#include <map>
#include <string>


/**
 * One node of the sensitive-word prefix tree.
 *
 * A node stores the character it stands for (m_cWord, kept as a string so a
 * multi-byte character fits), a flag telling whether a sensitive word ends at
 * this node (m_nEndTag, 1 = word ends here) and its children, keyed by the
 * child's character.
 *
 * Ownership: a node owns every child pointer stored in m_mapWordNodes and
 * deletes them in its destructor and in Reset(); children must therefore be
 * allocated with `new`. Copying is disabled because a copy would delete the
 * same children a second time.
 */
class CWordNode
{
    typedef std::map<std::string, CWordNode*> umap;

public:
    /** Creates a leaf node for `word` with no children and a cleared end tag. */
    CWordNode(const std::string& word) { Reset(word); }

    /** Destroys the node together with the whole subtree below it. */
    ~CWordNode()
    {
        DeleteChildren();
        m_nEndTag = 0;
    }

    /**
     * Re-initialises the node as a childless node for `word`.
     * Existing children are deleted first so the old subtree is not leaked.
     */
    void Reset(const std::string& word)
    {
        DeleteChildren();
        m_cWord = word;
        m_nEndTag = 0;
    }

private:
    // Declared but not defined: the node owns raw pointers, copying is not allowed.
    CWordNode(const CWordNode&);
    CWordNode& operator=(const CWordNode&);

    /** Deletes every owned child and empties the child map. */
    void DeleteChildren()
    {
        for (umap::iterator it = m_mapWordNodes.begin(); it != m_mapWordNodes.end(); ++it)
        {
            delete it->second;
        }
        m_mapWordNodes.clear();
    }

public:
    std::string m_cWord;     // character represented by this node
    int m_nEndTag;           // 1 when a sensitive word ends at this node, otherwise 0
    umap m_mapWordNodes;     // owned children, keyed by their character
};


#endif // __TOOLS_WORDNODE_H_INCLUDE__


WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__


#include <list>
#include <string>
#include "WordNode.h"


class CWordsFilter
{
typedef std::map umap;
private:
std::list m_lsAllSensitiveWords; // 所有敏感詞列表
CWordNode* m_rootWordNode;
bool m_bIsInit;


public:
CWordsFilter();
~CWordsFilter();


static CWordsFilter& GetInstance(); // 獲取共享實例


void InitSensitiveWords(std::string strWord); // 初始化敏感詞集
void InitSensitiveWords(std::list lsAllSensitiveWords); // 初始化敏感詞集
std::string FilterSensitiveWords(const std::string& strContent); // 過濾敏感詞


private:
void BuildWordTree(); // 構建敏感詞樹
void InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex);
CWordNode* FindNode(CWordNode* pNode, const std::string& word);
int GetFirstBytes(const std::string& str); // 獲取字符串中的第一個字符字節長度


};


#endif // __TOOLS_WORDSFILTER_H_INCLUDE__


WordsFilter.cpp
#include <iostream>
#include <list>
#include <string>
#include <vector>
#include "WordsFilter.h"


// File-scope scratch value that holds a character's byte length; starts at 2.
// NOTE(review): this is a mutable global with external linkage. Any code that
// assigns it shares that state with every other caller, so such code is not
// safe for concurrent use — confirm nothing outside this file reads it before
// replacing it with local variables.
int nStep = 2;


/** Ordered list of tokens produced by StrSplit(). */
typedef std::vector<std::string> Tokens;

/**
 * Splits `src` into tokens.
 *
 * Every single character contained in `sep` acts as a delimiter (`sep` is a
 * set of delimiter characters, not one multi-character separator). Empty
 * tokens — produced by leading, trailing or consecutive delimiters — are
 * dropped.
 *
 * @param src text to split
 * @param sep set of delimiter characters; when empty, `src` is returned as a
 *            single token (if it is non-empty)
 * @return the non-empty tokens in their order of appearance
 */
Tokens StrSplit(const std::string &src, const std::string &sep)
{
    Tokens tokens;
    std::string current;

    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it)
    {
        if (sep.find(*it) != std::string::npos)
        {
            // Delimiter: close the token collected so far, if any.
            if (!current.empty())
            {
                tokens.push_back(current);
            }
            current.clear();
        }
        else
        {
            current += *it;
        }
    }

    // The last token has no trailing delimiter to close it.
    if (!current.empty())
    {
        tokens.push_back(current);
    }
    return tokens;
}


/**
 * Returns the byte length of the first UTF-8 character of `str`.
 *
 * Only the first byte is inspected. The lead-byte patterns recognised are the
 * 1- to 6-byte forms of the original UTF-8 scheme:
 *   0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3,
 *   11110xxx -> 4, 111110xx -> 5, 1111110x -> 6.
 * A byte that is not a valid lead byte (a continuation byte 10xxxxxx, or
 * 0xFE/0xFF) is treated as a 1-byte character so the caller always moves
 * forward by exactly the bytes it consumed instead of skipping ahead to a
 * later character.
 *
 * @param str text whose first character is measured
 * @return a length in [1, str.size()] for non-empty input (a truncated
 *         sequence is clamped to the bytes that exist); 1 for empty input
 */
int CWordsFilter::GetFirstBytes(const std::string& str)
{
    if (str.empty())
    {
        return 1; // minimum step, so callers that advance by the result still make progress
    }

    const unsigned char lead = static_cast<unsigned char>(str[0]);

    int nLength = 1; // ASCII, stray continuation byte, or invalid lead byte
    if ((lead >> 5) == 6)        // 110x xxxx
    {
        nLength = 2;
    }
    else if ((lead >> 4) == 14)  // 1110 xxxx
    {
        nLength = 3;
    }
    else if ((lead >> 3) == 30)  // 1111 0xxx
    {
        nLength = 4;
    }
    else if ((lead >> 2) == 62)  // 1111 10xx
    {
        nLength = 5;
    }
    else if ((lead >> 1) == 126) // 1111 110x
    {
        nLength = 6;
    }

    // A sequence cut off by the end of the string is only as long as what is left.
    if (static_cast<std::string::size_type>(nLength) > str.size())
    {
        nLength = static_cast<int>(str.size());
    }
    return nLength;
}


/**
 * Creates an uninitialised filter: empty word list, no word tree, and
 * m_bIsInit false.
 *
 * The initializers are listed in member declaration order, which is the order
 * in which they actually run. The word list needs no explicit clear: it is
 * default-constructed empty.
 */
CWordsFilter::CWordsFilter():
    m_rootWordNode(NULL),
    m_bIsInit(false)
{
}


/**
 * Releases the word tree and the stored word list.
 */
CWordsFilter::~CWordsFilter()
{
    // Deleting the root is enough: every CWordNode deletes its own children
    // in its destructor.
    delete m_rootWordNode;
    m_rootWordNode = NULL;

    m_lsAllSensitiveWords.clear();
}


void CWordsFilter::InitSensitiveWords(std::string strWord)
{
Tokens token = StrSplit(strWord, ",");
std::list lsAllSensitiveWords;
Tokens::iterator Ite = token.begin();
while (Ite != token.end())
{
lsAllSensitiveWords.push_back(*Ite);
++Ite;
}
InitSensitiveWords(lsAllSensitiveWords);
}


void CWordsFilter::InitSensitiveWords(std::list lsAllSensitiveWords)
{
std::cout << "start init sensitive words" << std::endl;
this->m_lsAllSensitiveWords.clear();
this->m_lsAllSensitiveWords = lsAllSensitiveWords;


BuildWordTree();
this->m_bIsInit = true;
}


/**
 * Returns a copy of `strContent` in which every sensitive word is masked.
 *
 * The text is scanned character by character (character lengths come from
 * GetFirstBytes()). At each position the word tree is walked from the root:
 *   - as soon as a node with m_nEndTag == 1 is reached, the matched
 *     characters are replaced by one '*' per character (not per byte) and
 *     scanning resumes right after the match — the shortest word starting at
 *     a position wins;
 *   - if the walk fails, or the text ends in the middle of a word, only the
 *     first character of the attempt is copied to the output and scanning
 *     restarts at the next character, so a shorter word that begins inside a
 *     failed attempt is still found.
 *
 * @param strContent text to filter
 * @return the filtered text, or an empty string (after printing a message to
 *         std::cout) when the word set has not been initialised
 */
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
    if (!this->m_bIsInit || NULL == this->m_rootWordNode)
    {
        std::cout << "the sensitive words is not init" << std::endl;
        return "";
    }

    // Longest character GetFirstBytes() can report; passing at most this many
    // bytes avoids copying the whole remaining text on every step.
    const std::string::size_type kMaxCharBytes = 6;
    const std::string::size_type nTotal = strContent.size();

    std::string strBuffer;
    strBuffer.reserve(nTotal);

    std::string::size_type nStart = 0; // first byte of the current match attempt
    while (nStart < nTotal)
    {
        CWordNode* pNode = this->m_rootWordNode;
        std::string::size_type nCursor = nStart; // next byte to examine in this attempt
        std::string::size_type nFirstLen = 0;    // byte length of the character at nStart
        std::string::size_type nChars = 0;       // characters matched so far in this attempt
        bool bMatched = false;

        while (nCursor < nTotal)
        {
            // Length of the character at nCursor, forced into [1, bytes left]
            // so a truncated trailing sequence can never index past the end.
            const int nBytes = GetFirstBytes(strContent.substr(nCursor, kMaxCharBytes));
            std::string::size_type nLen = (nBytes > 0) ? static_cast<std::string::size_type>(nBytes) : 1;
            if (nLen > nTotal - nCursor)
            {
                nLen = nTotal - nCursor;
            }
            if (nCursor == nStart)
            {
                nFirstLen = nLen;
            }

            pNode = FindNode(pNode, strContent.substr(nCursor, nLen));
            if (NULL == pNode)
            {
                break; // no sensitive word continues with this character
            }

            nCursor += nLen;
            ++nChars;
            if (pNode->m_nEndTag == 1)
            {
                bMatched = true;
                break;
            }
        }

        if (bMatched)
        {
            strBuffer.append(nChars, '*');
            nStart = nCursor;
        }
        else
        {
            // Failed or unfinished attempt: keep its first character and
            // retry from the following one.
            strBuffer.append(strContent, nStart, nFirstLen);
            nStart += nFirstLen;
        }
    }

    return strBuffer;
}


void CWordsFilter::BuildWordTree()
{
if ( this->m_rootWordNode == NULL )
{
this->m_rootWordNode = new CWordNode("R");
if (NULL == this->m_rootWordNode)
{
return;
}
}
this->m_rootWordNode->Reset("R");


std::list::const_iterator cIte = this->m_lsAllSensitiveWords.begin();
while (cIte != this->m_lsAllSensitiveWords.end())
{
std::string strTmp = (*cIte);


if (strTmp.size() > 0)
{
InsertNode(this->m_rootWordNode, strTmp, 0);
}


++cIte;
}
}


/**
 * Inserts the characters of `strContent` below `pNode`, one tree level per
 * character, creating missing nodes on the way.
 *
 * @param pNode      node under which the first character is inserted;
 *                   nothing happens when it is NULL
 * @param strContent the (remaining) word to insert; nothing happens when it
 *                   is empty
 * @param nIndex     compared with the number of bytes left after the first
 *                   character: when equal, the first character's node is
 *                   marked as the end of a word. With 0 — the value used by
 *                   BuildWordTree() and by the recursive call — this means
 *                   "the first character is the last one of the word".
 */
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
    if (NULL == pNode || strContent.empty())
    {
        return;
    }

    // Byte length of the first character, forced into [1, strContent.size()]
    // so a truncated trailing sequence cannot index past the end.
    const int nBytes = GetFirstBytes(strContent);
    std::string::size_type nLen = (nBytes > 0) ? static_cast<std::string::size_type>(nBytes) : 1;
    if (nLen > strContent.size())
    {
        nLen = strContent.size();
    }

    const std::string strChar = strContent.substr(0, nLen);

    // operator[] creates a NULL entry for a new key, so a single lookup
    // covers both "child exists" and "child must be created".
    CWordNode*& pChild = pNode->m_mapWordNodes[strChar];
    if (NULL == pChild)
    {
        pChild = new CWordNode(strChar);
    }

    const std::string::size_type nRest = strContent.size() - nLen;
    if (static_cast<std::string::size_type>(nIndex) == nRest)
    {
        pChild->m_nEndTag = 1;
    }

    if (nRest > 0)
    {
        InsertNode(pChild, strContent.substr(nLen), 0);
    }
}


/**
 * Looks up the direct child of `pNode` that is keyed by `word`.
 *
 * @param pNode parent node to search in; may be NULL
 * @param word  key of the wanted child
 * @return the child node, or NULL when `pNode` is NULL or has no such child
 */
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
    if (pNode != NULL)
    {
        const umap::iterator itChild = pNode->m_mapWordNodes.find(word);
        if (itChild != pNode->m_mapWordNodes.end())
        {
            return itChild->second;
        }
    }
    return NULL;
}


/**
 * Returns the shared filter instance.
 *
 * The instance is a function-local static, so it is constructed the first
 * time this function is called and the same object is returned afterwards.
 */
CWordsFilter& CWordsFilter::GetInstance()
{
    static CWordsFilter s_sharedFilter;
    return s_sharedFilter;
}

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved