123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295 |
- <?php
- /**
- * 敏感词类库.
- * User: Lustre
- * Date: 17/3/9
- * Time: 上午9:11
- */
- namespace util;
-
/**
 * Sensitive-word filter backed by a character tree (trie) of HashMap nodes.
 *
 * Each node maps a single UTF-8 character to a child node and carries an
 * 'ending' flag that is true when the path from the root spells a complete
 * dictionary word. Build the tree with setTree() or setTreeByFile(), then
 * scan text with getBadWord(), islegal(), replace() or mark().
 */
class SensitiveHelper
{
    /**
     * Length, in UTF-8 characters, of the content most recently passed to
     * getBadWord() or islegal().
     *
     * @var int
     */
    protected $contentLength = 0;

    /**
     * Shared instance returned by init().
     *
     * @var self|null
     */
    private static $_instance = null;

    /**
     * Root node of the sensitive-word tree. Stays null until setTree() or
     * setTreeByFile() has been called; scanning non-empty content before
     * that dereferences null, so build the tree first.
     *
     * @var HashMap|null
     */
    protected $wordTree = null;

    /**
     * Optional pre-computed list of sensitive words. Nothing in this class
     * assigns it; when it holds a non-empty value, replace() and mark() use
     * it instead of scanning the content they were given.
     *
     * @var array|null
     */
    protected static $badWordList = null;

    /**
     * Return the shared instance, creating it on first use.
     *
     * @return self
     */
    public static function init()
    {
        if (!self::$_instance instanceof self) {
            self::$_instance = new self();
        }

        return self::$_instance;
    }

    /**
     * Add every line of a dictionary file to the word tree.
     *
     * Lines are trimmed and blank lines are skipped. Unlike setTree(), an
     * existing tree is kept and extended rather than replaced.
     *
     * @param string $filepath path to a file holding one word per line
     * @return $this
     * @throws \Exception when the path is missing, is a directory, or cannot be opened
     */
    public function setTreeByFile($filepath = '')
    {
        // A directory passes file_exists() but can never be read as a word list.
        if (!file_exists($filepath) || is_dir($filepath)) {
            throw new \Exception('词库文件不存在');
        }

        $this->wordTree = $this->wordTree ?: new HashMap();
        foreach ($this->yieldToReadFile($filepath) as $word) {
            $this->buildWordToTree(trim($word));
        }

        return $this;
    }

    /**
     * Replace the word tree with one built from the given words.
     *
     * Words are inserted as given (no trimming); empty strings are skipped.
     *
     * @param array|null $sensitiveWords list of sensitive words
     * @return $this
     * @throws \Exception when the list is empty
     */
    public function setTree($sensitiveWords = null)
    {
        if (empty($sensitiveWords)) {
            throw new \Exception('词库不能为空');
        }

        $this->wordTree = new HashMap();
        foreach ($sensitiveWords as $word) {
            $this->buildWordToTree($word);
        }

        return $this;
    }

    /**
     * Collect the sensitive words found in a piece of text, in order of
     * appearance. A word occurring several times is listed several times.
     * Scanning resumes right after each match, so matches never overlap.
     *
     * @param string $content   text to scan
     * @param int    $matchType exactly 1 = shortest word at each position (default);
     *                          any other value = longest word at each position
     * @param int    $wordNum   stop after this many words; 0 (default) = no limit
     * @return array list of matched words
     */
    public function getBadWord($content, $matchType = 1, $wordNum = 0)
    {
        $this->contentLength = mb_strlen($content, 'utf-8');
        $badWordList = [];

        $position = 0;
        while ($position < $this->contentLength) {
            $matched = $this->matchLengthAt($content, $position, $this->contentLength, $matchType);
            if ($matched <= 0) {
                $position++;
                continue;
            }

            $badWordList[] = mb_substr($content, $position, $matched, 'utf-8');
            if ($wordNum > 0 && count($badWordList) >= $wordNum) {
                return $badWordList;
            }

            // Continue with the first character after the matched word.
            $position += $matched;
        }

        return $badWordList;
    }

    /**
     * Replace every detected sensitive word in the text.
     *
     * All replacements happen in a single pass: text that has already been
     * substituted is never scanned again, and when two detected words start
     * at the same position the longer one is replaced.
     *
     * @param string $content     text to filter
     * @param string $replaceChar replacement text
     * @param bool   $repeat      true = repeat $replaceChar once per character of the word
     * @param int    $matchType   see getBadWord()
     * @return mixed the filtered text, or $content untouched when nothing was found
     * @throws \Exception when no content is given
     */
    public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
    {
        $this->assertContentGiven($content);

        $badWordList = $this->resolveBadWords($content, $matchType);
        if (empty($badWordList)) {
            return $content;
        }

        $pairs = [];
        foreach ($badWordList as $badWord) {
            if ('' === (string) $badWord) {
                continue; // strtr() rejects empty keys on PHP < 8
            }
            $pairs[$badWord] = $repeat
                ? $this->dfaBadWordConversChars($badWord, $replaceChar)
                : $replaceChar;
        }

        return strtr($content, $pairs);
    }

    /**
     * Wrap every detected sensitive word in the given tags.
     *
     * Each occurrence is wrapped exactly once, even when the same word
     * appears several times or one detected word contains another. The
     * content itself is not escaped here.
     *
     * @param string $content   text to annotate
     * @param string $sTag      opening tag, e.g. <mark>
     * @param string $eTag      closing tag, e.g. </mark>
     * @param int    $matchType see getBadWord()
     * @return mixed the annotated text, or $content untouched when nothing was found
     * @throws \Exception when no content is given
     */
    public function mark($content, $sTag, $eTag, $matchType = 1)
    {
        $this->assertContentGiven($content);

        $badWordList = $this->resolveBadWords($content, $matchType);
        if (empty($badWordList)) {
            return $content;
        }

        $pairs = [];
        foreach ($badWordList as $badWord) {
            if ('' === (string) $badWord) {
                continue; // strtr() rejects empty keys on PHP < 8
            }
            $pairs[$badWord] = $sTag . $badWord . $eTag;
        }

        return strtr($content, $pairs);
    }

    /**
     * Report whether the text contains at least one sensitive word.
     *
     * NOTE: despite the method name, the return value is true when a
     * sensitive word IS present and false when the text is clean. That
     * contract is kept as-is because existing callers depend on it.
     *
     * @param string $content text to scan
     * @return bool true when a sensitive word was found
     */
    public function islegal($content)
    {
        $this->contentLength = mb_strlen($content, 'utf-8');

        // Every start position has to be tried: a failed partial match at one
        // position says nothing about words that begin inside that span.
        for ($start = 0; $start < $this->contentLength; $start++) {
            if ($this->matchLengthAt($content, $start, $this->contentLength, 1) > 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Lazily yield the lines of a file, closing the handle when iteration
     * ends for any reason (completion, exception, or an abandoned generator).
     *
     * @param string $filepath file to read
     * @return \Generator yields each line, line ending included
     * @throws \Exception when the file cannot be opened
     */
    protected function yieldToReadFile($filepath)
    {
        $fp = fopen($filepath, 'r');
        if (false === $fp) {
            throw new \Exception('词库文件无法打开');
        }

        try {
            // Loop on fgets() itself so the false returned at end of file is never yielded.
            while (false !== ($line = fgets($fp))) {
                yield $line;
            }
        } finally {
            fclose($fp);
        }
    }

    /**
     * Insert a single word into the tree, one character per level.
     *
     * @param string $word word to insert; an empty word is ignored
     * @return void
     */
    protected function buildWordToTree($word = '')
    {
        $word = (string) $word;
        if ('' === $word) {
            return;
        }

        $tree = $this->wordTree;
        $wordLength = mb_strlen($word, 'utf-8');
        for ($i = 0; $i < $wordLength; $i++) {
            $keyChar = mb_substr($word, $i, 1, 'utf-8');

            $child = $tree->get($keyChar);
            if (!$child) {
                // New branch: it is not a complete word until marked below.
                $child = new HashMap();
                $child->put('ending', false);
                $tree->put($keyChar, $child);
            }
            $tree = $child;
        }

        // Flag the node reached by the last character as a complete word.
        if ($wordLength > 0) {
            $tree->put('ending', true);
        }
    }

    /**
     * Build a mask made of $char repeated once per character of $word.
     *
     * @param string $word word being masked
     * @param string $char text to repeat
     * @return string
     */
    protected function dfaBadWordConversChars($word, $char)
    {
        return str_repeat($char, mb_strlen($word, 'utf-8'));
    }

    /**
     * Length, in characters, of the sensitive word that starts at $start,
     * or 0 when no dictionary word starts there.
     *
     * @param string $content   text being scanned
     * @param int    $start     character offset to start from
     * @param int    $length    character length of $content
     * @param int    $matchType exactly 1 = stop at the first complete word;
     *                          otherwise keep going and return the longest one
     * @return int
     */
    private function matchLengthAt($content, $start, $length, $matchType)
    {
        $matched = 0;
        $node = $this->wordTree;

        for ($i = $start; $i < $length; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));
            if (empty($node)) {
                break;
            }
            if (false === $node->get('ending')) {
                continue;
            }

            // Only a position that completes a word counts; characters walked
            // past the last complete word must not be included in the match.
            $matched = $i - $start + 1;
            if (1 === $matchType) {
                break;
            }
        }

        return $matched;
    }

    /**
     * Sensitive words to act on: the static override when one has been set,
     * otherwise a fresh scan of the content.
     *
     * @param string $content   text to scan
     * @param int    $matchType see getBadWord()
     * @return array
     */
    private function resolveBadWords($content, $matchType)
    {
        return self::$badWordList ?: $this->getBadWord($content, $matchType);
    }

    /**
     * Reject missing content.
     *
     * @param mixed $content
     * @return void
     * @throws \Exception when the content is empty
     */
    private function assertContentGiven($content)
    {
        // empty() alone would also reject "0", which is legitimate text.
        if (empty($content) && !is_numeric($content)) {
            throw new \Exception('请填写检测的内容');
        }
    }
}
|