Sin descripción
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

SensitiveHelper.php 8.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. <?php
  2. /**
  3. * 敏感词类库.
  4. * User: Lustre
  5. * Date: 17/3/9
  6. * Time: 上午9:11
  7. */
  8. namespace util;
  /**
   * Sensitive-word detector backed by a character trie of HashMap nodes.
   *
   * Every node is a HashMap keyed by a single character; the reserved key
   * 'ending' holds true on nodes that terminate a dictionary word. A tree must
   * be built with setTree() or setTreeByFile() before any content is scanned.
   */
  class SensitiveHelper
  {
      /**
       * Character length (UTF-8) of the content most recently scanned.
       *
       * @var int
       */
      protected $contentLength = 0;

      /**
       * Shared instance handed out by init().
       *
       * @var self|null
       */
      private static $_instance = null;

      /**
       * Root node of the sensitive-word trie.
       *
       * @var HashMap|null
       */
      protected $wordTree = null;

      /**
       * Optional pre-computed list of sensitive words. When truthy, replace()
       * and mark() use it instead of scanning the content. Nothing in this
       * class assigns it.
       *
       * @var array|null
       */
      protected static $badWordList = null;

      /**
       * Return the shared instance, creating it on first use.
       *
       * @return self
       */
      public static function init()
      {
          if (!self::$_instance instanceof self) {
              self::$_instance = new self();
          }

          return self::$_instance;
      }

      /**
       * Add the words of a dictionary file (one word per line) to the trie.
       *
       * An existing trie is kept and extended; each line is trimmed and blank
       * lines are ignored.
       *
       * @param string $filepath path of the dictionary file
       * @return $this
       * @throws \Exception when the path is not a regular file or cannot be opened
       */
      public function setTreeByFile($filepath = '')
      {
          // is_file() also rejects directories, which file_exists() accepts.
          if (!is_file($filepath)) {
              throw new \Exception('词库文件不存在');
          }

          $this->wordTree = $this->wordTree ?: new HashMap();

          foreach ($this->yieldToReadFile($filepath) as $word) {
              $this->buildWordToTree(trim($word));
          }

          return $this;
      }

      /**
       * Build a fresh trie from a list of words, discarding any existing trie.
       *
       * @param array|null $sensitiveWords words to load
       * @return $this
       * @throws \Exception when the list is empty
       */
      public function setTree($sensitiveWords = null)
      {
          if (empty($sensitiveWords)) {
              throw new \Exception('词库不能为空');
          }

          $this->wordTree = new HashMap();

          foreach ($sensitiveWords as $word) {
              $this->buildWordToTree($word);
          }

          return $this;
      }

      /**
       * Collect the sensitive words that occur in the content, in order of appearance.
       *
       * The scan resumes right after each reported word, so reported words
       * never overlap. A word occurring several times is reported several times.
       *
       * @param string $content   text to scan
       * @param int    $matchType exactly 1 reports the shortest word at each
       *                          position; any other value reports the longest
       * @param int    $wordNum   stop after this many words; 0 means no limit
       * @return array list of matched words
       */
      public function getBadWord($content, $matchType = 1, $wordNum = 0)
      {
          $this->contentLength = mb_strlen($content, 'utf-8');
          $shortestOnly = (1 === $matchType);
          $badWordList = [];

          for ($start = 0; $start < $this->contentLength; $start++) {
              $matchedLength = $this->matchLengthAt($content, $start, $shortestOnly);
              if ($matchedLength <= 0) {
                  continue;
              }

              $badWordList[] = mb_substr($content, $start, $matchedLength, 'utf-8');

              if ($wordNum > 0 && count($badWordList) == $wordNum) {
                  return $badWordList;
              }

              // Resume after the matched word (the loop increment adds the final +1).
              $start += $matchedLength - 1;
          }

          return $badWordList;
      }

      /**
       * Replace every sensitive word found in the content.
       *
       * @param string $content     text to filter
       * @param string $replaceChar replacement text
       * @param bool   $repeat      true repeats $replaceChar once per character
       *                            of the word; false substitutes it once per word
       * @param int    $matchType   see getBadWord()
       * @return string filtered content
       * @throws \Exception when the content is empty
       */
      public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
      {
          // empty() alone would also reject the legitimate text "0".
          if (empty($content) && !is_numeric($content)) {
              throw new \Exception('请填写检测的内容');
          }

          $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
          if (empty($badWordList)) {
              return $content;
          }

          $replacements = [];
          foreach ($badWordList as $badWord) {
              $replacements[$badWord] = $repeat
                  ? $this->dfaBadWordConversChars($badWord, $replaceChar)
                  : $replaceChar;
          }

          // One pass, longest key first, never rescanning replaced text: a short
          // word cannot corrupt a longer one and replacement output is not re-matched.
          return strtr($content, $replacements);
      }

      /**
       * Wrap every sensitive word found in the content with the given tags.
       *
       * @param string $content   text to annotate
       * @param string $sTag      opening tag, e.g. <mark>
       * @param string $eTag      closing tag, e.g. </mark>
       * @param int    $matchType see getBadWord()
       * @return string annotated content
       * @throws \Exception when the content is empty
       */
      public function mark($content, $sTag, $eTag, $matchType = 1)
      {
          // empty() alone would also reject the legitimate text "0".
          if (empty($content) && !is_numeric($content)) {
              throw new \Exception('请填写检测的内容');
          }

          $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
          if (empty($badWordList)) {
              return $content;
          }

          $replacements = [];
          foreach ($badWordList as $badWord) {
              $replacements[$badWord] = $sTag . $badWord . $eTag;
          }

          // Single pass: a word reported twice is wrapped once per occurrence
          // instead of being wrapped again inside already-inserted tags.
          return strtr($content, $replacements);
      }

      /**
       * Tell whether the content contains at least one sensitive word.
       *
       * NOTE: despite the method name, true means a sensitive word WAS found.
       * The return value is kept as-is for existing callers.
       *
       * @param string $content text to scan
       * @return bool true when a sensitive word occurs, false otherwise
       */
      public function islegal($content)
      {
          $this->contentLength = mb_strlen($content, 'utf-8');

          // Every start position is tried; skipping ahead after a partial match
          // would miss words that begin inside the partially matched prefix.
          for ($start = 0; $start < $this->contentLength; $start++) {
              if ($this->matchLengthAt($content, $start, true) > 0) {
                  return true;
              }
          }

          return false;
      }

      /**
       * Lazily yield the lines of a file and close the handle when iteration ends.
       *
       * @param string $filepath path of the file to read
       * @return \Generator yields each line as returned by fgets()
       * @throws \Exception when the file cannot be opened
       */
      protected function yieldToReadFile($filepath)
      {
          $fp = fopen($filepath, 'r');
          if (false === $fp) {
              throw new \Exception('词库文件无法打开');
          }

          try {
              while (false !== ($line = fgets($fp))) {
                  yield $line;
              }
          } finally {
              // Also runs when the consumer abandons the generator early.
              fclose($fp);
          }
      }

      /**
       * Insert one word into the trie, creating missing nodes along its path.
       *
       * @param string $word word to insert; the empty string is ignored
       * @return void
       */
      protected function buildWordToTree($word = '')
      {
          if ('' === $word) {
              return;
          }

          $node = $this->wordTree;
          $wordLength = mb_strlen($word, 'utf-8');

          for ($i = 0; $i < $wordLength; $i++) {
              $keyChar = mb_substr($word, $i, 1, 'utf-8');
              $child = $node->get($keyChar);

              if (!$child) {
                  // New nodes start as non-terminal.
                  $child = new HashMap();
                  $child->put('ending', false);
                  $node->put($keyChar, $child);
              }

              $node = $child;
          }

          // Flag the node of the last character as a word end.
          if ($wordLength > 0) {
              $node->put('ending', true);
          }
      }

      /**
       * Build a mask by repeating $char once per character of $word.
       *
       * @param string $word word being masked
       * @param string $char text repeated for each character
       * @return string
       */
      protected function dfaBadWordConversChars($word, $char)
      {
          return str_repeat($char, mb_strlen($word, 'utf-8'));
      }

      /**
       * Length in characters of the dictionary word starting at $start, or 0.
       *
       * Relies on $this->contentLength holding the length of $content.
       *
       * @param string $content      text being scanned
       * @param int    $start        character offset at which the word must begin
       * @param bool   $stopAtFirst  true returns the shortest word, false the longest
       * @return int
       */
      private function matchLengthAt($content, $start, $stopAtFirst)
      {
          $node = $this->wordTree;
          $matchedLength = 0;

          for ($i = $start; $i < $this->contentLength; $i++) {
              $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));
              if (empty($node)) {
                  break;
              }

              if (false === $node->get('ending')) {
                  continue;
              }

              // Only positions that end a word count; characters consumed past
              // the last word end must not extend the reported match.
              $matchedLength = $i - $start + 1;
              if ($stopAtFirst) {
                  break;
              }
          }

          return $matchedLength;
      }
  }