<?php
/**
 * Sensitive-word filtering library.
 * User: wanghui
 * Date: 17/3/9
 * Time: 9:11 AM
 */

namespace addons\cms\library;

/**
 * Detects and replaces sensitive words using a character tree built from HashMap nodes.
 *
 * Each node maps a single UTF-8 character to a child node and carries an
 * 'ending' flag that is true when the path from the root spells a complete
 * sensitive word. HashMap is resolved from the current namespace; only its
 * get() and put() methods are used here.
 *
 * The tree must be built with setTree() or setTreeByFile() before calling
 * getBadWord(), replace() or islegal() on non-empty content.
 */
class SensitiveHelper
{
    /**
     * Length, in UTF-8 characters, of the content most recently scanned.
     *
     * @var int
     */
    protected $contentLength = 0;

    /**
     * Shared singleton instance.
     *
     * @var object|null
     */
    private static $_instance = null;

    /**
     * Root node of the sensitive-word tree.
     *
     * @var HashMap|null
     */
    protected $wordTree = null;

    /**
     * Optional pre-computed list of sensitive words; when non-empty,
     * replace() uses it instead of scanning the content.
     *
     * @var array|null
     */
    protected static $badWordList = null;

    /**
     * Return the shared instance, creating it on first use.
     *
     * @return self
     */
    public static function init()
    {
        if (!self::$_instance instanceof self) {
            self::$_instance = new self();
        }
        return self::$_instance;
    }

    /**
     * Build the sensitive-word tree from a file containing one word per line.
     *
     * Each line is trimmed; blank lines are ignored.
     *
     * @param string $filepath Path to the word-list file.
     * @return $this
     * @throws \Exception When the path is not an existing regular file or cannot be opened.
     */
    public function setTreeByFile($filepath = '')
    {
        // is_file() rather than file_exists(): a directory is not a usable word list.
        if (!is_file($filepath)) {
            throw new \Exception('词库文件不存在');
        }

        // Start from an empty tree so repeated calls do not accumulate words.
        $this->wordTree = new HashMap();

        foreach ($this->yieldToReadFile($filepath) as $word) {
            $this->buildWordToTree(trim($word));
        }

        return $this;
    }

    /**
     * Build the sensitive-word tree from an array of words.
     *
     * @param array|null $sensitiveWords List of sensitive words.
     * @return $this
     * @throws \Exception When the list is empty.
     */
    public function setTree($sensitiveWords = null)
    {
        if (empty($sensitiveWords)) {
            throw new \Exception('词库不能为空');
        }

        $this->wordTree = new HashMap();

        foreach ($sensitiveWords as $word) {
            $this->buildWordToTree($word);
        }

        return $this;
    }

    /**
     * Collect the sensitive words that occur in the content.
     *
     * The content is scanned left to right; after a match the scan resumes
     * right behind the matched word. A word occurring several times is
     * reported once per occurrence.
     *
     * @param string $content   Text to inspect.
     * @param int    $matchType 1 (default) stops at the shortest word found at
     *                          each position; any other value keeps walking and
     *                          reports the longest word found at that position.
     * @param int    $wordNum   Maximum number of words to return; 0 returns all.
     * @return array List of matched words in order of appearance.
     */
    public function getBadWord($content, $matchType = 1, $wordNum = 0)
    {
        $this->contentLength = mb_strlen($content, 'utf-8');
        $shortestOnly = (1 === $matchType);
        $badWordList = array();

        for ($start = 0; $start < $this->contentLength; $start++) {
            $matched = $this->matchLengthAt($content, $start, $this->contentLength, $shortestOnly);
            if ($matched <= 0) {
                continue;
            }

            $badWordList[] = mb_substr($content, $start, $matched, 'utf-8');

            // Stop early once the requested number of words has been collected.
            if ($wordNum > 0 && count($badWordList) == $wordNum) {
                return $badWordList;
            }

            // Resume right behind the matched word (the loop adds the final +1).
            $start += $matched - 1;
        }

        return $badWordList;
    }

    /**
     * Replace the sensitive words found in the content.
     *
     * When $sTag or $eTag is truthy each word is wrapped as $sTag . word . $eTag;
     * otherwise each word is replaced by $replaceChar. All replacements are
     * applied in a single pass, so a word that occurs several times is wrapped
     * only once per occurrence and replacement text is never re-scanned.
     *
     * @param string $content     Text to filter.
     * @param string $replaceChar Replacement used when no tags are given.
     * @param string $sTag        Text inserted before each sensitive word.
     * @param string $eTag        Text inserted after each sensitive word.
     * @param int    $matchType   Matching rule passed to getBadWord().
     * @return string The filtered content.
     * @throws \Exception When the content is empty.
     */
    public function replace($content, $replaceChar = '', $sTag = '', $eTag = '', $matchType = 1)
    {
        // The string '0' is real content even though empty() treats it as empty.
        if (empty($content) && '0' !== $content) {
            throw new \Exception('请填写检测的内容');
        }

        if (empty(self::$badWordList)) {
            $badWordList = $this->getBadWord($content, $matchType);
        } else {
            $badWordList = self::$badWordList;
        }

        // Nothing detected: hand the content back untouched.
        if (empty($badWordList)) {
            return $content;
        }

        $useTags = ($sTag || $eTag);
        $pairs = array();
        foreach ($badWordList as $badWord) {
            $badWord = (string) $badWord;
            if ('' === $badWord) {
                // strtr() rejects empty keys on older PHP versions.
                continue;
            }
            // Keying by word removes duplicates from repeated occurrences.
            $pairs[$badWord] = $useTags ? $sTag . $badWord . $eTag : $replaceChar;
        }

        if (empty($pairs)) {
            return $content;
        }

        // strtr() tries the longest keys first and never re-scans text it has
        // already replaced, which prevents nested or corrupted tags.
        return strtr($content, $pairs);
    }

    /**
     * Tell whether the content is free of sensitive words.
     *
     * @param string $content Text to inspect.
     * @return bool true when no sensitive word occurs, false otherwise.
     */
    public function islegal($content)
    {
        $this->contentLength = mb_strlen($content, 'utf-8');

        // Every start offset is tested: a partial match that never reaches a
        // word ending must not hide a word beginning inside it.
        for ($start = 0; $start < $this->contentLength; $start++) {
            if ($this->matchLengthAt($content, $start, $this->contentLength, true) > 0) {
                return false;
            }
        }

        return true;
    }

    /**
     * Lazily read a file line by line.
     *
     * @param string $filepath Path of the file to read.
     * @return \Generator Yields each line as returned by fgets().
     * @throws \Exception When the file cannot be opened.
     */
    protected function yieldToReadFile($filepath)
    {
        $fp = fopen($filepath, 'r');
        if (false === $fp) {
            throw new \Exception('词库文件读取失败');
        }

        try {
            // fgets() returns false at end of file or on a read error.
            while (false !== ($line = fgets($fp))) {
                yield $line;
            }
        } finally {
            // Runs even when the consumer stops iterating early.
            fclose($fp);
        }
    }

    /**
     * Insert a single sensitive word into the tree.
     *
     * @param string $word Word to add; the empty string is ignored.
     * @return void
     */
    protected function buildWordToTree($word = '')
    {
        if ('' === $word) {
            return;
        }

        $tree = $this->wordTree;
        $wordLength = mb_strlen($word, 'utf-8');

        for ($i = 0; $i < $wordLength; $i++) {
            $keyChar = mb_substr($word, $i, 1, 'utf-8');

            // Reuse the child node for this character when it already exists.
            $tempTree = $tree->get($keyChar);
            if ($tempTree) {
                $tree = $tempTree;
            } else {
                // New nodes start out as "not the end of a word".
                $newTree = new HashMap();
                $newTree->put('ending', false);

                $tree->put($keyChar, $newTree);
                $tree = $newTree;
            }

            // The node of the last character marks a complete word.
            if ($i == $wordLength - 1) {
                $tree->put('ending', true);
            }
        }
    }

    /**
     * Length of the sensitive word that starts at a given offset.
     *
     * Walks the tree character by character from $start and remembers the
     * length at the most recent node whose 'ending' flag is not false, so
     * characters walked past the last complete word are not counted.
     *
     * @param string $content       Text being scanned.
     * @param int    $start         Character offset at which the walk begins.
     * @param int    $contentLength Length of $content in UTF-8 characters.
     * @param bool   $shortestOnly  Stop at the first complete word when true.
     * @return int Matched length in characters, or 0 when no word starts here.
     */
    protected function matchLengthAt($content, $start, $contentLength, $shortestOnly)
    {
        $node = $this->wordTree;
        $matched = 0;

        for ($i = $start; $i < $contentLength; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));

            // No child for this character: the walk ends here.
            if (empty($node)) {
                break;
            }

            if (false === $node->get('ending')) {
                continue;
            }

            $matched = $i - $start + 1;
            if ($shortestOnly) {
                break;
            }
        }

        return $matched;
    }
}
 |