A trie (prefix tree) implemented in PHP. 使用场景:中文/英文的敏感词过滤、关键词过滤(字典树、前缀树)、内链建设、搜索提示。
composer require abelzhou/php-trie-tree
- 需要swoole扩展,直接运行swoole_server.php
- 测试内容为5000个人名
test目录下有约1.5万个敏感词。
mac下检索耗时2~5毫秒左右
这些敏感词来自网络,不是很全。
感知提取:50个词以内耗时在1ms以内,500个词耗时在3ms左右。
最近在尝试改成AC自动机,但中英文差异比较大。英文单节点的变化度可控(26个字母+10个数字),所以比较适合AC自动机;中文则会带来额外的性能开销,所以暂时不采用该算法。
// Usage example: build a trie from a list of names, then search it, delete
// from it, and query it by prefix.
$testArr = array("张三", "张四", "王五", "张大宝", "张三四", "张氏家族", "王二麻子");
$tree = new \AbelZhou\Tree\TrieTree();
foreach ($testArr as $str) {
    $tree->append($str);
}

// Dump the tree built from the words above.
$res = $tree->getTree();
var_dump($res);

// Search a text that contains one of the appended words ("张三").
$res = $tree->search("有一个叫张三的哥们");
var_dump($res);

// Search a text that contains none of the appended words.
$res = $tree->search("我叫李四喜");
var_dump($res);

// Delete a single word.
$res = $tree->delete("张三");
// Delete the whole subtree: removes "张三" together with "张三四" beneath it.
$tree->delete("张三", true);

// Pinyin detection: two entries are appended under the same key "zhangsan",
// distinguished by the fourth argument.
// NOTE(review): the exact meaning of the 2nd/3rd arguments ("" and true) is
// not visible here — confirm against TrieTree::append().
$tree->append("zhangsan", "", true, "张三");
$tree->append("zhangsan", "", true, "张伞");

// Time a lookup by the prefix "zh".
$t1 = microtime(true);
var_dump($tree->getTreeWord("zh"));
$t2 = microtime(true);
echo 'getTreeWordPinyin{' . ($t2 - $t1) . '}s' . PHP_EOL;

// Replace & delete: append three entries under the same key "z".
$tree->append("z", "", true, "在");
$tree->append("z", "", true, "走");
$tree->append("z", "", true, "做");
// NOTE(review): the second argument (4) is presumably a result limit — confirm
// against TrieTree::getTreeWord().
var_dump($tree->getTreeWord("z", 4));

// Overwrite: append "做" under "z" again, this time with array data.
$tree->append("z", array("1" => 1), true, "做");
var_dump($tree->getTreeWord("z", 4));

// Delete the entries under "z" one at a time, dumping the remaining words.
$tree->delete("z", false, true, "在");
var_dump($tree->getTreeWord("z", 4));
$tree->delete("z", false, true, "走");
$tree->delete("z", false, true, "做");
var_dump($tree->getTreeWord("z", 4));