Trie.php 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. <?php
  2. /**
  3. * Class Trie
  4. * 关键词过滤
  5. *
  6. * 使用方法
  7. * $trie = new Qii\Library\Trie();
  8. * $arr = file('test.txt');
  9. * $trie->setWords($arr);
  10. * $trie->search('过滤词aha');
  11. */
  12. namespace Qii\Library;
  13. class Trie
  14. {
  15. private $trie;
  16. public function __construct()
  17. {
  18. $this->trie = array('children' => array(), 'isWord' => false);
  19. }
  20. /**
  21. * 设置过滤的词
  22. *
  23. * @param string $word 过滤词
  24. */
  25. public function setWord($word)
  26. {
  27. $word = trim($word);
  28. $trieNode = &$this->trie;
  29. for ($i = 0; $i < strlen($word); $i++) {
  30. $character = $word[$i];
  31. if (!isset($trieNode['children'][$character])) {
  32. $trieNode['children'][$character] = array('isWord' => false);
  33. }
  34. if ($i == strlen($word) - 1) {
  35. $trieNode['children'][$character] = array('isWord' => true);
  36. }
  37. $trieNode = &$trieNode['children'][$character];
  38. }
  39. }
  40. /**
  41. * 设置过滤词
  42. *
  43. * @param string $words 过滤词
  44. */
  45. public function setWords($words)
  46. {
  47. if (!is_array($words)) {
  48. return;
  49. }
  50. foreach ($words as $word) {
  51. $this->setWord($word);
  52. }
  53. }
  54. /**
  55. * 是否是过滤的词
  56. *
  57. * @param string $word
  58. * @return bool
  59. */
  60. public function isWord($word)
  61. {
  62. $trieNode = &$this->trie;
  63. for ($i = 0; $i < strlen($word); $i++) {
  64. $character = $word[$i];
  65. if (!isset($trieNode['children'][$character])) {
  66. return false;
  67. } else {
  68. if ($i == (strlen($word) - 1) && $trieNode['children'][$character]['isWord'] == true) {
  69. return true;
  70. } elseif ($i == (strlen($word) - 1) && $trieNode['children'][$character]['isWord'] == false) {
  71. return false;
  72. }
  73. $trieNode = &$trieNode['children'][$character];
  74. }
  75. }
  76. }
  77. /**
  78. * 查找哪些词是在过滤列表中
  79. *
  80. * @param string $text
  81. * @return array
  82. */
  83. public function search($text = "")
  84. {
  85. $textLen = strlen($text);
  86. $trieNode = $tree = $this->trie;
  87. $find = array();
  88. $wordRootPosition = 0;
  89. $preNode = false;
  90. $word = '';
  91. for ($i = 0; $i < $textLen; $i++) {
  92. if (isset($trieNode['children'][$text[$i]])) {
  93. $word = $word . $text[$i];
  94. $trieNode = $trieNode['children'][$text[$i]];
  95. if ($preNode == false) {
  96. $wordRootPosition = $i;
  97. }
  98. $preNode = true;
  99. if ($trieNode['isWord']) {
  100. $find[] = array('position' => $wordRootPosition, 'word' => $word);
  101. }
  102. } else {
  103. $trieNode = $tree;
  104. $word = '';
  105. if ($preNode) {
  106. $i = $i - 1;
  107. $preNode = false;
  108. }
  109. }
  110. }
  111. return $find;
  112. }
  113. /**
  114. * 匹配最长长度
  115. *
  116. * @param string $text
  117. * @return mixed
  118. */
  119. public function searchMax($text)
  120. {
  121. $textLen = strlen($text);
  122. $trieNode = $tree = $this->trie;
  123. $find = array();
  124. $wordRootPosition = 0;
  125. $preNode = false;
  126. $word = '';
  127. for ($i = 0; $i < $textLen; $i++) {
  128. if (isset($trieNode['children'][$text[$i]])) {
  129. $word = $word . $text[$i];
  130. $trieNode = $trieNode['children'][$text[$i]];
  131. if ($preNode == false) {
  132. $wordRootPosition = $i;
  133. }
  134. $preNode = true;
  135. if ($trieNode['isWord']) {
  136. $find[] = array('position' => $wordRootPosition, 'word' => $word);
  137. }
  138. } else {
  139. $trieNode = $tree;
  140. $word = '';
  141. if ($preNode) {
  142. $i = $i - 1;
  143. $preNode = false;
  144. }
  145. }
  146. }
  147. $n = count($find) - 1;
  148. return $find[$n];
  149. }
  150. }