Quellcode durchsuchen

Add : 关键词过滤算法

Zhu Jinhui vor 7 Jahren
Ursprung
Commit
0ef3cc5ddc
1 geänderte Dateien mit 159 neuen und 0 gelöschten Zeilen
  1. 159 0
      src/Library/Trie.php

+ 159 - 0
src/Library/Trie.php

@@ -0,0 +1,159 @@
+<?php
+/**
+ * Class Trie
+ * 关键词过滤
+ * 
+ * 使用方法
+ * $trie = new Qii\Library\Trie();
+ * $arr = file('test.txt');
+ * $trie->setWords($arr);
+ * $trie->search('过滤词aha');
+ */
+
+namespace Qii\Library;
+
+class Trie
+{
+    private $trie;
+    
+    public function __construct()
+    {
+        $this->trie = array('children' => array(), 'isWord' => false);
+    }
+    
+    /**
+     * 设置过滤的词
+     *
+     * @param string $word 过滤词
+     */
+    public function setWord($word)
+    {
+        $word = trim($word);
+        $trieNode = &$this->trie;
+        for ($i = 0; $i < strlen($word); $i++) {
+            $character = $word[$i];
+            if (!isset($trieNode['children'][$character])) {
+                $trieNode['children'][$character] = array('isWord' => false);
+            }
+            if ($i == strlen($word) - 1) {
+                $trieNode['children'][$character] = array('isWord' => true);
+            }
+            $trieNode = &$trieNode['children'][$character];
+        }
+    }
+    
+    /**
+     * 设置过滤词
+     *
+     * @param string $words 过滤词
+     */
+    public function setWords($words)
+    {
+        if (!is_array($words)) {
+            return;
+        }
+        
+        foreach ($words as $word) {
+            $this->setWord($word);
+        }
+    }
+    
+    /**
+     * 是否是过滤的词
+     *
+     * @param string $word
+     * @return bool
+     */
+    public function isWord($word)
+    {
+        $trieNode = &$this->trie;
+        for ($i = 0; $i < strlen($word); $i++) {
+            $character = $word[$i];
+            if (!isset($trieNode['children'][$character])) {
+                return false;
+            } else {
+                if ($i == (strlen($word) - 1) && $trieNode['children'][$character]['isWord'] == true) {
+                    return true;
+                } elseif ($i == (strlen($word) - 1) && $trieNode['children'][$character]['isWord'] == false) {
+                    return false;
+                }
+                $trieNode = &$trieNode['children'][$character];
+            }
+        }
+    }
+    
+    /**
+     * 查找哪些词是在过滤列表中
+     *
+     * @param string $text
+     * @return array
+     */
+    public function search($text = "")
+    {
+        $textLen = strlen($text);
+        $trieNode = $tree = $this->trie;
+        $find = array();
+        $wordRootPosition = 0;
+        $preNode = false;
+        $word = '';
+        for ($i = 0; $i < $textLen; $i++) {
+            if (isset($trieNode['children'][$text[$i]])) {
+                $word = $word . $text[$i];
+                $trieNode = $trieNode['children'][$text[$i]];
+                if ($preNode == false) {
+                    $wordRootPosition = $i;
+                }
+                $preNode = true;
+                if ($trieNode['isWord']) {
+                    $find[] = array('position' => $wordRootPosition, 'word' => $word);
+                }
+            } else {
+                $trieNode = $tree;
+                $word = '';
+                if ($preNode) {
+                    $i = $i - 1;
+                    $preNode = false;
+                }
+            }
+        }
+        return $find;
+    }
+    
+    /**
+     * 匹配最长长度
+     *
+     * @param string $text
+     * @return mixed
+     */
+    public function searchMax($text)
+    {
+        $textLen = strlen($text);
+        $trieNode = $tree = $this->trie;
+        $find = array();
+        $wordRootPosition = 0;
+        $preNode = false;
+        $word = '';
+        for ($i = 0; $i < $textLen; $i++) {
+            if (isset($trieNode['children'][$text[$i]])) {
+                $word = $word . $text[$i];
+                $trieNode = $trieNode['children'][$text[$i]];
+                if ($preNode == false) {
+                    $wordRootPosition = $i;
+                }
+                $preNode = true;
+                if ($trieNode['isWord']) {
+                    $find[] = array('position' => $wordRootPosition, 'word' => $word);
+                }
+            } else {
+                $trieNode = $tree;
+                $word = '';
+                if ($preNode) {
+                    $i = $i - 1;
+                    $preNode = false;
+                }
+            }
+        }
+        $n = count($find) - 1;
+        return $find[$n];
+    }
+}