Browse Source

Update http client

Zhu Jinhui 6 năm trước cách đây
mục cha
commit
6b55faa530

+ 6 - 1
src/Library/Third/hightman/Client.php

@@ -18,10 +18,15 @@ namespace hightman\http;
 class Client
 {
     use HeaderTrait;
-	const PACKAGE = __CLASS__;
+    const PACKAGE = __CLASS__;
     const VERSION = '1.0.0-beta';
     const CRLF = "\r\n";
 
+    /**
+     * @var int the maximum number of concurrent connections for same host and port pair.
+     */
+    public static $maxBurst = 3;
+
     private $_cookiePath, $_parser, $_timeout;
     private static $_debugOpen = false;
     private static $_processKey;

+ 114 - 13
src/Library/Third/hightman/Connection.php

@@ -17,11 +17,6 @@ namespace hightman\http;
  */
 class Connection
 {
-    /**
-     * The maximum number of concurrent connections for the same host and port pair.
-     */
-    const MAX_BURST = 3;
-
     /**
      * The connection socket flags
      */
@@ -32,11 +27,43 @@ class Connection
     const FLAG_REUSED = 0x10;
     const FLAG_SELECT = 0x20;
 
+    /**
+     * @var int state of proxy connection
+     * 1: ask
+     * 2: confirm proxy type
+     * 3: authentication
+     * 4: auth result
+     * 5: send ip & port
+     * 6: ok or refused
+     */
+    public $proxyState = 0;
+
     protected $outBuf, $outLen;
     protected $arg, $sock, $conn, $flag = 0;
     private static $_objs = [];
     private static $_refs = [];
     private static $_lastError;
+    private static $_socks5 = [];
+
+    /**
+     * Configure or clear the SOCKS5 proxy.
+     * The setting is kept in a static property, so it is shared by every Connection object.
+     * Credentials are stored only when both the username and the password are given.
+     * @param string $host proxy server address, pass null to disable the proxy
+     * @param int $port proxy server port, defaults to 1080
+     * @param string $user authentication username
+     * @param string $pass authentication password
+     */
+    public static function useSocks5($host, $port = 1080, $user = null, $pass = null)
+    {
+        // a null host switches the proxy off
+        if ($host === null) {
+            self::$_socks5 = [];
+            return;
+        }
+        $settings = ['conn' => 'tcp://' . $host . ':' . $port];
+        if ($user !== null && $pass !== null) {
+            $settings['user'] = $user;
+            $settings['pass'] = $pass;
+        }
+        self::$_socks5 = $settings;
+    }
 
     /**
      * Create connection, with built-in pool.
@@ -57,12 +84,13 @@ class Connection
                 break;
             }
         }
-        if ($obj === null && count(self::$_objs[$conn]) < self::MAX_BURST) {
+        if ($obj === null && count(self::$_objs[$conn]) < Client::$maxBurst) {
             $obj = new self($conn);
             self::$_objs[$conn][] = $obj;
             Client::debug('create conn \'', $conn, '\'');
         }
         if ($obj !== null) {
+            $obj->arg = $arg;
             if ($obj->flag & self::FLAG_OPENED) {
                 $obj->flag |= self::FLAG_REUSED;
             } else {
@@ -73,7 +101,6 @@ class Connection
             $obj->flag |= self::FLAG_BUSY;
             $obj->outBuf = null;
             $obj->outLen = 0;
-            $obj->arg = $arg;
         }
         return $obj;
     }
@@ -135,6 +162,9 @@ class Connection
      */
     public function hasDataToWrite()
     {
+        if ($this->proxyState > 0) {
+            return $this->proxyState & 1 ? true : false;
+        }
         return ($this->outBuf !== null && strlen($this->outBuf) > $this->outLen);
     }
 
@@ -146,6 +176,9 @@ class Connection
     public function write($buf = null)
     {
         if ($buf === null) {
+            if ($this->proxyState > 0) {
+                return $this->proxyWrite();
+            }
             $len = 0;
             if ($this->hasDataToWrite()) {
                 $buf = $this->outLen > 0 ? substr($this->outBuf, $this->outLen) : $this->outBuf;
@@ -156,7 +189,7 @@ class Connection
             }
             return $len;
         }
-        $n = fwrite($this->sock, $buf);
+        $n = @fwrite($this->sock, $buf);
         if ($n === 0 && $this->ioEmptyError()) {
             $n = false;
         }
@@ -170,7 +203,7 @@ class Connection
      */
     public function getLine()
     {
-        $line = stream_get_line($this->sock, 2048, "\n");
+        $line = @stream_get_line($this->sock, 2048, "\n");
         if ($line === '' || $line === false) {
             $line = $this->ioEmptyError() ? false : null;
         } else {
@@ -187,7 +220,7 @@ class Connection
      */
     public function read($size = 8192)
     {
-        $buf = fread($this->sock, $size);
+        $buf = @fread($this->sock, $size);
         if ($buf === '' || $buf === false) {
             $buf = $this->ioEmptyError() ? false : null;
         }
@@ -195,6 +228,63 @@ class Connection
         return $buf;
     }
 
+    /**
+     * Read the proxy server's reply for the current handshake state and advance the state.
+     *
+     * Only the even states expect a reply:
+     * - state 2: method selection; "\x05\x00" (no auth) moves to state 5,
+     *   "\x05\x02" (username/password) moves to state 3
+     * - state 4: auth result; "\x01\x00" (success) moves to state 5
+     * - state 6: connect result; a reply starting with "\x05\x00\x00\x01" ends the
+     *   handshake by resetting the state to 0
+     * Any other reply, or a read that yields no data, leaves the state untouched.
+     *
+     * @return boolean true if the state was changed by this call, false otherwise
+     */
+    public function proxyRead()
+    {
+        $proxyState = $this->proxyState;
+        Client::debug('proxy readState: ', $proxyState);
+        if ($proxyState === 2) {
+            $buf = $this->read(2);
+            if ($buf === "\x05\x00") {
+                $this->proxyState = 5;
+            } elseif ($buf === "\x05\x02") {
+                $this->proxyState = 3;
+            }
+        } elseif ($proxyState === 4) {
+            $buf = $this->read(2);
+            if ($buf === "\x01\x00") {
+                $this->proxyState = 5;
+            }
+        } elseif ($proxyState === 6) {
+            $buf = $this->read(10);
+            // read() may yield null/false instead of a string; never hand those to substr()
+            if (is_string($buf) && substr($buf, 0, 4) === "\x05\x00\x00\x01") {
+                $this->proxyState = 0;
+            }
+        }
+        // an unchanged state tells the caller that the handshake made no progress
+        return $proxyState !== $this->proxyState;
+    }
+
+    /**
+     * Send the proxy handshake packet that belongs to the current (odd) state.
+     *
+     * - state 1: greeting, offering username/password auth when credentials are set, otherwise no auth
+     * - state 3: username/password authentication
+     * - state 5: connect request for the host and port of this connection; an IPv4
+     *   literal is sent as a 4-byte address, anything else as a domain name
+     * The state is incremented before the packet is written.
+     * NOTE(review): IPv6 literals get no special treatment here.
+     *
+     * @return mixed the result of write(), or false when there is nothing valid to send
+     * (unexpected state, missing credentials, unparsable target, or a field longer than 255 bytes)
+     */
+    public function proxyWrite()
+    {
+        Client::debug('proxy writeState: ', $this->proxyState);
+        if ($this->proxyState === 1) {
+            // version 5, one method offered: 0x02 = username/password, 0x00 = no auth
+            $buf = isset(self::$_socks5['user']) ? "\x05\x01\x02" : "\x05\x01\x00";
+        } elseif ($this->proxyState === 3) {
+            if (!isset(self::$_socks5['user'])) {
+                return false;
+            }
+            $user = (string) self::$_socks5['user'];
+            $pass = (string) self::$_socks5['pass'];
+            // each length is a single byte on the wire; chr() would silently wrap above 255
+            if (strlen($user) > 255 || strlen($pass) > 255) {
+                return false;
+            }
+            $buf = chr(0x01) . chr(strlen($user)) . $user . chr(strlen($pass)) . $pass;
+        } elseif ($this->proxyState === 5) {
+            $pa = parse_url($this->conn);
+            if (!is_array($pa) || !isset($pa['host'])) {
+                return false;
+            }
+            $port = isset($pa['port']) ? (int) $pa['port'] : 80;
+            $ip = ip2long($pa['host']);
+            if ($ip !== false) {
+                // address type 0x01: IPv4 address
+                $buf = "\x05\x01\x00\x01" . pack('Nn', $ip, $port);
+            } elseif (strlen($pa['host']) <= 255) {
+                // address type 0x03: domain name, resolved by the proxy
+                // (ip2long() fails on host names, which used to be sent as 0.0.0.0)
+                $buf = "\x05\x01\x00\x03" . chr(strlen($pa['host'])) . $pa['host'] . pack('n', $port);
+            } else {
+                return false;
+            }
+        } else {
+            return false;
+        }
+        $this->proxyState++;
+        return $this->write($buf);
+    }
+
     /**
      * Get the connection socket
      * @return resource the socket
@@ -233,12 +323,23 @@ class Connection
             $this->flag |= self::FLAG_NEW2;
         }
         // async-connect
-        $this->sock = stream_socket_client($this->conn, $errno, $error, 1, STREAM_CLIENT_ASYNC_CONNECT);
+        $ctx = ['ssl' => ['verify_peer' => false, 'verify_peer_name' => false]];
+        if (isset(self::$_socks5['conn'])) {
+            $this->proxyState = 1;
+            $conn = self::$_socks5['conn'];
+        } else {
+            $this->proxyState = 0;
+            $conn = $this->conn;
+            if (!strncmp($conn, 'ssl:', 4) && $this->arg instanceof Processor) {
+                $ctx['ssl']['peer_name'] = $this->arg->req->getUrlParam('host');
+            }
+        }
+        $this->sock = @stream_socket_client($conn, $errno, $error, 10, STREAM_CLIENT_ASYNC_CONNECT, stream_context_create($ctx));
         if ($this->sock === false) {
-            Client::debug($repeat ? 're' : '', 'open \'', $this->conn, '\' failed: ', $error);
+            Client::debug($repeat ? 're' : '', 'open \'', $conn, '\' failed: ', $error);
             self::$_lastError = $error;
         } else {
-            Client::debug($repeat ? 're' : '', 'open \'', $this->conn, '\' success: ', $this->sock);
+            Client::debug($repeat ? 're' : '', 'open \'', $conn, '\' success: ', $this->sock);
             stream_set_blocking($this->sock, false);
             $this->flag |= self::FLAG_OPENED;
             $this->addSockRef();

+ 21 - 8
src/Library/Third/hightman/HeaderTrait.php

@@ -25,15 +25,18 @@ trait HeaderTrait
      * Set http header or headers
      * @param mixed $key string key or key-value pairs to set multiple headers.
      * @param string $value the header value when key is string, set null to remove header.
+     * @param boolean $toLower convert key to lowercase
      */
-    public function setHeader($key, $value = null)
+    public function setHeader($key, $value = null, $toLower = true)
     {
         if (is_array($key)) {
             foreach ($key as $k => $v) {
                 $this->setHeader($k, $v);
             }
         } else {
-            $key = strtolower($key);
+            if ($toLower === true) {
+                $key = strtolower($key);
+            }
             if ($value === null) {
                 unset($this->_headers[$key]);
             } else {
@@ -46,8 +49,9 @@ trait HeaderTrait
      * Add http header or headers
      * @param mixed $key string key or key-value pairs to be added.
      * @param string $value the header value when key is string.
+     * @param boolean $toLower convert key to lowercase
      */
-    public function addHeader($key, $value = null)
+    public function addHeader($key, $value = null, $toLower = true)
     {
         if (is_array($key)) {
             foreach ($key as $k => $v) {
@@ -55,7 +59,9 @@ trait HeaderTrait
             }
         } else {
             if ($value !== null) {
-                $key = strtolower($key);
+                if ($toLower === true) {
+                    $key = strtolower($key);
+                }
                 if (!isset($this->_headers[$key])) {
                     $this->_headers[$key] = $value;
                 } else {
@@ -80,25 +86,32 @@ trait HeaderTrait
     /**
      * Get a http header or all http headers
      * @param mixed $key the header key to be got, or null to get all headers
+     * @param boolean $toLower convert key to lowercase
      * @return array|string the header value, or headers array when key is null.
      */
-    public function getHeader($key = null)
+    public function getHeader($key = null, $toLower = true)
     {
         if ($key === null) {
             return $this->_headers;
         }
-        $key = strtolower($key);
+        if ($toLower === true) {
+            $key = strtolower($key);
+        }
         return isset($this->_headers[$key]) ? $this->_headers[$key] : null;
     }
 
     /**
      * Check HTTP header is set or not
      * @param string $key the header key to be check, not case sensitive
+     * @param boolean $toLower convert key to lowercase
      * @return boolean if there is http header with the name.
      */
-    public function hasHeader($key)
+    public function hasHeader($key, $toLower = true)
     {
-        return isset($this->_headers[strtolower($key)]);
+        if ($toLower === true) {
+            $key = strtolower($key);
+        }
+        return isset($this->_headers[$key]);
     }
 
     /**

+ 7 - 1
src/Library/Third/hightman/Processor.php

@@ -105,7 +105,13 @@ class Processor
 
     public function recv()
     {
-        return $this->headerOk ? $this->readBody() : $this->readHeader();
+        if ($this->conn->proxyState !== 0) {
+            if ($this->conn->proxyRead() === false) {
+                $this->finish('BROKEN');
+            }
+        } else {
+            return $this->headerOk ? $this->readBody() : $this->readHeader();
+        }
     }
 
     /**

+ 0 - 223
src/Library/Third/hightman/StringHelper.php

@@ -1,223 +0,0 @@
-<?php
-/**
- * 多功能字符串工具
- *
- * @author hightman <hightman@twomice.net>
- * @link http://www.hightman.cn/
- * @copyright Copyright &copy; 2008-2013 Twomice Studio
- */
-
-/**
- * String Helper (all are static function)
- *
- * <pre>
- * StringHelper::decodeHtml($html);
- * StringHelper::fixHtmlCharset($html, $charset = 'utf-8');
- * StringHelper::finds($buf, $tag1, $tag2[, ...]);
- * StringHelper::find($buf, $tag1, $tag2[, ...]);
- * StringHelper::contains($buf, $tokens);
- * </pre>
- */
-class StringHelper
-{
-
-	/**
-	 * @param string $html
-	 * @return string 解码后的 html
-	 */
-	public static function decodeHtml($html)
-	{
-		if (strpos($html, '<') !== false) {
-			$html = strip_tags($html); /* preg_replace('/<.+?>/u', '', $html); */
-		}
-		return html_entity_decode(trim($html), ENT_QUOTES, 'utf-8');
-	}
-
-	/**
-	 * @param string $charset 目标字符集,默认 utf-8
-	 * @return string 强制转换网页内容为目标字符集
-	 */
-	public static function fixHtmlCharset($html, $charset = 'utf-8')
-	{
-		if (preg_match('/charset=["\']?([0-9a-zA-Z_-]+)/', $html, $match)
-			&& (strncasecmp($charset, 'gb', 2) || strncasecmp($match[1], 'gb', 2))
-			&& strcasecmp($charset, $match[1])) {
-			if (!strcasecmp($match[1], 'gb2312')) {
-				$match[1] = 'gbk';
-			}
-			if (function_exists('iconv')) {
-				return iconv($match[1], $charset . '//IGNORE', $html);
-			} elseif (function_exists('mb_convert_encoding')) {
-				return mb_convert_encoding($html, $charset, $match[1]);
-			}
-		}
-		return $html;
-	}
-
-	/**
-	 * 根据标记快速查找字符串列表
-	 * @param string $buf
-	 * @param array $config
-	 * array(
-	 *   array(key1, arg1, arg2, ...),
-	 *   array(key2, arg1, arg2, ...),
-	 * ),
-	 * @return array
-	 * @see StringMatcher::find
-	 */
-	public static function finds($buf, $config, &$error = null)
-	{
-		$obj = new StringMatcher($buf);
-		return $obj->finds($config, $error);
-	}
-
-	/**
-	 * 根据标记快速查找字符串
-	 * @param string $buf
-	 * @return string 返回最后两个标记之间的内容,找不到返回 null
-	 * @see StringMatcher::find
-	 */
-	public static function find($buf)
-	{
-		$args = func_get_args();
-		array_shift($args);
-		$obj = new StringMatcher($buf);
-		return call_user_func_array(array($obj, 'find'), $args);
-	}
-
-	/**
-	 * 判断字符串是否包含数组中的字符串
-	 * @param string $buf 源字符串
-	 * @param array $tokens 字符串标记列表
-	 * @return boolean
-	 */
-	public static function contains($buf, $tokens)
-	{
-		foreach ($tokens as $token) {
-			if (strpos($buf, $token) !== false) {
-				return true;
-			}
-		}
-		return false;
-	}
-}
-
-/**
- * StringMatcher to parse data
- */
-class StringMatcher
-{
-	private $_buf, $_pos;
-
-	/**
-	 * @param string $buf
-	 */
-	public function __construct($buf)
-	{
-		$this->_buf = $buf;
-		$this->_pos = 0;
-	}
-
-	/**
-	 * 批量查找
-	 * @param array $config
-	 * array(
-	 *   array(key1, arg1, arg2, ...),
-	 *   array(key2, arg1, arg2, ...),
-	 * ),
-	 * @param string $error optional reference
-	 * @return array
-	 */
-	public function finds($config, &$error = null)
-	{
-		$ret = array();
-		foreach ($config as $args) {
-			$key = array_shift($args);
-			$val = call_user_func_array(array($this, 'find'), $args);
-			if ($val === null || $val === false) {
-				$error = 'Cannot find `' . $key . '\': ' . implode(' ... ', $args);
-				$pos = strrpos($error, '...');
-				$error = substr_replace($error, '???', $pos, 3);
-				continue;
-				//return false;
-			}
-			$ret[$key] = $val;
-		}
-		return $ret;
-	}
-
-	/**
-	 * 根据特征查找字符串,不定参数:
-	 * 起始1,起始2,起始3 ... 结束关键
-	 * 新增支持特殊串
-	 * "$$$...",表示后面的字符串必须在这个字符串之前,以免跨越太大
-	 * "^^^...",表示后面的字符串如果在这个串之前就用采用当前串的位置
-	 * @return string 成功返回区间内的字符串并将位置设在本字符串之末,若找不到返回 null
-	 */
-	public function find()
-	{
-		$args = func_get_args();
-		$cnt = count($args);
-		if ($cnt < 2) {
-			return trigger_error(__CLASS__ . '::find() expects at least 2 parameters, ' . $cnt . ' given', E_USER_WARNING);
-		}
-		for ($end = $pre = false, $pos1 = $this->_pos, $i = 0; $i < ($cnt - 1); $i++) {
-			if (substr($args[$i], 0, 3) === '$$$') {
-				$end = strpos($this->_buf, substr($args[$i], 3), $pos1);
-			} elseif (substr($args[$i], 0, 3) === '^^^') {
-				$pre = strpos($this->_buf, substr($args[$i], 3), $pos1);
-			} else {
-				$pos1 = strpos($this->_buf, $args[$i], $pos1);
-				if ($pos1 === false) {
-					return null;
-				} elseif ($end !== false && $pos1 > $end) {
-					return '';
-				}
-				if ($pre !== false) {
-					if ($pos1 > $pre) {
-						$pos1 = $pre;
-					}
-					$pre = false;
-				}
-				$pos1 += strlen($args[$i]);
-			}
-		}
-		if (($pos2 = strpos($this->_buf, $args[$i], $pos1)) !== false) {
-			if ($end !== false && $pos2 > $end) {
-				return '';
-			}
-			if ($pre !== false) {
-				if ($pos2 > $pre) {
-					$pos2 = $pre;
-				}
-				$pre = false;
-			}
-			$this->_pos = $pos2;
-			return substr($this->_buf, $pos1, $pos2 - $pos1);
-		}
-		return null;
-	}
-
-	/**
-	 * 移动当前处理位置位置指针,类似 fseek
-	 * @param int $offset
-	 * @param int $whence 可选值:SEEK_SET/SEEK_CUR/SEEK_END
-	 */
-	public function seek($offset, $whence = SEEK_CUR)
-	{
-		$offset = intval($offset);
-		switch ($whence) {
-			case SEEK_SET:
-				$this->_pos = $offset;
-				break;
-			case SEEK_END:
-				$this->_pos = $offset + strlen($this->_buf);
-				break;
-			case SEEK_CUR:
-			default:
-				$this->_pos += $offset;
-				break;
-		}
-		return $this->_pos;
-	}
-}

+ 0 - 590
src/Library/Third/hightman/UrlTable.php

@@ -1,590 +0,0 @@
-<?php
-/**
- * 多功能 URL 采集管理及解析器
- *
- * @author hightman <hightman@twomice.net>
- * @link http://www.hightman.cn/
- * @copyright Copyright &copy; 2008-2013 Twomice Studio
- */
-use hightman\http\ParseInterface;
-use hightman\http\Response;
-use hightman\http\Request;
-
-/**
- * URL 列表管理接口
- */
-interface UrlTable
-{
-	/**
-	 * 同一 URL 连续处理的时间间隔
-	 */
-	const DURATION = 3600;
-
-	/**
-	 * @return int URL 列表总个数
-	 */
-	public function getCount();
-
-	/**
-	 * @param int $duration 同一
-	 * @return string 返回一个待处理的 URL,若无返回 null 出错则返回 false
-	 */
-	public function getOne($duration = self::DURATION);
-
-	/**
-	 * @param int $limit
-	 * @param int $duration
-	 * @return array 返回不超过指定个数的 URL 数组,若无返回空数组,出错则返回 false
-	 */
-	public function getSome($limit = 5, $duration = self::DURATION);
-
-	/**
-	 * @param string $url 要添加的 URL
-	 * @param int $rank 被取出处理的优先级
-	 * @return boolean 成功返回 true,若已存在或其它原因失败均返回 false
-	 */
-	public function addUrl($url, $rank = 0);
-
-	/**
-	 * @param string $url 要更新的 URL
-	 * @param int $status URL 处理后的状态码
-	 * @return boolean 成功返回 true, 失败返回 false
-	 */
-	public function updateUrl($url, $status = 200);
-
-	/**
-	 * @param string $url 要删除的 URL
-	 * @return boolean 成功返回 true,失败返回 false
-	 */
-	public function delUrl($url);
-}
-
-/**
- * 基于 MySQLi 的 URL 列表管理,结构如下:
- * CREATE TABLE `_urls` (
- *   `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
- *   `url` text NOT NULL,
- *   `rank` smallint(6) NOT NULL COMMENT 'process prior level',
- *   `status` smallint(6) NOT NULL COMMENT 'last http response status',
- *   `select_time` bigint(20) NOT NULL COMMENT 'last process time',
- *   `update_time` bigint(20) NOT NULL COMMENT 'last update time',
- *   PRIMARY KEY (`id`)
- * ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='url table for pspider';
- */
-class UrlTableMySQL extends mysqli implements UrlTable
-{
-	private $_table = '_urls';
-	private $_addCache = array();
-
-	/**
-	 * @param string $name 设置数据库表名,默认 _urls
-	 */
-	public function setTableName($name)
-	{
-		$this->_table = $name;
-	}
-
-	public function getCount()
-	{
-		$res = $this->query('SELECT COUNT(*) AS count FROM ' . $this->_table);
-		if ($res !== false) {
-			$row = $res->fetch_assoc();
-			$res->free();
-			return $row['count'];
-		}
-		return 0;
-	}
-
-	public function getOne($duration = self::DURATION)
-	{
-		$urls = $this->getSome(1, $duration);
-		if (!is_array($urls)) {
-			return false;
-		}
-		return count($urls) > 0 ? $urls[0] : null;
-	}
-
-	public function getSome($limit = 5, $duration = self::DURATION)
-	{
-		$now = time();
-		$sql = 'SELECT id, url, ((' . $now . ' - select_time) * (rank + 1) / (status + 1)) AS score FROM ' . $this->_table . ' ';
-		$sql .= 'WHERE select_time < ' . ($now - $duration) . ' '; // expired
-		$sql .= 'OR (select_time > update_time AND select_time < ' . ($now - 300) . ') '; // failed
-		$sql .= 'ORDER BY score DESC LIMIT ' . intval($limit);
-		($fd = @fopen(sys_get_temp_dir() . DIRECTORY_SEPARATOR . __CLASS__ . '.lock', 'w')) && flock($fd, LOCK_EX);
-		if (($res = $this->query($sql)) === false) {
-			$ret = false;
-		} else {
-			$ret = $ids = array();
-			while ($row = $res->fetch_assoc()) {
-				$ids[] = $row['id'];
-				$ret[] = $row['url'];
-			}
-			$res->free();
-			if (count($ids) > 0) {
-				$sql = 'UPDATE ' . $this->_table . ' SET select_time = ' . $now . ' ';
-				$sql .= 'WHERE id IN (\'' . implode('\', \'', $ids) . '\')';
-				$this->query($sql);
-			}
-		}
-		$fd && flock($fd, LOCK_UN) && fclose($fd);
-		return $ret;
-	}
-
-	public function addUrl($url, $rank = 0)
-	{
-		$id = md5($url);
-		if ($this->inAddCache($id)) {
-			return false;
-		}
-		$url = $this->real_escape_string($url);
-		$sql = 'INSERT INTO ' . $this->_table . ' (id, url, rank) ';
-		$sql .= 'VALUES (\'' . $id . '\', \'' . $url . '\', ' . intval($rank) . ')';
-		return $this->query($sql);
-	}
-
-	public function updateUrl($url, $status = 200)
-	{
-		$now = time();
-		$sql = 'UPDATE ' . $this->_table . ' SET status = ' . intval($status) . ', update_time = ' . $now . ' ';
-		$sql .= 'WHERE id = \'' . md5($url) . '\'';
-		return $this->query($sql);
-	}
-
-	public function delUrl($url)
-	{
-		$sql = 'DELETE FROM ' . $this->_table . ' WHERE id = \'' . md5($url) . '\'';
-		return $this->query($sql) && $this->affected_rows === 1;
-	}
-
-	public function query($query, $mode = MYSQLI_STORE_RESULT)
-	{
-		$this->ping();
-		$res = parent::query($query, $mode);
-		return $res;
-	}
-
-	protected function test()
-	{
-		if ($this->connect_error) {
-			return trigger_error($this->connect_error, E_USER_ERROR);
-		}
-		$url = 'http://' . uniqid() . '.com/';
-		if (!$this->addUrl($url)) {
-			return trigger_error($this->error, E_USER_ERROR);
-		}
-		$this->delUrl($url);
-		return true;
-	}
-
-	private function inAddCache($id)
-	{
-		$now = time();
-		if (isset($this->_addCache[$id])) {
-			$this->_addCache[$id] = $now;
-			return true;
-		}
-		$this->_addCache[$id] = $now;
-		if (count($this->_addCache) > 20000) {
-			$cache = array();
-			$expire = $now - 3600;
-			foreach ($this->_addCache as $key => $value) {
-				if ($value > $expire) {
-					$cache[$key] = $value;
-				}
-			}
-			$this->_addCache = $cache;
-		}
-		return false;
-	}
-}
-
-/**
- * 带 URL 提取功能的解析器基础类
- *
- * 设置是 URL 过滤排除规则:
- * 规则语法支持局部字符串匹配,或正则匹配(必须是 # 开头)
- * 1. 若是默认允许的外站域名,则检测 disallowDomain 匹配一条则直接排除
- * 2. 若是默认不允许的外站域名,则检测 allowDomain,匹配任何一条则通过继续检测
- * 3. 检测 disallow 规则,匹配其中一条则立即排除
- * 4. 检测 allow 规则,若为空则直接通过,否则必须至少满足其中一条
- * 5. 检测 disallowExt 规则,匹配不允许的扩展名则直接排除
- * 6. 最终通过 ^-^
- */
-class UrlParser implements ParseInterface
-{
-	private $_timeBegin, $_numAdd, $_numUpdate, $_numFilter;
-	private $_followExternal;
-	private $_disallowDomain, $_allowDomain, $_disallow, $_allow;
-	private $_allowRank;
-	private $_nofollow;
-	private $_disallowExt = array(
-		'.tar' => true, '.gz' => true, '.tgz' => true, '.zip' => true, '.Z' => true, '.7z' => true,
-		'.rpm' => true, '.deb' => true, '.ps' => true, '.dvi' => true, '.pdf' => true, '.smi' => true,
-		'.png' => true, '.jpg' => true, '.jpeg' => true, '.bmp' => true, '.tiff' => true, '.gif' => true,
-		'.mov' => true, '.avi' => true, '.mpeg' => true, '.mpg' => true, '.mp3' => true, '.qt' => true,
-		'.wav' => true, '.ram' => true, '.rm' => true, '.rmvb' => true, '.jar' => true, '.java' => true,
-		'.class' => true, '.diff' => true, '.doc' => true, '.docx' => true, '.xls' => true, '.ppt' => true,
-		'.mdb' => true, '.rtf' => true, '.exe' => true, '.pps' => true, '.so' => true, '.psd' => true,
-		'.css' => true, '.js' => true, '.ico' => true, '.dll' => true, '.bz2' => true, '.rar' => true,
-	);
-	private $_ut;
-
-	/**
-	 * @param UrlTable $ut
-	 */
-	public function __construct(UrlTable $ut)
-	{
-		$this->_ut = $ut;
-		$this->_timeBegin = time();
-		$this->_numAdd = $this->_numUpdate = $this->_numFilter = 0;
-		// apply default filters for extending
-		$this->resetFilter();
-		$this->defaultFilter();
-	}
-
-	public function __destruct()
-	{
-		$this->_ut = null;
-	}
-
-	/**
-	 * @return UrlTable
-	 */
-	public function getUrlTable()
-	{
-		return $this->_ut;
-	}
-
-	/**
-	 * 扩展该类时在此应用默认的 URL 过滤规则
-	 */
-	public function defaultFilter()
-	{
-
-	}
-
-	/**
-	 * 重置所有过滤规则,但不包含后缀过滤规则
-	 */
-	public function resetFilter()
-	{
-		$this->_followExternal = false;
-		$this->_disallowDomain = array();
-		$this->_allowDomain = array();
-		$this->_disallow = array();
-		$this->_allow = array();
-		$this->_allowRank = array();
-		$this->_nofollow = array();
-	}
-
-	/**
-	 * @param boolean $on 设置是否处理站外 URL,默认为 false
-	 */
-	public function followExternal($on = true)
-	{
-		$this->_followExternal = $on === true ? true : false;
-	}
-
-	/**
-	 * @param string $rule 不允许的域名规则,支持正则表达式
-	 */
-	public function disallowDomain($rule)
-	{
-		$this->saveMatchRule($this->_disallowDomain, $rule);
-	}
-
-	/**
-	 * @param string $rule 允许的域名规则,支持正则表达式
-	 */
-	public function allowDomain($rule)
-	{
-		$this->saveMatchRule($this->_allowDomain, $rule);
-	}
-
-	/**
-	 * @param string $rule 不允许的 URL 规则,支持正则表达式
-	 */
-	public function disallow($rule)
-	{
-		$this->saveMatchRule($this->_disallow, $rule);
-	}
-
-	/**
-	 * @param string $rule 允许的 URL 规则,支持正则表达式
-	 * @param int $rank 匹配此规则的 URL 的权重值
-	 * @param boolean $follow 是否跟随分析此规则页面中的链接
-	 */
-	public function allow($rule, $rank = null, $follow = true)
-	{
-		$this->saveMatchRule($this->_allow, $rule);
-		if ($rank !== null) {
-			$this->_allowRank[$rule] = intval($rank);
-		}
-		if (!$follow) {
-			$this->saveMatchRule($this->_nofollow, $rule);
-		}
-	}
-
-	/**
-	 * @param string $name 不允许的 URL 扩展名,必须以 . 开头
-	 */
-	public function disallowExt($name)
-	{
-		$this->_disallowExt[strtolower($name)] = true;
-	}
-
-	/**
-	 * @param string $name 强制允许的 URL 扩展名,必须以 . 开头
-	 */
-	public function allowExt($name)
-	{
-		if (substr($name, 0, 1) === '.') {
-			$name = strtolower($name);
-			if (isset($this->_disallowExt[$name])) {
-				unset($this->_disallowExt[$name]);
-			}
-		}
-	}
-
-	/**
-	 * 打印或返回统计情况
-	 * @param boolean $output 是否直接输出结果
-	 */
-	public function stat($output = false)
-	{
-		// time
-		$time = time() - $this->_timeBegin;
-		$string = date('m-d H:i:s') . ' - Time cost: ';
-		if ($time > 3600) {
-			$string .= intval($time / 3600) . ' hours ';
-			$time %= 3600;
-		}
-		if ($time > 60) {
-			$string .= intval($time / 60) . ' mins ';
-			$time %= 60;
-		}
-		$string .= $time . ' secs, ';
-		// stats
-		$string .= sprintf('URLs total: %d, Add: %d, Update: %d, Filtered: %d', $this->_ut->getCount(), $this->_numAdd, $this->_numUpdate, $this->_numFilter);
-		if ($output !== true) {
-			return $string;
-		}
-		echo $string . "\n";
-	}
-
-	/**
-	 * 实现 HttpParser 中定义的方法
-	 * @param Response $res
-	 * @param Request $req
-	 * @param mixed $key
-	 */
-	public function parse(Response $res, Request $req, $key)
-	{
-		// update url
-		$rawUrl = $req->getRawUrl();
-		if ($this->_ut->updateUrl($rawUrl, $res->status)) {
-			$this->_numUpdate++;
-		}
-		// parse url from body
-		if ($res->status === 200 && $this->isFollowUrl($rawUrl)) {
-			// get baseUrl
-			$baseUrl = $req->getUrl();
-			if (preg_match('/<base\s+href=[\'"]?(.*?)[\s\'">]/i', $res->body, $match)) {
-				$baseUrl = $this->resetUrl($match[1], $baseUrl);
-			}
-			// href="xxx", href='xxx'
-			if (preg_match_all('/href=([\'"])(.*?)\1/i', $res->body, $matches) > 0) {
-				foreach ($matches[2] as $url) {
-					$this->processUrl($url, $baseUrl, $res->url);
-				}
-			}
-			// href=xxx
-			if (preg_match_all('/href=(?![\'"])(.*?)[\s>]/i', $res->body, $matches) > 0) {
-				foreach ($matches[1] as $url) {
-					$this->processUrl($url, $baseUrl, $res->url);
-				}
-			}
-		} elseif ($res->status === 301 || $res->status === 302) {
-			$url = $this->resetUrl($res->getHeader('location'), $req->getUrl());
-			$res->setHeader('location', $url); // overwrite formated url
-			// save url for permanent redirection
-			if ($res->status === 301) {
-				$this->processUrl($url, $res->url);
-			}
-		}
-	}
-
-	/**
-	 * @param string $url
-	 * @param string $rawUrl 原先的开始页面 URL,用于计算是否为站外
-	 * @param string &$rank
-	 * @return boolean 是否 URL 符合过滤规则需要排除,需要排除返回 true
-	 */
-	public function isDisallow($url, $rawUrl = null, &$rank = null)
-	{
-		// get domain
-		if (($pos1 = strpos($url, '://')) === false) {
-			return true;
-		}
-		$pos1 += 3;
-		$pos2 = strpos($url, '/', $pos1);
-		$domain = $pos2 === false ? substr($url, $pos1) : substr($url, $pos1, $pos2 - $pos1);
-		// external domain
-		if ($rawUrl !== null && !@strstr($rawUrl, $domain)) {
-			// disallow domain
-			if ($this->_followExternal && $this->isMatchRule($this->_disallowDomain, $domain)) {
-				return true;
-			}
-			// allow domain
-			if (!$this->_followExternal
-				&& (count($this->_allowDomain) === 0 || !$this->isMatchRule($this->_allowDomain, $domain))) {
-				return true;
-			}
-		}
-		// disallow
-		if ($this->isMatchRule($this->_disallow, $url)) {
-			return true;
-		}
-		// allow
-		if (count($this->_allow) > 0 && !$this->isMatchRule($this->_allow, $url, $rank)) {
-			return true;
-		}
-		// dislaowExt
-		if (($pos1 = strpos($url, '?')) === false) {
-			$pos1 = strlen($url);
-		}
-		if (($pos2 = strpos($url, '/', 8)) !== false
-			&& ($ext = strrchr(substr($url, $pos2, $pos1 - $pos2), '.'))) {
-			$ext = strtolower($ext);
-			if (isset($this->_disallowExt[$ext])) {
-				return true;
-			}
-		}
-		return false;
-	}
-
-	/**
-	 * @param string $url
-	 * @param string $baseUrl
-	 * @return string 返回处理好的标准 URL
-	 */
-	public function resetUrl($url, $baseUrl = null)
-	{
-		// 开头处理
-		if (!strncasecmp($url, 'http://http://', 14)) {
-			$url = substr($url, 7);
-		}
-		if (strncasecmp($url, 'http://', 7) && strncasecmp($url, 'https://', 8)) {
-			if ($baseUrl === null) {
-				$url = 'http://' . $url;
-			} else {
-				if (substr($url, 0, 1) === '/') {
-					$pos = @strpos($baseUrl, '/', 8);
-					$url = ($pos === false ? $baseUrl : substr($baseUrl, 0, $pos)) . $url;
-				} else {
-					$pos = @strrpos($baseUrl, '/', 8);
-					$url = ($pos === false ? $baseUrl . '/' : substr($baseUrl, 0, $pos + 1)) . $url;
-				}
-			}
-		}
-		// 统一 URL 格式,顶级网址以 / 结尾,去除 # 后的锚点
-		if (@strpos($url, '/', 8) === false) {
-			$url .= '/';
-		}
-		if (($pos = strrpos($url, '#')) !== false) {
-			$url = substr($url, 0, $pos);
-		}
-		// 计算并处理 '../../' 等多余的相对 URL
-		if (strpos($url, '/./') !== false || strpos($url, '/../') !== false) {
-			$parts = array();
-			$tmpa = explode('/', substr($url, 8));
-			for ($i = 0; $i < count($tmpa); $i++) {
-				if ($tmpa[$i] === '.' || ($tmpa[$i] === '' && isset($tmpa[$i + 1]))) {
-					continue;
-				} elseif ($tmpa[$i] !== '..') {
-					array_push($parts, $tmpa[$i]);
-				} elseif (count($parts) > 1) {
-					array_pop($parts);
-				}
-			}
-			$url = substr($url, 0, 8) . implode('/', $parts);
-		}
-		return $url;
-	}
-
-	/**
-	 * @param string $url
-	 * @return boolean 是否分析处理当前 URL 内容中的链接
-	 */
-	protected function isFollowUrl($url)
-	{
-		return !$this->isMatchRule($this->_nofollow, $url);
-	}
-
-	/**
-	 * @return mixed
-	 */
-	protected function processUrl($url, $baseUrl, $rawUrl = null)
-	{
-		if (substr($url, 0, 1) === '#' || !strncasecmp($url, 'javascript:', 11) || !strncasecmp($url, 'mailto:', 7)) {
-			return 'SKIP';
-		}
-		$url = $this->resetUrl($url, $baseUrl);
-		$rank = 0;
-		if ($this->isDisallow($url, $rawUrl === null ? $baseUrl : $rawUrl, $rank)) {
-			$this->_numFilter++;
-			return 'FILTER';
-		}
-		if ($this->_ut->addUrl($url, $rank)) {
-			$this->_numAdd++;
-			return 'ADD';
-		}
-		return 'SKIP';
-	}
-
-	private function saveMatchRule(&$array, $rule)
-	{
-		if ($rule === null) {
-			$array = array();
-		} elseif ($this->isRegexPattern($rule)) {
-			array_push($array, "\xff" . $rule);
-		} else {
-			array_unshift($array, $rule);
-		}
-	}
-
-	private function isMatchRule($rules, $input, &$rank = null)
-	{
-		foreach ($rules as $rule) {
-			if (ord($rule[0]) !== 0xff) {
-				$matched = stristr($input, $rule) !== false;
-			} else {
-				$rule = substr($rule, 1);
-				$matched = preg_match($rule, $input) > 0;
-			}
-			if ($matched === true) {
-				if (isset($this->_allowRank[$rule])) {
-					$rank = $this->_allowRank[$rule];
-				}
-				return true;
-			}
-		}
-		return false;
-	}
-
-	private function isRegexPattern($input)
-	{
-		if (strlen($input) > 2 && $input[0] === '#') {
-			for ($i = strlen($input) - 1; $i > 1; $i--) {
-				if ($input[$i] === $input[0]) {
-					return true;
-				}
-				if ($input[$i] !== 'i' && $input[$i] !== 'u') {
-					break;
-				}
-			}
-		}
-		return false;
-	}
-}