<?php
/**
 * Multi-purpose URL collection manager and parser
 *
 * @author hightman <hightman@twomice.net>
 * @link http://www.hightman.cn/
 * @copyright Copyright © 2008-2013 Twomice Studio
 */
- use hightman\http\ParseInterface;
- use hightman\http\Response;
- use hightman\http\Request;
/**
 * Contract for a store of URLs waiting to be (re)processed by the spider.
 */
interface UrlTable
{
    /**
     * Default interval, in seconds, that must elapse before the same URL
     * is handed out for processing again.
     */
    const DURATION = 3600;

    /**
     * @return int total number of URLs held in the table
     */
    public function getCount();

    /**
     * Fetch a single URL that is due for processing.
     *
     * @param int $duration minimum interval in seconds between two processings of the same URL
     * @return string the next pending URL, null when none is pending, false on error
     */
    public function getOne($duration = self::DURATION);

    /**
     * Fetch several URLs that are due for processing.
     *
     * @param int $limit maximum number of URLs to return
     * @param int $duration minimum interval in seconds between two processings of the same URL
     * @return array at most $limit URLs, an empty array when none is pending, false on error
     */
    public function getSome($limit = 5, $duration = self::DURATION);

    /**
     * Register a new URL.
     *
     * @param string $url URL to add
     * @param int $rank priority used when URLs are picked for processing
     * @return boolean true on success, false when the URL already exists or on any other failure
     */
    public function addUrl($url, $rank = 0);

    /**
     * Record the outcome of processing a URL.
     *
     * @param string $url URL to update
     * @param int $status status code obtained when the URL was processed
     * @return boolean true on success, false on failure
     */
    public function updateUrl($url, $status = 200);

    /**
     * Remove a URL from the table.
     *
     * @param string $url URL to delete
     * @return boolean true on success, false on failure
     */
    public function delUrl($url);
}
/**
 * URL table stored in MySQL through MySQLi. Expected table structure:
 *
 * CREATE TABLE `_urls` (
 *   `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
 *   `url` text NOT NULL,
 *   `rank` smallint(6) NOT NULL COMMENT 'process prior level',
 *   `status` smallint(6) NOT NULL COMMENT 'last http response status',
 *   `select_time` bigint(20) NOT NULL COMMENT 'last process time',
 *   `update_time` bigint(20) NOT NULL COMMENT 'last update time',
 *   PRIMARY KEY (`id`)
 * ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='url table for pspider';
 */
class UrlTableMySQL extends mysqli implements UrlTable
{
    private $_table = '_urls';
    private $_addCache = array();

    /**
     * @param string $name table name to use, `_urls` by default.
     *   The name is concatenated into SQL unquoted, so it must be a trusted identifier.
     */
    public function setTableName($name)
    {
        $this->_table = $name;
    }

    /**
     * @return int number of rows in the table, 0 when the query fails
     */
    public function getCount()
    {
        $res = $this->query('SELECT COUNT(*) AS count FROM ' . $this->_table);
        if ($res === false) {
            return 0;
        }
        $row = $res->fetch_assoc();
        $res->free();
        // MySQLi returns column values as strings; the interface promises an int.
        return (int) $row['count'];
    }

    /**
     * @param int $duration minimum interval in seconds between two processings of the same URL
     * @return string the next pending URL, null when none is pending, false on error
     */
    public function getOne($duration = self::DURATION)
    {
        $urls = $this->getSome(1, $duration);
        if (!is_array($urls)) {
            return false;
        }
        return count($urls) > 0 ? $urls[0] : null;
    }

    /**
     * Select the highest-scoring pending URLs and stamp them as selected.
     *
     * A URL is pending when its last selection is older than $duration, or when
     * it was selected more than 300 seconds ago without a later update.
     * The select + update pair is serialised across processes with an
     * advisory lock on a file in the system temp directory.
     *
     * @param int $limit maximum number of URLs to return
     * @param int $duration minimum interval in seconds between two processings of the same URL
     * @return array list of URLs (possibly empty), false when the SELECT fails
     */
    public function getSome($limit = 5, $duration = self::DURATION)
    {
        $now = time();
        // `rank` is a reserved word since MySQL 8.0.2 and must be quoted.
        $sql = 'SELECT id, url, ((' . $now . ' - select_time) * (`rank` + 1) / (status + 1)) AS score FROM ' . $this->_table . ' ';
        $sql .= 'WHERE select_time < ' . ($now - (int) $duration) . ' '; // expired
        $sql .= 'OR (select_time > update_time AND select_time < ' . ($now - 300) . ') '; // failed
        $sql .= 'ORDER BY score DESC LIMIT ' . intval($limit);

        // Best-effort lock: when the lock file cannot be opened we proceed unlocked.
        $fd = @fopen(sys_get_temp_dir() . DIRECTORY_SEPARATOR . __CLASS__ . '.lock', 'w');
        if ($fd !== false) {
            flock($fd, LOCK_EX);
        }

        $res = $this->query($sql);
        if ($res === false) {
            $ret = false;
        } else {
            $ret = $ids = array();
            while ($row = $res->fetch_assoc()) {
                $ids[] = $row['id'];
                $ret[] = $row['url'];
            }
            $res->free();
            if (count($ids) > 0) {
                $sql = 'UPDATE ' . $this->_table . ' SET select_time = ' . $now . ' ';
                $sql .= 'WHERE id IN (\'' . implode('\', \'', $ids) . '\')';
                $this->query($sql);
            }
        }

        if ($fd !== false) {
            // Always close the handle, even if the explicit unlock fails.
            flock($fd, LOCK_UN);
            fclose($fd);
        }
        return $ret;
    }

    /**
     * @param string $url URL to add
     * @param int $rank priority used when URLs are picked for processing
     * @return boolean true when a new row was inserted, false when the URL
     *   was already known (in the local cache or in the table) or the query failed
     */
    public function addUrl($url, $rank = 0)
    {
        $id = md5($url);
        if ($this->inAddCache($id)) {
            return false;
        }
        $url = $this->real_escape_string($url);
        // INSERT IGNORE turns a duplicate key into "0 rows affected" instead of an
        // error, so an existing URL yields false both in the classic return-value
        // mode and in the exception mode that is the mysqli default since PHP 8.1.
        // All NOT NULL columns are listed explicitly because the schema declares
        // no defaults and strict SQL mode would otherwise reject the row.
        $sql = 'INSERT IGNORE INTO ' . $this->_table . ' (id, url, `rank`, status, select_time, update_time) ';
        $sql .= 'VALUES (\'' . $id . '\', \'' . $url . '\', ' . intval($rank) . ', 0, 0, 0)';
        return $this->query($sql) && $this->affected_rows === 1;
    }

    /**
     * @param string $url URL to update
     * @param int $status status code obtained when the URL was processed
     * @return boolean result of the UPDATE query
     */
    public function updateUrl($url, $status = 200)
    {
        $now = time();
        $sql = 'UPDATE ' . $this->_table . ' SET status = ' . intval($status) . ', update_time = ' . $now . ' ';
        $sql .= 'WHERE id = \'' . md5($url) . '\'';
        return $this->query($sql);
    }

    /**
     * @param string $url URL to delete
     * @return boolean true only when exactly one row was removed
     */
    public function delUrl($url)
    {
        $sql = 'DELETE FROM ' . $this->_table . ' WHERE id = \'' . md5($url) . '\'';
        return $this->query($sql) && $this->affected_rows === 1;
    }

    /**
     * Run a query, pinging the server first on PHP versions where that is still supported.
     *
     * @param string $query SQL statement
     * @param int $mode result mode passed through to mysqli::query()
     * @return mixed whatever mysqli::query() returns
     */
    #[\ReturnTypeWillChange]
    public function query($query, $mode = MYSQLI_STORE_RESULT)
    {
        // mysqli::ping() has not reconnected since PHP 8.2 and is deprecated in
        // PHP 8.4, so it is only called on older runtimes.
        if (PHP_VERSION_ID < 80200) {
            $this->ping();
        }
        return parent::query($query, $mode);
    }

    /**
     * Self-check: verifies the connection and that a row can be inserted and deleted.
     *
     * @return boolean true when the check passes; otherwise an E_USER_ERROR is triggered
     */
    protected function test()
    {
        if ($this->connect_error) {
            return trigger_error($this->connect_error, E_USER_ERROR);
        }
        $url = 'http://' . uniqid() . '.com/';
        if (!$this->addUrl($url)) {
            return trigger_error($this->error, E_USER_ERROR);
        }
        $this->delUrl($url);
        return true;
    }

    /**
     * In-process memo of recently added URL ids, used to avoid hitting the
     * database with INSERTs for URLs this process has already submitted.
     *
     * @param string $id md5 hash of the URL
     * @return boolean true when the id was already cached, false when it was just recorded
     */
    private function inAddCache($id)
    {
        $now = time();
        $known = isset($this->_addCache[$id]);
        $this->_addCache[$id] = $now;
        if ($known) {
            return true;
        }
        if (count($this->_addCache) > 20000) {
            // Drop entries not touched during the last hour.
            $expire = $now - 3600;
            $fresh = array();
            foreach ($this->_addCache as $key => $stamp) {
                if ($stamp > $expire) {
                    $fresh[$key] = $stamp;
                }
            }
            if (count($fresh) > 20000) {
                // Nothing expired: keep only the most recently inserted half so the
                // full scan above is not repeated on every subsequent call.
                $fresh = array_slice($fresh, -10000, null, true);
            }
            $this->_addCache = $fresh;
        }
        return false;
    }
}
/**
 * Base parser that extracts links from fetched pages and feeds them to a UrlTable.
 *
 * URL filtering: every rule is either a case-insensitive substring or a
 * regular expression (which must start with #). A candidate URL is checked as follows:
 * 1. If external sites are followed, a URL whose domain matches any disallowDomain rule is rejected.
 * 2. If external sites are not followed, an external domain must match at least one allowDomain rule.
 * 3. A URL matching any disallow rule is rejected.
 * 4. If allow rules exist, the URL must match at least one of them.
 * 5. A URL whose path extension is listed in disallowExt is rejected.
 * 6. Otherwise the URL is accepted.
 */
class UrlParser implements ParseInterface
{
    private $_timeBegin, $_numAdd, $_numUpdate, $_numFilter;
    private $_followExternal;
    private $_disallowDomain, $_allowDomain, $_disallow, $_allow;
    private $_allowRank;
    private $_nofollow;
    // Keys must be lower case: extensions are lower-cased before lookup.
    private $_disallowExt = array(
        '.tar' => true, '.gz' => true, '.tgz' => true, '.zip' => true, '.z' => true, '.7z' => true,
        '.rpm' => true, '.deb' => true, '.ps' => true, '.dvi' => true, '.pdf' => true, '.smi' => true,
        '.png' => true, '.jpg' => true, '.jpeg' => true, '.bmp' => true, '.tiff' => true, '.gif' => true,
        '.mov' => true, '.avi' => true, '.mpeg' => true, '.mpg' => true, '.mp3' => true, '.qt' => true,
        '.wav' => true, '.ram' => true, '.rm' => true, '.rmvb' => true, '.jar' => true, '.java' => true,
        '.class' => true, '.diff' => true, '.doc' => true, '.docx' => true, '.xls' => true, '.ppt' => true,
        '.mdb' => true, '.rtf' => true, '.exe' => true, '.pps' => true, '.so' => true, '.psd' => true,
        '.css' => true, '.js' => true, '.ico' => true, '.dll' => true, '.bz2' => true, '.rar' => true,
    );
    private $_ut;

    /**
     * @param UrlTable $ut table that receives discovered URLs and status updates
     */
    public function __construct(UrlTable $ut)
    {
        $this->_ut = $ut;
        $this->_timeBegin = time();
        $this->_numAdd = $this->_numUpdate = $this->_numFilter = 0;
        // apply default filters for extending
        $this->resetFilter();
        $this->defaultFilter();
    }

    public function __destruct()
    {
        $this->_ut = null;
    }

    /**
     * @return UrlTable
     */
    public function getUrlTable()
    {
        return $this->_ut;
    }

    /**
     * Hook for subclasses: install default URL filter rules here.
     */
    public function defaultFilter()
    {
    }

    /**
     * Reset every filter rule except the extension list.
     */
    public function resetFilter()
    {
        $this->_followExternal = false;
        $this->_disallowDomain = array();
        $this->_allowDomain = array();
        $this->_disallow = array();
        $this->_allow = array();
        $this->_allowRank = array();
        $this->_nofollow = array();
    }

    /**
     * @param boolean $on whether URLs on external sites are processed; only a strict true enables it
     */
    public function followExternal($on = true)
    {
        $this->_followExternal = ($on === true);
    }

    /**
     * @param string $rule domain rule to reject (substring or # regex); null clears the list
     */
    public function disallowDomain($rule)
    {
        $this->saveMatchRule($this->_disallowDomain, $rule);
    }

    /**
     * @param string $rule domain rule to accept (substring or # regex); null clears the list
     */
    public function allowDomain($rule)
    {
        $this->saveMatchRule($this->_allowDomain, $rule);
    }

    /**
     * @param string $rule URL rule to reject (substring or # regex); null clears the list
     */
    public function disallow($rule)
    {
        $this->saveMatchRule($this->_disallow, $rule);
    }

    /**
     * @param string $rule URL rule to accept (substring or # regex); null clears the list
     * @param int $rank rank assigned to URLs matching this rule
     * @param boolean $follow whether links inside pages matching this rule are extracted
     */
    public function allow($rule, $rank = null, $follow = true)
    {
        $this->saveMatchRule($this->_allow, $rule);
        if ($rank !== null) {
            $this->_allowRank[$rule] = intval($rank);
        }
        if (!$follow) {
            $this->saveMatchRule($this->_nofollow, $rule);
        }
    }

    /**
     * @param string $name extension to reject; must start with a dot, otherwise it is ignored
     */
    public function disallowExt($name)
    {
        // Same validation as allowExt(): an entry without the leading dot could never match.
        if (substr($name, 0, 1) === '.') {
            $this->_disallowExt[strtolower($name)] = true;
        }
    }

    /**
     * @param string $name extension to remove from the reject list; must start with a dot
     */
    public function allowExt($name)
    {
        if (substr($name, 0, 1) === '.') {
            unset($this->_disallowExt[strtolower($name)]);
        }
    }

    /**
     * Build a one-line report: elapsed time and URL counters.
     *
     * @param boolean $output when strictly true the report is echoed, otherwise it is returned
     * @return string|null the report, or nothing when it was echoed
     */
    public function stat($output = false)
    {
        // time
        $elapsed = time() - $this->_timeBegin;
        $string = date('m-d H:i:s') . ' - Time cost: ';
        // >= so that exactly one hour / one minute is reported in the larger unit
        if ($elapsed >= 3600) {
            $string .= intval($elapsed / 3600) . ' hours ';
            $elapsed %= 3600;
        }
        if ($elapsed >= 60) {
            $string .= intval($elapsed / 60) . ' mins ';
            $elapsed %= 60;
        }
        $string .= $elapsed . ' secs, ';
        // stats
        $string .= sprintf('URLs total: %d, Add: %d, Update: %d, Filtered: %d', $this->_ut->getCount(), $this->_numAdd, $this->_numUpdate, $this->_numFilter);
        if ($output !== true) {
            return $string;
        }
        echo $string . "\n";
    }

    /**
     * ParseInterface implementation: records the response status of the
     * requested URL, then harvests links from a 200 body or from a redirect.
     *
     * @param Response $res
     * @param Request $req
     * @param mixed $key
     */
    public function parse(Response $res, Request $req, $key)
    {
        // update url
        $rawUrl = $req->getRawUrl();
        if ($this->_ut->updateUrl($rawUrl, $res->status)) {
            $this->_numUpdate++;
        }
        // parse url from body
        if ($res->status === 200 && $this->isFollowUrl($rawUrl)) {
            // get baseUrl
            $baseUrl = $req->getUrl();
            if (preg_match('/<base\s+href=[\'"]?(.*?)[\s\'">]/i', $res->body, $match)) {
                $baseUrl = $this->resetUrl($match[1], $baseUrl);
            }
            // Attribute values are HTML-encoded (e.g. "&amp;" between query
            // parameters), so they are decoded before being resolved.
            // href="xxx", href='xxx'
            if (preg_match_all('/href=([\'"])(.*?)\1/i', $res->body, $matches) > 0) {
                foreach ($matches[2] as $url) {
                    $this->processUrl(html_entity_decode($url, ENT_QUOTES, 'UTF-8'), $baseUrl, $res->url);
                }
            }
            // href=xxx
            if (preg_match_all('/href=(?![\'"])(.*?)[\s>]/i', $res->body, $matches) > 0) {
                foreach ($matches[1] as $url) {
                    $this->processUrl(html_entity_decode($url, ENT_QUOTES, 'UTF-8'), $baseUrl, $res->url);
                }
            }
        } elseif ($res->status === 301 || $res->status === 302) {
            $url = $this->resetUrl($res->getHeader('location'), $req->getUrl());
            $res->setHeader('location', $url); // overwrite formatted url
            // save url for permanent redirection
            if ($res->status === 301) {
                $this->processUrl($url, $res->url);
            }
        }
    }

    /**
     * Apply the filter rules to an absolute URL.
     *
     * @param string $url absolute URL to check
     * @param string $rawUrl URL of the page the link came from, used to decide whether $url is external
     * @param string &$rank receives the rank of the matching allow rule, if that rule has one
     * @return boolean true when the URL must be excluded
     */
    public function isDisallow($url, $rawUrl = null, &$rank = null)
    {
        $url = (string) $url;
        // get domain
        $schemeEnd = strpos($url, '://');
        if ($schemeEnd === false) {
            return true;
        }
        $hostStart = $schemeEnd + 3;
        $pathStart = strpos($url, '/', $hostStart);
        $domain = $pathStart === false
            ? (string) substr($url, $hostStart)
            : (string) substr($url, $hostStart, $pathStart - $hostStart);
        if ($domain === '') {
            // no host at all: nothing that could be fetched
            return true;
        }
        // external domain (host names are case-insensitive)
        if ($rawUrl !== null && stripos((string) $rawUrl, $domain) === false) {
            // disallow domain
            if ($this->_followExternal && $this->isMatchRule($this->_disallowDomain, $domain)) {
                return true;
            }
            // allow domain
            if (!$this->_followExternal
                && (count($this->_allowDomain) === 0 || !$this->isMatchRule($this->_allowDomain, $domain))) {
                return true;
            }
        }
        // disallow
        if ($this->isMatchRule($this->_disallow, $url)) {
            return true;
        }
        // allow
        if (count($this->_allow) > 0 && !$this->isMatchRule($this->_allow, $url, $rank)) {
            return true;
        }
        // disallowExt: look at the path only, i.e. between the first "/" after
        // the host and the start of the query string
        $queryPos = strpos($url, '?');
        $pathEnd = $queryPos === false ? strlen($url) : $queryPos;
        if ($pathStart !== false && $pathStart < $pathEnd) {
            $ext = strrchr(substr($url, $pathStart, $pathEnd - $pathStart), '.');
            if ($ext !== false && isset($this->_disallowExt[strtolower($ext)])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Turn a (possibly relative) link into a normalised absolute URL.
     *
     * Handles scheme-relative ("//host/x"), absolute-path ("/x"), query-only
     * ("?x=1") and relative-path references, drops the fragment, guarantees a
     * path ("/" for a bare host) and removes "." / ".." segments from the path.
     *
     * @param string $url link as found in the document or header
     * @param string $baseUrl URL the link is relative to; null means "assume http://"
     * @return string normalised absolute URL
     */
    public function resetUrl($url, $baseUrl = null)
    {
        $url = trim((string) $url);
        // leading part
        if (!strncasecmp($url, 'http://http://', 14)) {
            $url = substr($url, 7);
        }
        if (strncasecmp($url, 'http://', 7) && strncasecmp($url, 'https://', 8)) {
            if (substr($url, 0, 2) === '//') {
                // scheme-relative reference: inherit the scheme of the base
                $secure = $baseUrl !== null && !strncasecmp((string) $baseUrl, 'https://', 8);
                $url = ($secure ? 'https:' : 'http:') . $url;
            } elseif ($baseUrl === null) {
                $url = 'http://' . $url;
            } else {
                // the query / fragment of the base never takes part in resolution
                $base = (string) $baseUrl;
                $base = (string) substr($base, 0, strcspn($base, '?#'));
                $pathPos = $this->pathOffset($base);
                $lead = substr($url, 0, 1);
                if ($lead === '/') {
                    $url = ($pathPos === false ? $base : substr($base, 0, $pathPos)) . $url;
                } elseif ($pathPos === false) {
                    $url = $base . '/' . $url;
                } elseif ($lead === '?') {
                    // query-only reference keeps the full base path
                    $url = $base . $url;
                } else {
                    $url = substr($base, 0, strrpos($base, '/') + 1) . $url;
                }
            }
        }
        // drop the fragment first so it cannot be mistaken for part of the host
        $pos = strpos($url, '#');
        if ($pos !== false) {
            $url = (string) substr($url, 0, $pos);
        }
        // set the query aside: only the path is normalised
        $query = '';
        $pos = strpos($url, '?');
        if ($pos !== false) {
            $query = substr($url, $pos);
            $url = (string) substr($url, 0, $pos);
        }
        // a bare host gets "/" as its path
        $pathPos = $this->pathOffset($url);
        if ($pathPos === false) {
            return $url . '/' . $query;
        }
        // resolve "./" and "../" segments
        $path = substr($url, $pathPos);
        if (strpos($path, '/./') !== false || strpos($path, '/../') !== false) {
            $path = $this->removeDotSegments($path);
        }
        return substr($url, 0, $pathPos) . $path . $query;
    }

    /**
     * @param string $url
     * @return boolean whether links found in the content of this URL are processed
     */
    protected function isFollowUrl($url)
    {
        return !$this->isMatchRule($this->_nofollow, $url);
    }

    /**
     * Normalise, filter and store one extracted link.
     *
     * @param string $url link as extracted
     * @param string $baseUrl URL used to resolve relative links
     * @param string $rawUrl URL of the originating page for the external-site check; defaults to $baseUrl
     * @return mixed 'ADD' when stored, 'FILTER' when rejected by the rules, 'SKIP' otherwise
     */
    protected function processUrl($url, $baseUrl, $rawUrl = null)
    {
        $url = trim((string) $url);
        if (substr($url, 0, 1) === '#') {
            return 'SKIP';
        }
        // Anything carrying a scheme other than http(s) (javascript:, mailto:,
        // tel:, data:, ftp:, ...) is not a page this spider can fetch.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)
            && strncasecmp($url, 'http://', 7) && strncasecmp($url, 'https://', 8)) {
            return 'SKIP';
        }
        $url = $this->resetUrl($url, $baseUrl);
        $rank = 0;
        if ($this->isDisallow($url, $rawUrl === null ? $baseUrl : $rawUrl, $rank)) {
            $this->_numFilter++;
            return 'FILTER';
        }
        if ($this->_ut->addUrl($url, $rank)) {
            $this->_numAdd++;
            return 'ADD';
        }
        return 'SKIP';
    }

    /**
     * Store a rule in a rule list. Substring rules go to the front (they are
     * checked first), regex rules go to the back tagged with a 0xff byte.
     *
     * @param array &$array rule list to modify
     * @param string $rule rule to add; null empties the list, an empty string is ignored
     */
    private function saveMatchRule(&$array, $rule)
    {
        if ($rule === null) {
            $array = array();
            return;
        }
        $rule = (string) $rule;
        if ($rule === '') {
            // an empty substring would match every input
            return;
        }
        if ($this->isRegexPattern($rule)) {
            array_push($array, "\xff" . $rule);
        } else {
            array_unshift($array, $rule);
        }
    }

    /**
     * @param array $rules rule list built by saveMatchRule()
     * @param string $input text to test
     * @param mixed &$rank receives the rank registered for the first matching rule, if any
     * @return boolean true when any rule matches
     */
    private function isMatchRule($rules, $input, &$rank = null)
    {
        $input = (string) $input;
        foreach ($rules as $rule) {
            if ($rule[0] === "\xff") {
                $rule = substr($rule, 1);
                $matched = preg_match($rule, $input) > 0;
            } else {
                $matched = stripos($input, $rule) !== false;
            }
            if ($matched === true) {
                if (isset($this->_allowRank[$rule])) {
                    $rank = $this->_allowRank[$rule];
                }
                return true;
            }
        }
        return false;
    }

    /**
     * A rule is a regex when it is delimited by "#" and followed only by
     * the modifiers "i" and/or "u".
     *
     * @param string $input
     * @return boolean
     */
    private function isRegexPattern($input)
    {
        if (strlen($input) > 2 && $input[0] === '#') {
            for ($i = strlen($input) - 1; $i > 1; $i--) {
                if ($input[$i] === $input[0]) {
                    return true;
                }
                if ($input[$i] !== 'i' && $input[$i] !== 'u') {
                    break;
                }
            }
        }
        return false;
    }

    /**
     * Position of the first "/" after the authority part of a URL.
     *
     * @param string $url
     * @return int|false offset of the path, false when the URL has no path
     */
    private function pathOffset($url)
    {
        $sep = strpos($url, '://');
        return strpos($url, '/', $sep === false ? 0 : $sep + 3);
    }

    /**
     * Remove "." and ".." segments and collapse empty segments in a path.
     * A trailing empty segment (trailing slash) is preserved; ".." at the
     * root is dropped.
     *
     * @param string $path path starting with "/"
     * @return string cleaned path starting with "/"
     */
    private function removeDotSegments($path)
    {
        $segments = explode('/', (string) substr($path, 1));
        $last = count($segments) - 1;
        $kept = array();
        foreach ($segments as $i => $segment) {
            if ($segment === '.' || ($segment === '' && $i < $last)) {
                continue;
            }
            if ($segment === '..') {
                array_pop($kept);
                continue;
            }
            $kept[] = $segment;
        }
        return '/' . implode('/', $kept);
    }
}
|