UrlTable.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. <?php
  2. /**
  3. * 多功能 URL 采集管理及解析器
  4. *
  5. * @author hightman <hightman@twomice.net>
  6. * @link http://www.hightman.cn/
  7. * @copyright Copyright &copy; 2008-2013 Twomice Studio
  8. */
  9. use hightman\http\ParseInterface;
  10. use hightman\http\Response;
  11. use hightman\http\Request;
  /**
   * Contract for a store of URLs that are waiting to be crawled.
   */
  interface UrlTable
  {
      /**
       * Default interval, in seconds, between two consecutive processings of the same URL.
       */
      const DURATION = 3600;

      /**
       * @return int total number of URLs in the list
       */
      public function getCount();

      /**
       * @param int $duration interval, in seconds, between two processings of the same URL
       * @return string one URL waiting to be processed; null when there is none, false on error
       */
      public function getOne($duration = self::DURATION);

      /**
       * @param int $limit maximum number of URLs to return
       * @param int $duration interval, in seconds, between two processings of the same URL
       * @return array at most $limit URLs; an empty array when there is none, false on error
       */
      public function getSome($limit = 5, $duration = self::DURATION);

      /**
       * @param string $url the URL to add
       * @param int $rank priority with which the URL is picked for processing
       * @return boolean true on success; false when the URL already exists or on any other failure
       */
      public function addUrl($url, $rank = 0);

      /**
       * @param string $url the URL to update
       * @param int $status status code obtained when the URL was processed
       * @return boolean true on success, false on failure
       */
      public function updateUrl($url, $status = 200);

      /**
       * @param string $url the URL to delete
       * @return boolean true on success, false on failure
       */
      public function delUrl($url);
  }
  /**
   * MySQLi based URL list. The table is expected to look like this:
   *
   * CREATE TABLE `_urls` (
   *   `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
   *   `url` text NOT NULL,
   *   `rank` smallint(6) NOT NULL COMMENT 'process prior level',
   *   `status` smallint(6) NOT NULL COMMENT 'last http response status',
   *   `select_time` bigint(20) NOT NULL COMMENT 'last process time',
   *   `update_time` bigint(20) NOT NULL COMMENT 'last update time',
   *   PRIMARY KEY (`id`)
   * ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='url table for pspider';
   */
  class UrlTableMySQL extends mysqli implements UrlTable
  {
      private $_table = '_urls';
      private $_addCache = array();

      /**
       * @param string $name database table name, `_urls` by default.
       *  The name is concatenated into SQL as-is, so it must come from trusted configuration.
       */
      public function setTableName($name)
      {
          $this->_table = $name;
      }

      /**
       * @return int total number of rows in the URL table, 0 when the query fails
       */
      public function getCount()
      {
          $res = $this->query('SELECT COUNT(*) AS count FROM ' . $this->_table);
          if ($res === false) {
              return 0;
          }
          $row = $res->fetch_assoc();
          $res->free();
          // mysqli hands the counter back as a string; the interface promises an int
          return isset($row['count']) ? (int) $row['count'] : 0;
      }

      /**
       * @param int $duration interval, in seconds, between two processings of the same URL
       * @return string one URL to process; null when there is none, false on error
       */
      public function getOne($duration = self::DURATION)
      {
          $urls = $this->getSome(1, $duration);
          if (!is_array($urls)) {
              return false;
          }
          return count($urls) > 0 ? $urls[0] : null;
      }

      /**
       * Picks the best scored URLs that are due and stamps them as selected.
       *
       * A URL is due when it was last selected more than $duration seconds ago, or when it
       * was selected more than 300 seconds ago without having been updated since.
       *
       * @param int $limit maximum number of URLs to return
       * @param int $duration interval, in seconds, between two processings of the same URL
       * @return array list of URLs (possibly empty), false when the SELECT fails
       */
      public function getSome($limit = 5, $duration = self::DURATION)
      {
          $now = time();
          // `rank` must be quoted: RANK is a reserved word as of MySQL 8.0.2
          $sql = 'SELECT id, url, ((' . $now . ' - select_time) * (`rank` + 1) / (status + 1)) AS score FROM ' . $this->_table . ' ';
          $sql .= 'WHERE select_time < ' . ($now - intval($duration)) . ' '; // expired
          $sql .= 'OR (select_time > update_time AND select_time < ' . ($now - 300) . ') '; // failed
          $sql .= 'ORDER BY score DESC LIMIT ' . intval($limit);

          // Best-effort exclusive file lock around the SELECT + UPDATE pair;
          // when the lock file cannot be opened the queries simply run unlocked.
          $fd = @fopen(sys_get_temp_dir() . DIRECTORY_SEPARATOR . __CLASS__ . '.lock', 'w');
          if ($fd !== false) {
              flock($fd, LOCK_EX);
          }

          $res = $this->query($sql);
          if ($res === false) {
              $ret = false;
          } else {
              $ret = $ids = array();
              while ($row = $res->fetch_assoc()) {
                  $ids[] = $row['id'];
                  $ret[] = $row['url'];
              }
              $res->free();
              if (count($ids) > 0) {
                  // ids are values read back from the id column, documented as md5 hashes
                  $sql = 'UPDATE ' . $this->_table . ' SET select_time = ' . $now . ' ';
                  $sql .= 'WHERE id IN (\'' . implode('\', \'', $ids) . '\')';
                  $this->query($sql);
              }
          }

          if ($fd !== false) {
              // always close the handle, even when the explicit unlock fails
              flock($fd, LOCK_UN);
              fclose($fd);
          }
          return $ret;
      }

      /**
       * @param string $url the URL to add
       * @param int $rank priority with which the URL is picked for processing
       * @return boolean true when a row was inserted; false when the URL was already
       *  submitted through this instance, already exists in the table, or the insert failed
       */
      public function addUrl($url, $rank = 0)
      {
          $id = md5($url);
          if ($this->inAddCache($id)) {
              return false;
          }
          $url = $this->real_escape_string($url);
          // Every NOT NULL column gets an explicit value so the statement also works
          // when the server runs in strict SQL mode; `rank` is quoted (reserved word).
          $sql = 'INSERT INTO ' . $this->_table . ' (id, url, `rank`, status, select_time, update_time) ';
          $sql .= 'VALUES (\'' . $id . '\', \'' . $url . '\', ' . intval($rank) . ', 0, 0, 0)';
          return $this->query($sql) === true;
      }

      /**
       * @param string $url the URL to update
       * @param int $status status code obtained when the URL was processed
       * @return boolean true when the UPDATE statement ran, false on failure
       */
      public function updateUrl($url, $status = 200)
      {
          $now = time();
          $sql = 'UPDATE ' . $this->_table . ' SET status = ' . intval($status) . ', update_time = ' . $now . ' ';
          $sql .= 'WHERE id = \'' . md5($url) . '\'';
          return $this->query($sql);
      }

      /**
       * @param string $url the URL to delete
       * @return boolean true only when exactly one row was removed
       */
      public function delUrl($url)
      {
          $sql = 'DELETE FROM ' . $this->_table . ' WHERE id = \'' . md5($url) . '\'';
          return $this->query($sql) && $this->affected_rows === 1;
      }

      /**
       * Pings the connection, then runs the query.
       *
       * When mysqli is configured to throw (MYSQLI_REPORT_STRICT), the exception is turned
       * back into a false return value, which is what every caller in this class and the
       * UrlTable contract expect (e.g. a duplicate key in addUrl()).
       *
       * @param string $query
       * @param int $mode
       * @return mysqli_result|boolean
       */
      #[\ReturnTypeWillChange]
      public function query($query, $mode = MYSQLI_STORE_RESULT)
      {
          try {
              $this->ping();
              return parent::query($query, $mode);
          } catch (mysqli_sql_exception $e) {
              return false;
          }
      }

      /**
       * Self check: verifies the connection, then inserts and deletes a throw-away URL.
       * Raises E_USER_ERROR with the connection or query error when a step fails.
       *
       * @return boolean true when the table is usable
       */
      protected function test()
      {
          if ($this->connect_error) {
              return trigger_error($this->connect_error, E_USER_ERROR);
          }
          $url = 'http://' . uniqid() . '.com/';
          if (!$this->addUrl($url)) {
              return trigger_error($this->error, E_USER_ERROR);
          }
          $this->delUrl($url);
          return true;
      }

      /**
       * Remembers URL ids submitted through addUrl() so that repeated submissions can be
       * rejected without querying the database.
       *
       * @param string $id md5 hash of the URL
       * @return boolean true when the id was already in the cache
       */
      private function inAddCache($id)
      {
          $now = time();
          $known = isset($this->_addCache[$id]);
          // refresh the timestamp in both cases
          $this->_addCache[$id] = $now;
          if ($known) {
              return true;
          }
          if (count($this->_addCache) > 20000) {
              // cache grew too large: keep only the ids seen during the last hour
              $expire = $now - 3600;
              $cache = array();
              foreach ($this->_addCache as $key => $value) {
                  if ($value > $expire) {
                      $cache[$key] = $value;
                  }
              }
              $this->_addCache = $cache;
          }
          return false;
      }
  }
  /**
   * Base parser with URL extraction.
   *
   * URL filter rules. A rule is either a plain substring (case-insensitive) or a regular
   * expression, which must be delimited by '#'. A URL is checked in this order:
   * 1. external domains followed by default: excluded when a disallowDomain rule matches
   * 2. external domains not followed by default: excluded unless an allowDomain rule matches
   * 3. disallow rules: excluded as soon as one rule matches
   * 4. allow rules: when at least one rule is defined, the URL must match one of them
   * 5. disallowExt: excluded when the path ends with a disallowed extension
   * 6. otherwise the URL is accepted
   */
  class UrlParser implements ParseInterface
  {
      private $_timeBegin, $_numAdd, $_numUpdate, $_numFilter;
      private $_followExternal;
      private $_disallowDomain, $_allowDomain, $_disallow, $_allow;
      private $_allowRank;
      private $_nofollow;
      // Keys are lower case because isDisallow() lower-cases the extension before the lookup.
      private $_disallowExt = array(
          '.tar' => true, '.gz' => true, '.tgz' => true, '.zip' => true, '.z' => true, '.7z' => true,
          '.rpm' => true, '.deb' => true, '.ps' => true, '.dvi' => true, '.pdf' => true, '.smi' => true,
          '.png' => true, '.jpg' => true, '.jpeg' => true, '.bmp' => true, '.tiff' => true, '.gif' => true,
          '.mov' => true, '.avi' => true, '.mpeg' => true, '.mpg' => true, '.mp3' => true, '.qt' => true,
          '.wav' => true, '.ram' => true, '.rm' => true, '.rmvb' => true, '.jar' => true, '.java' => true,
          '.class' => true, '.diff' => true, '.doc' => true, '.docx' => true, '.xls' => true, '.ppt' => true,
          '.mdb' => true, '.rtf' => true, '.exe' => true, '.pps' => true, '.so' => true, '.psd' => true,
          '.css' => true, '.js' => true, '.ico' => true, '.dll' => true, '.bz2' => true, '.rar' => true,
      );
      private $_ut;

      /**
       * @param UrlTable $ut URL list that receives the extracted URLs
       */
      public function __construct(UrlTable $ut)
      {
          $this->_ut = $ut;
          $this->_timeBegin = time();
          $this->_numAdd = $this->_numUpdate = $this->_numFilter = 0;
          // apply default filters for extending
          $this->resetFilter();
          $this->defaultFilter();
      }

      public function __destruct()
      {
          $this->_ut = null;
      }

      /**
       * @return UrlTable
       */
      public function getUrlTable()
      {
          return $this->_ut;
      }

      /**
       * Hook for subclasses: install the default URL filter rules here.
       */
      public function defaultFilter()
      {
      }

      /**
       * Resets every filter rule except the extension rules.
       */
      public function resetFilter()
      {
          $this->_followExternal = false;
          $this->_disallowDomain = array();
          $this->_allowDomain = array();
          $this->_disallow = array();
          $this->_allow = array();
          $this->_allowRank = array();
          $this->_nofollow = array();
      }

      /**
       * @param boolean $on whether URLs of external sites are processed; only a strict
       *  true enables it (the initial state after resetFilter() is false)
       */
      public function followExternal($on = true)
      {
          $this->_followExternal = ($on === true);
      }

      /**
       * @param string $rule rule for domains that must not be followed (substring or regex);
       *  null clears the rules
       */
      public function disallowDomain($rule)
      {
          $this->saveMatchRule($this->_disallowDomain, $rule);
      }

      /**
       * @param string $rule rule for domains that may be followed (substring or regex);
       *  null clears the rules
       */
      public function allowDomain($rule)
      {
          $this->saveMatchRule($this->_allowDomain, $rule);
      }

      /**
       * @param string $rule rule for URLs that must be excluded (substring or regex);
       *  null clears the rules
       */
      public function disallow($rule)
      {
          $this->saveMatchRule($this->_disallow, $rule);
      }

      /**
       * @param string $rule rule for URLs that are accepted (substring or regex)
       * @param int $rank rank given to the URLs matching this rule
       * @param boolean $follow whether links found in pages matching this rule are extracted
       */
      public function allow($rule, $rank = null, $follow = true)
      {
          $this->saveMatchRule($this->_allow, $rule);
          if ($rank !== null) {
              $this->_allowRank[$rule] = intval($rank);
          }
          if (!$follow) {
              $this->saveMatchRule($this->_nofollow, $rule);
          }
      }

      /**
       * @param string $name URL extension to exclude, must start with a dot
       */
      public function disallowExt($name)
      {
          $this->_disallowExt[strtolower($name)] = true;
      }

      /**
       * @param string $name URL extension to accept again, must start with a dot
       */
      public function allowExt($name)
      {
          if (substr($name, 0, 1) === '.') {
              // unset() on a missing key is a no-op, no isset() guard needed
              unset($this->_disallowExt[strtolower($name)]);
          }
      }

      /**
       * Builds the statistics line (elapsed time and URL counters).
       *
       * @param boolean $output true to echo the line, anything else to return it
       * @return string|null the line when $output is not true
       */
      public function stat($output = false)
      {
          // time
          $time = time() - $this->_timeBegin;
          $string = date('m-d H:i:s') . ' - Time cost: ';
          // ">=" so that exactly one hour / one minute is not reported as "60 mins" / "60 secs"
          if ($time >= 3600) {
              $string .= intval($time / 3600) . ' hours ';
              $time %= 3600;
          }
          if ($time >= 60) {
              $string .= intval($time / 60) . ' mins ';
              $time %= 60;
          }
          $string .= $time . ' secs, ';
          // stats
          $string .= sprintf('URLs total: %d, Add: %d, Update: %d, Filtered: %d', $this->_ut->getCount(), $this->_numAdd, $this->_numUpdate, $this->_numFilter);
          if ($output !== true) {
              return $string;
          }
          echo $string . "\n";
      }

      /**
       * Parser callback: records the status of the fetched URL, then extracts the links
       * of a 200 response or the target of a 301 redirection.
       *
       * @param Response $res
       * @param Request $req
       * @param mixed $key
       */
      public function parse(Response $res, Request $req, $key)
      {
          // update url
          $rawUrl = $req->getRawUrl();
          if ($this->_ut->updateUrl($rawUrl, $res->status)) {
              $this->_numUpdate++;
          }
          // parse url from body
          if ($res->status === 200 && $this->isFollowUrl($rawUrl)) {
              // a <base href> in the document overrides the request URL as base
              $baseUrl = $req->getUrl();
              if (preg_match('/<base\s+href=[\'"]?(.*?)[\s\'">]/i', $res->body, $match)) {
                  $baseUrl = $this->resetUrl($match[1], $baseUrl);
              }
              // href="xxx", href='xxx'
              if (preg_match_all('/href=([\'"])(.*?)\1/i', $res->body, $matches) > 0) {
                  foreach ($matches[2] as $url) {
                      $this->processUrl($url, $baseUrl, $res->url);
                  }
              }
              // href=xxx
              if (preg_match_all('/href=(?![\'"])(.*?)[\s>]/i', $res->body, $matches) > 0) {
                  foreach ($matches[1] as $url) {
                      $this->processUrl($url, $baseUrl, $res->url);
                  }
              }
          } elseif ($res->status === 301 || $res->status === 302) {
              $url = $this->resetUrl($res->getHeader('location'), $req->getUrl());
              $res->setHeader('location', $url); // overwrite formated url
              // save url for permanent redirection
              if ($res->status === 301) {
                  $this->processUrl($url, $res->url);
              }
          }
      }

      /**
       * @param string $url absolute URL to check
       * @param string $rawUrl URL of the page the link came from, used to tell whether
       *  $url is external; null skips the domain checks
       * @param string &$rank receives the rank of the matching allow rule, if it has one
       * @return boolean true when the URL must be excluded
       */
      public function isDisallow($url, $rawUrl = null, &$rank = null)
      {
          $url = (string) $url;
          // get domain
          $schemeEnd = strpos($url, '://');
          if ($schemeEnd === false) {
              return true;
          }
          $hostStart = $schemeEnd + 3;
          $pathStart = strpos($url, '/', $hostStart);
          $domain = $pathStart === false
              ? (string) substr($url, $hostStart)
              : substr($url, $hostStart, $pathStart - $hostStart);
          if ($domain === '') {
              // no host at all: nothing that could be fetched
              return true;
          }
          // external domain: the host does not appear in the originating URL
          if ($rawUrl !== null && strpos((string) $rawUrl, $domain) === false) {
              // disallow domain
              if ($this->_followExternal && $this->isMatchRule($this->_disallowDomain, $domain)) {
                  return true;
              }
              // allow domain
              if (!$this->_followExternal
                  && (count($this->_allowDomain) === 0 || !$this->isMatchRule($this->_allowDomain, $domain))) {
                  return true;
              }
          }
          // disallow
          if ($this->isMatchRule($this->_disallow, $url)) {
              return true;
          }
          // allow
          if (count($this->_allow) > 0 && !$this->isMatchRule($this->_allow, $url, $rank)) {
              return true;
          }
          // disallowExt: extension of the path, query string excluded
          $queryStart = strpos($url, '?');
          $pathEnd = $queryStart === false ? strlen($url) : $queryStart;
          if ($pathStart !== false && $pathStart < $pathEnd) {
              $ext = strrchr(substr($url, $pathStart, $pathEnd - $pathStart), '.');
              if ($ext !== false && isset($this->_disallowExt[strtolower($ext)])) {
                  return true;
              }
          }
          return false;
      }

      /**
       * Turns a (possibly relative) link into a normalized absolute URL: resolves it
       * against $baseUrl, makes sure the host is followed by '/', drops the fragment and
       * removes '.' / '..' path segments.
       *
       * @param string $url
       * @param string $baseUrl URL the link is relative to; without it 'http://' is prepended
       * @return string the normalized URL
       */
      public function resetUrl($url, $baseUrl = null)
      {
          $url = (string) $url;
          // accidentally doubled scheme
          if (strncasecmp($url, 'http://http://', 14) === 0) {
              $url = substr($url, 7);
          }
          if (strncasecmp($url, 'http://', 7) !== 0 && strncasecmp($url, 'https://', 8) !== 0) {
              // the query string and fragment of the base play no part in the resolution
              $base = (string) $baseUrl;
              $base = (string) substr($base, 0, strcspn($base, '?#'));
              if (strncmp($url, '//', 2) === 0) {
                  // protocol-relative link: only the scheme comes from the base
                  $url = (strncasecmp($base, 'https://', 8) === 0 ? 'https:' : 'http:') . $url;
              } elseif ($baseUrl === null) {
                  $url = 'http://' . $url;
              } else {
                  $rootEnd = $this->findPathStart($base);
                  if ($rootEnd === false) {
                      // base is a bare host
                      $rootEnd = strlen($base);
                      $base .= '/';
                  }
                  $first = substr($url, 0, 1);
                  if ($first === '/') {
                      // root-relative: keep scheme and host only
                      $url = substr($base, 0, $rootEnd) . $url;
                  } elseif ($first === '?') {
                      // query-only: keep the whole base path
                      $url = $base . $url;
                  } else {
                      // relative to the directory of the base
                      $url = substr($base, 0, strrpos($base, '/') + 1) . $url;
                  }
              }
          }
          // drop the anchor first, so that "http://host#a" still gets its root slash below
          if (($pos = strpos($url, '#')) !== false) {
              $url = (string) substr($url, 0, $pos);
          }
          // top level URLs end with '/', inserted before the query string if there is one
          $pathStart = $this->findPathStart($url);
          $queryStart = strpos($url, '?');
          if ($pathStart === false || ($queryStart !== false && $queryStart < $pathStart)) {
              if ($queryStart === false) {
                  $pathStart = strlen($url);
                  $url .= '/';
              } else {
                  $url = substr($url, 0, $queryStart) . '/' . substr($url, $queryStart);
                  $pathStart = $queryStart;
                  $queryStart++;
              }
          }
          // resolve './' and '../' inside the path only; the query string is left untouched
          if ($queryStart === false) {
              $query = '';
              $path = substr($url, $pathStart);
          } else {
              $query = substr($url, $queryStart);
              $path = substr($url, $pathStart, $queryStart - $pathStart);
          }
          if (preg_match('#/\.{1,2}(?:/|$)#', $path)) {
              $segments = explode('/', $path);
              $last = count($segments) - 1;
              $parts = array();
              foreach ($segments as $i => $segment) {
                  if ($segment === '.' || ($segment === '' && $i < $last)) {
                      // current directory, or empty segment produced by '//'
                      continue;
                  }
                  if ($segment !== '..') {
                      $parts[] = $segment;
                  } elseif (count($parts) > 0) {
                      // never climbs above the root
                      array_pop($parts);
                  }
              }
              if ($segments[$last] === '.' || $segments[$last] === '..') {
                  // a trailing dot segment names a directory: keep the final '/'
                  $parts[] = '';
              }
              $url = substr($url, 0, $pathStart) . '/' . implode('/', $parts) . $query;
          }
          return $url;
      }

      /**
       * @param string $url
       * @return boolean whether the links of the page at $url are extracted
       */
      protected function isFollowUrl($url)
      {
          return !$this->isMatchRule($this->_nofollow, $url);
      }

      /**
       * Normalizes one extracted link, filters it and adds it to the URL table.
       *
       * @param string $url the link as found in the page
       * @param string $baseUrl base used to resolve the link
       * @param string $rawUrl URL used for the external-site check, $baseUrl when null
       * @return mixed 'ADD' when stored, 'FILTER' when excluded by the rules, 'SKIP' otherwise
       */
      protected function processUrl($url, $baseUrl, $rawUrl = null)
      {
          if (substr($url, 0, 1) === '#' || !strncasecmp($url, 'javascript:', 11) || !strncasecmp($url, 'mailto:', 7)) {
              return 'SKIP';
          }
          $url = $this->resetUrl($url, $baseUrl);
          $rank = 0;
          if ($this->isDisallow($url, $rawUrl === null ? $baseUrl : $rawUrl, $rank)) {
              $this->_numFilter++;
              return 'FILTER';
          }
          if ($this->_ut->addUrl($url, $rank)) {
              $this->_numAdd++;
              return 'ADD';
          }
          return 'SKIP';
      }

      /**
       * Position of the first '/' after the "http://" or "https://" prefix.
       *
       * @param string $url
       * @return int|false false when there is no such slash
       */
      private function findPathStart($url)
      {
          $offset = strncasecmp($url, 'https://', 8) === 0 ? 8 : 7;
          // strpos() rejects an offset beyond the end of the string
          return strlen($url) > $offset ? strpos($url, '/', $offset) : false;
      }

      /**
       * Stores a rule: null clears the list, regex rules are appended with a "\xff"
       * marker, plain rules are prepended so that they are tried first.
       */
      private function saveMatchRule(&$array, $rule)
      {
          if ($rule === null) {
              $array = array();
          } elseif ($this->isRegexPattern($rule)) {
              array_push($array, "\xff" . $rule);
          } else {
              array_unshift($array, $rule);
          }
      }

      /**
       * @param array $rules rules stored by saveMatchRule()
       * @param string $input text to check
       * @param mixed &$rank receives the rank registered for the matching rule, if any
       * @return boolean true as soon as one rule matches
       */
      private function isMatchRule($rules, $input, &$rank = null)
      {
          foreach ($rules as $rule) {
              $rule = (string) $rule;
              if ($rule === '') {
                  // an empty rule can not match anything meaningful
                  continue;
              }
              if ($rule[0] !== "\xff") {
                  $matched = stristr($input, $rule) !== false;
              } else {
                  $rule = substr($rule, 1);
                  $matched = preg_match($rule, $input) > 0;
              }
              if ($matched === true) {
                  if (isset($this->_allowRank[$rule])) {
                      $rank = $this->_allowRank[$rule];
                  }
                  return true;
              }
          }
          return false;
      }

      /**
       * A rule is a regex when it starts with '#' and ends with '#', optionally followed
       * by the modifiers 'i' and/or 'u'.
       */
      private function isRegexPattern($input)
      {
          if (strlen($input) > 2 && $input[0] === '#') {
              for ($i = strlen($input) - 1; $i > 1; $i--) {
                  if ($input[$i] === $input[0]) {
                      return true;
                  }
                  if ($input[$i] !== 'i' && $input[$i] !== 'u') {
                      break;
                  }
              }
          }
          return false;
      }
  }