skel.inc.php 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. <?php
  2. /**
  3. * pspider - custom template file
  4. *
  5. * @author hightman <hightman@twomice.net>
  6. * @link http://www.hightman.cn/
  7. * @copyright Copyright &copy; 2008-2013 Twomice Studio
  8. */
  9. use hightman\http\Response;
  10. use hightman\http\Request;
  /// --- custom: number of URLs fetched concurrently
  define('PSP_NUM_PARALLEL', 5);

  /// --- custom: interval that must elapse before the same URL is crawled again
  /// (presumably in seconds, i.e. one hour -- confirm against the crawler)
  define('PSP_CRAWL_PERIOD', 3600);

  /// --- Uncomment the next line to load StringHelper when it is needed
  /// require_once __DIR__ . '/../lib/StringHelper.php';
  /**
   * URL table stored in MySQL, configured with the custom settings below.
   *
   * The configured database must contain a `_urls` table with this structure:

  CREATE TABLE `_urls` (
    `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
    `url` text,
    `rank` smallint NOT NULL default '0' COMMENT 'process prior level',
    `status` smallint NOT NULL default '0' COMMENT 'last http response status',
    `select_time` int unsigned NOT NULL default '0' COMMENT 'last process time',
    `update_time` int unsigned NOT NULL default '0' COMMENT 'last update time',
    PRIMARY KEY (`id`)
  ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='url table for pspider';

   */
  class UrlTableCustom extends UrlTableMySQL
  {
      /**
       * Hands the custom host, user, password and database name to the
       * parent constructor, then calls test().
       *
       * NOTE(review): test() is inherited; presumably it verifies the
       * connection and/or the `_urls` table -- confirm in UrlTableMySQL.
       */
      public function __construct()
      {
          /// --- custom setting BEGIN
          $dbHost = 'localhost';
          $dbUser = 'root';
          $dbPass = '';
          $dbName = 'test';
          /// --- custom setting END

          parent::__construct($dbHost, $dbUser, $dbPass, $dbName);
          $this->test();
      }
  }
  /**
   * Custom parser.
   */
  class UrlParserCustom extends UrlParser
  {
      /**
       * Add the code that processes fetched content inside this method.
       *
       * The parent parser always runs first; the custom code only runs
       * when the response status is exactly 200.
       *
       * @param Response $res response whose status is checked
       * @param Request $req request whose URL is reported
       * @param mixed $key forwarded unchanged to the parent parser
       */
      public function parse(Response $res, Request $req, $key)
      {
          parent::parse($res, $req, $key);

          // Nothing custom to do unless the fetch succeeded.
          if ($res->status !== 200) {
              return;
          }

          /// --- custom code BEGIN ---
          $url = $req->getUrl();
          echo "PROCESSING: " . $url . "\n";
          /// --- custom code END ---
      }

      /**
       * Add filter rules for new URLs inside this method, mainly by calling:
       * followExternal()
       * allowDomain(), disallowDomain()
       * allow(), disallow(), disallowExt()
       *
       * Note: allow() accepts a third argument that specifies whether pages
       * matching the rule are followed for link analysis.
       */
      public function defaultFilter()
      {
          // Start from the parent's default rules, then add the custom ones.
          parent::defaultFilter();

          /// --- custom filter BEGIN ---
          $this->followExternal(false);
          $this->disallow('.php?q=');
          /// --- custom filter END ---
      }

      /**
       * Decide inside this method whether the links found in the content
       * of the given URL are analysed. Currently defers to the parent.
       *
       * @param string $url
       * @return boolean
       */
      protected function isFollowUrl($url)
      {
          $follow = parent::isFollowUrl($url);

          return $follow;
      }
  }