ParserDom.php 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. <?php
  2. namespace Qii\Library;
  3. /**
  4. * Copyright (c) 2013, 俊杰Jerry
  5. * All rights reserved.
  6. *
  7. * @description: html解析器
  8. * @author : 俊杰Jerry<bupt1987@gmail.com>
  9. * @date : 2013-6-10
  10. */
  /**
   * DOM-backed HTML parser with a small CSS-like selector engine.
   *
   * An instance wraps a single \DOMNode (a whole document or one element).
   * find() walks the wrapped node's subtree depth-first and returns new
   * ParserDom instances wrapping each matching element.
   */
  class ParserDom {
      /**
       * The wrapped node; NULL until HTML or a node has been loaded.
       *
       * @var \DOMNode|null
       */
      public $node;

      /**
       * Matches collected by search() during the current find() call.
       *
       * @var self[]
       */
      private $_lFind = [];

      /**
       * @param \DOMNode|string|null $node HTML markup or an existing node; NULL defers loading to load()
       * @throws \Exception when HTML is given and cannot be loaded
       */
      public function __construct($node = NULL) {
          if ($node !== NULL) {
              $this->load($node);
          }
      }

      /**
       * Load (or replace) the wrapped node, so one instance can be reused.
       *
       * @param \DOMNode|string|null $node HTML markup or an existing node
       * @throws \Exception when the input is empty, not string-like, or rejected by libxml
       */
      public function load($node = NULL) {
          if ($node instanceof \DOMNode) {
              $this->node = $node;
              return;
          }
          // DOMDocument::loadHTML() raises ValueError on PHP 8 for empty input;
          // reject it up front so callers always get the documented \Exception.
          $stringable = is_scalar($node) || (is_object($node) && method_exists($node, '__toString'));
          $html = $stringable ? (string) $node : '';
          if ($html === '') {
              throw new \Exception('load html error');
          }
          $dom = new \DOMDocument();
          $dom->preserveWhiteSpace = FALSE;
          $dom->strictErrorChecking = FALSE;
          // Collect libxml parse warnings internally instead of silencing with "@".
          $previous = libxml_use_internal_errors(TRUE);
          $loaded = $dom->loadHTML($html);
          if (!$previous) {
              // Only discard the buffer when the caller was not collecting errors itself.
              libxml_clear_errors();
          }
          libxml_use_internal_errors($previous);
          if (!$loaded) {
              throw new \Exception('load html error');
          }
          $this->node = $dom;
      }

      /**
       * Virtual read-only properties: outertext, innertext, plaintext, href, src.
       *
       * @codeCoverageIgnore
       * @param string $name
       * @return mixed NULL for any other property name
       */
      public function __get($name) {
          switch ($name) {
              case 'outertext':
                  return $this->outerHtml();
              case 'innertext':
                  return $this->innerHtml();
              case 'plaintext':
                  return $this->getPlainText();
              case 'href':
                  return $this->getAttr("href");
              case 'src':
                  return $this->getAttr("src");
              default:
                  return NULL;
          }
      }

      /**
       * Depth-first selector query over the wrapped node's descendants.
       *
       * Selector parts are separated by spaces or "/"; "," separates alternative
       * selectors whose results are concatenated. Each part may carry a tag,
       * "#id", ".class" or "[attr]", "[attr=val]", "[!attr]" with the operators
       * =, !=, ^=, $=, *=. The pseudo attribute "plaintext" matches text content.
       * Earlier parts are matched against the successive parent elements of a
       * candidate (one parent per part).
       *
       * @param string $selector
       * @param number $idx zero-based index of the match to return; NULL returns
       *                    all matches, a negative value counts from the end
       * @return self|self[]|bool array of matches when $idx is NULL, otherwise one
       *                          match; FALSE when nothing can be returned
       * @throws \InvalidArgumentException when a part uses tag "*" without an attribute key
       */
      public function find($selector, $idx = NULL) {
          if (empty($this->node->childNodes)) {
              return FALSE;
          }
          $selectors = $this->parse_selector($selector);
          if (count($selectors) === 0) {
              return FALSE;
          }
          // Start from a clean accumulator so an aborted earlier call cannot leak matches.
          $this->_lFind = [];
          foreach ($selectors as $chain) {
              $level = count($chain);
              if ($level === 0) {
                  $this->_lFind = [];
                  return FALSE;
              }
              if ($this->search($this->node, $idx, $chain, $level)) {
                  // The requested index has been collected; later groups cannot change it.
                  break;
              }
          }
          $found = $this->_lFind;
          $this->_lFind = [];
          if ($idx === NULL) {
              return $found;
          }
          if ($idx < 0) {
              $idx = count($found) + $idx;
          }
          return isset($found[$idx]) ? $found[$idx] : FALSE;
      }

      /**
       * Text content of the wrapped node.
       *
       * @return string
       */
      public function getPlainText() {
          return $this->text($this->node);
      }

      /**
       * Serialized HTML of the wrapped node's children.
       *
       * @return string empty string when there is nothing to serialize
       */
      public function innerHtml() {
          if (empty($this->node->childNodes)) {
              return '';
          }
          // A DOMDocument has no ownerDocument; it serializes its own children.
          $doc = ($this->node instanceof \DOMDocument) ? $this->node : $this->node->ownerDocument;
          if ($doc === NULL) {
              return '';
          }
          $innerHTML = '';
          foreach ($this->node->childNodes as $child) {
              $innerHTML .= $doc->saveHTML($child) ?: '';
          }
          return $innerHTML;
      }

      /**
       * Serialized HTML of the wrapped node including the node itself.
       *
       * @return string|bool FALSE when no node is loaded or serialization fails
       */
      public function outerHtml() {
          if (!($this->node instanceof \DOMNode)) {
              return FALSE;
          }
          if ($this->node instanceof \DOMDocument) {
              // importNode() cannot import a document node; dump the document directly.
              return $this->node->saveHTML();
          }
          $doc = new \DOMDocument();
          $doc->appendChild($doc->importNode($this->node, TRUE));
          return $doc->saveHTML($doc);
      }

      /**
       * Value of an attribute on the wrapped element.
       *
       * @param string $name
       * @return string|null NULL when the node is not an element or lacks the attribute
       */
      public function getAttr($name) {
          // Only elements carry attributes; documents and text nodes have none.
          if (!($this->node instanceof \DOMElement) || !$this->node->hasAttribute($name)) {
              return NULL;
          }
          return $this->node->getAttribute($name);
      }

      /**
       * Compare a node value against a selector value with the given operator.
       *
       * Both sides are lower-cased first, so every comparison is
       * case-insensitive. For "*=" the selector value is used as a regular
       * expression body; when it starts with "/" it is used verbatim as a
       * complete delimited pattern.
       *
       * @param string $exp     one of =, !=, ^=, $=, *=
       * @param string $pattern value taken from the selector
       * @param string $value   value taken from the node
       * @return boolean FALSE for an unknown operator
       */
      private function match($exp, $pattern, $value) {
          $pattern = strtolower($pattern);
          $value = strtolower($value);
          switch ($exp) {
              case '=' :
                  return ($value === $pattern);
              case '!=' :
                  return ($value !== $pattern);
              case '^=' :
                  return preg_match("/^" . preg_quote($pattern, '/') . "/", $value) === 1;
              case '$=' :
                  return preg_match("/" . preg_quote($pattern, '/') . "$/", $value) === 1;
              case '*=' :
                  if ($pattern !== '' && $pattern [0] === '/') {
                      return preg_match($pattern, $value) === 1;
                  }
                  // Escape bare "/" so values such as "example.com/foo" do not
                  // terminate the delimiter early.
                  $body = preg_replace('/(?<!\\\\)\//', '\\\\/', $pattern);
                  return preg_match("/" . $body . "/i", $value) === 1;
          }
          return FALSE;
      }

      /**
       * Parse a selector string into selector chains.
       *
       * @param string $selector_string
       * @return array list of chains (one per comma-separated selector); each
       *               chain is a list of [tag, key, val, exp, no_key] tuples
       */
      private function parse_selector($selector_string) {
          // Hyphens inside the character classes are escaped: PCRE2 (PHP >= 7.3)
          // rejects "[\w-:]" as an invalid range.
          $pattern = '/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';
          if (!preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER)) {
              return [];
          }
          $selectors = [];
          $result = [];
          foreach ($matches as $m) {
              $m [0] = trim($m [0]);
              if ($m [0] === '' || $m [0] === '/' || $m [0] === '//') {
                  continue;
              }
              // "tbody" parts are dropped from the chain.
              if ($m [1] === 'tbody') {
                  continue;
              }
              $tag = strtolower($m [1]);
              $key = '';
              $val = NULL;
              $exp = '=';
              $no_key = FALSE;
              // Explicit '' checks (not empty()) so that the value "0" is kept.
              if ($m [2] !== '') {
                  $key = 'id';
                  $val = $m [2];
              }
              if ($m [3] !== '') {
                  $key = 'class';
                  $val = $m [3];
              }
              if ($m [4] !== '') {
                  $key = strtolower($m [4]);
              }
              if ($m [5] !== '') {
                  $exp = $m [5];
              }
              if ($m [6] !== '') {
                  $val = $m [6];
              }
              // "[!attr]" selects elements that do NOT have the attribute.
              if (isset ($key [0]) && $key [0] === '!') {
                  $key = substr($key, 1);
                  $no_key = TRUE;
              }
              $result [] = [$tag, $key, $val, $exp, $no_key];
              if (trim($m [7]) === ',') {
                  $selectors [] = $result;
                  $result = [];
              }
          }
          if (count($result) > 0) {
              $selectors [] = $result;
          }
          return $selectors;
      }

      /**
       * Depth-first walk that appends every match of one selector chain to $_lFind.
       *
       * @param \DOMNode $search       node currently visited
       * @param number|null $idx       index requested by find(); NULL collects everything
       * @param array $selectors       one selector chain
       * @param int $level             number of parts in the chain
       * @param int $search_level      depth of $search below the start node
       * @return bool TRUE as soon as the match with index $idx has been collected
       */
      private function search($search, $idx, $selectors, $level, $search_level = 0) {
          // Nodes shallower than the chain length are not tested.
          if ($search_level >= $level) {
              $rs = $this->seek($search, $selectors, $level - 1);
              if ($rs !== FALSE) {
                  $reached = ($idx !== NULL && $idx == count($this->_lFind));
                  $this->_lFind[] = new self($rs);
                  if ($reached) {
                      return TRUE;
                  }
              }
          }
          if (!empty($search->childNodes)) {
              foreach ($search->childNodes as $child) {
                  if ($this->search($child, $idx, $selectors, $level, $search_level + 1)) {
                      return TRUE;
                  }
              }
          }
          return FALSE;
      }

      /**
       * Text content of a node.
       *
       * @param \DOMNode $node
       * @return string
       */
      private function text($node) {
          return $node->textContent;
      }

      /**
       * Test an element against part $current of the chain, then test its parent
       * against the preceding part, and so on until the chain is exhausted.
       *
       * @codeCoverageIgnore
       * @param \DOMNode $search
       * @param array $selectors one selector chain
       * @param int $current     index of the chain part to test against $search
       * @return boolean|\DOMNode $search when the whole chain matches, otherwise FALSE
       * @throws \InvalidArgumentException when a part uses tag "*" without an attribute key
       */
      private function seek($search, $selectors, $current) {
          if (!($search instanceof \DOMElement)) {
              return FALSE;
          }
          list ($tag, $key, $val, $exp, $no_key) = $selectors [$current];
          if ($tag === '*' && !$key) {
              // Library code must not terminate the host process; report the misuse instead.
              throw new \InvalidArgumentException('tag为*时,key不能为空');
          }
          if ($tag !== '' && $tag !== '*' && $tag !== $search->tagName) {
              return FALSE;
          }
          if ($key) {
              if ($no_key) {
                  if ($search->hasAttribute($key)) {
                      return FALSE;
                  }
              } elseif ($key != "plaintext" && !$search->hasAttribute($key)) {
                  return FALSE;
              }
          }
          // Explicit NULL/'' checks so that a selector value of "0" is still compared.
          if ($key && $val !== NULL && $val !== '' && $val !== '*') {
              if ($key == "plaintext") {
                  $nodeKeyValue = $this->text($search);
              } else {
                  $nodeKeyValue = $search->getAttribute($key);
              }
              $check = $this->match($exp, $val, $nodeKeyValue);
              if (!$check && strcasecmp($key, 'class') === 0) {
                  // Retry against each class token; tokens may be separated by any whitespace.
                  $tokens = preg_split('/\s+/', $search->getAttribute($key), -1, PREG_SPLIT_NO_EMPTY);
                  foreach ($tokens as $k) {
                      if ($this->match($exp, $val, $k)) {
                          $check = TRUE;
                          break;
                      }
                  }
              }
              if (!$check) {
                  return FALSE;
              }
          }
          $current--;
          if ($current < 0) {
              return $search;
          }
          return $this->seek($this->getParent($search), $selectors, $current) ? $search : FALSE;
      }

      /**
       * Parent of a node.
       *
       * @param \DOMNode $node
       * @return \DOMNode|null
       */
      private function getParent($node) {
          return $node->parentNode;
      }
  }