You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
355 lines
10 KiB
355 lines
10 KiB
<?php |
|
namespace HtmlParser; |
|
/**
 * Copyright (c) 2013, 俊杰Jerry
 * All rights reserved.
 *
 * @description: HTML parser
 * @author     : 俊杰Jerry<bupt1987@gmail.com>
 * @date       : 2013-6-10
 */
|
class ParserDom {

    /**
     * DOM node wrapped by this parser; stays NULL until a node or HTML is loaded.
     *
     * @var \DOMNode|null
     */
    public $node;

    /**
     * Matches accumulated by the find() call that is currently running.
     *
     * @var self[]
     */
    private $_lFind = [];

    /**
     * Create a parser, optionally loading a DOM node or an HTML string right away.
     *
     * @param \DOMNode|string|null $node existing node, HTML source, or NULL to load later
     * @throws \Exception when an HTML string is given and cannot be loaded
     */
    public function __construct($node = NULL) {
        if ($node !== NULL) {
            $this->load($node);
        }
    }

    /**
     * Load (or reload) the parser. The constructor may be called without HTML
     * and this method used afterwards, as many times as needed.
     *
     * @param \DOMNode|string|null $node existing node or HTML source
     * @throws \Exception when the source is empty, not convertible to a string,
     *                    or rejected by DOMDocument::loadHTML()
     */
    public function load($node = NULL) {
        if ($node instanceof \DOMNode) {
            $this->node = $node;
            return;
        }

        $isStringLike = is_scalar($node) || (is_object($node) && method_exists($node, '__toString'));
        $html = $isStringLike ? (string) $node : '';
        // An empty source makes loadHTML() raise a ValueError on PHP 8; report it
        // through the documented exception type instead.
        if ($html === '') {
            throw new \Exception('load html error');
        }

        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = FALSE;
        $dom->strictErrorChecking = FALSE;

        // Collect libxml parse warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(TRUE);
        $loaded = $dom->loadHTML($html);
        if (!$previous) {
            // Only discard the buffer when the caller was not collecting errors itself.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            throw new \Exception('load html error');
        }
        $this->node = $dom;
    }

    /**
     * Magic read access for the virtual properties outertext, innertext,
     * plaintext, href and src. Any other name yields NULL.
     *
     * @codeCoverageIgnore
     * @param string $name
     * @return mixed
     */
    public function __get($name) {
        switch ($name) {
            case 'outertext':
                return $this->outerHtml();
            case 'innertext':
                return $this->innerHtml();
            case 'plaintext':
                return $this->getPlainText();
            case 'href':
            case 'src':
                return $this->getAttr($name);
            default:
                return NULL;
        }
    }

    /**
     * Depth-first search for the elements matching a selector.
     *
     * Comma-separated selector groups are searched one after another and their
     * matches concatenated. Within a group, each step separated by a space or
     * "/" must match the immediate parent of the following step.
     *
     * @param string $selector
     * @param int|null $idx which match to return, counted from 0; NULL returns
     *                      every match, a negative value counts from the end
     * @return self|self[]|bool a single parser when $idx is given, an array of
     *                          parsers when it is NULL, FALSE when nothing is
     *                          loaded, the selector is empty, or $idx is out of range
     * @throws \InvalidArgumentException when a step is a bare "*" without an attribute
     */
    public function find($selector, $idx = NULL) {
        if (empty($this->node->childNodes)) {
            return FALSE;
        }

        $selectors = $this->parse_selector($selector);
        if (count($selectors) === 0) {
            return FALSE;
        }

        // Start from a clean slate in case an earlier search was aborted by an exception.
        $this->_lFind = [];
        foreach ($selectors as $steps) {
            $level = count($steps);
            if ($level === 0) {
                $this->_lFind = [];
                return FALSE;
            }
            $this->search($this->node, $idx, $steps, $level);
        }

        $found = $this->_lFind;
        $this->_lFind = [];

        if ($idx === NULL) {
            return $found;
        }
        if ($idx < 0) {
            $idx = count($found) + $idx;
        }
        return isset($found[$idx]) ? $found[$idx] : FALSE;
    }

    /**
     * Return the text content of the wrapped node.
     *
     * @return string empty string when nothing has been loaded
     */
    public function getPlainText() {
        if ($this->node === NULL) {
            return '';
        }
        return $this->text($this->node);
    }

    /**
     * Return the serialized HTML of the wrapped node's children.
     *
     * @return string empty string when nothing has been loaded
     */
    public function innerHtml() {
        if ($this->node === NULL) {
            return '';
        }

        // A DOMDocument has no ownerDocument, so it has to serialize its own children.
        $document = ($this->node instanceof \DOMDocument) ? $this->node : $this->node->ownerDocument;

        $innerHTML = '';
        foreach ($this->node->childNodes as $child) {
            $innerHTML .= $document->saveHTML($child) ?: '';
        }
        return $innerHTML;
    }

    /**
     * Return the serialized HTML of the wrapped node itself.
     *
     * @return string|bool the markup (empty string when nothing has been
     *                     loaded), or FALSE when serialization fails
     */
    public function outerHtml() {
        if ($this->node === NULL) {
            return '';
        }
        // A document node cannot be imported into another document; dump it directly.
        if ($this->node instanceof \DOMDocument) {
            return $this->node->saveHTML();
        }

        $doc = new \DOMDocument();
        $doc->appendChild($doc->importNode($this->node, TRUE));
        return $doc->saveHTML($doc);
    }

    /**
     * Return the value of an attribute of the wrapped node.
     *
     * @param string $name
     * @return string|null NULL when the node is not an element or lacks the attribute
     */
    public function getAttr($name) {
        // Only element nodes carry an attribute map; for any other node it is NULL.
        if (!($this->node instanceof \DOMElement)) {
            return NULL;
        }
        $oAttr = $this->node->attributes->getNamedItem($name);
        return ($oAttr === NULL) ? NULL : $oAttr->nodeValue;
    }

    /**
     * Compare a node value with a selector value using a selector operator.
     * Both sides are lower-cased first, so the comparison ignores case.
     *
     * @param string $exp     one of "=", "!=", "^=", "$=", "*="
     * @param string $pattern value taken from the selector
     * @param string $value   value taken from the node
     * @return boolean
     */
    private function match($exp, $pattern, $value) {
        $pattern = strtolower($pattern);
        $value = strtolower($value);

        switch ($exp) {
            case '=' :
                return ($value === $pattern);
            case '!=' :
                return ($value !== $pattern);
            case '^=' :
                return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
            case '$=' :
                return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
            case '*=' :
                // A value that starts with "/" and contains a second "/" is taken
                // as a complete, caller-supplied regular expression.
                if (isset($pattern[0]) && $pattern[0] === '/' && strrpos($pattern, '/') > 0) {
                    return preg_match($pattern, $value) === 1;
                }
                // Otherwise it is a regex fragment: escape the delimiter (without
                // double-escaping "\/") so values such as "foo/bar" stay valid.
                $fragment = str_replace('/', '\/', str_replace('\/', '/', $pattern));
                return preg_match('/' . $fragment . '/i', $value) === 1;
        }
        return FALSE;
    }

    /**
     * Parse a selector string into groups of steps.
     *
     * Every step is the tuple [tag, key, val, exp, no_key]: the tag name (or ""),
     * the attribute name (or ""), the attribute value (or NULL), the comparison
     * operator, and whether the attribute must be absent ("[!attr]"). A comma
     * starts a new group; "tbody" steps are skipped.
     *
     * @param string $selector_string
     * @return array list of groups, each a list of step tuples
     */
    private function parse_selector($selector_string) {
        // Hyphens sit at the end of each character class: PCRE2 (PHP 7.3+) rejects
        // "[\w-:]" as an invalid range, which made every selector fail to parse.
        $pattern = '/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';
        preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER);

        $selectors = [];
        $result = [];
        foreach ($matches as $m) {
            $whole = trim($m[0]);
            if ($whole === '' || $whole === '/' || $whole === '//') {
                continue;
            }
            if ($m[1] === 'tbody') {
                continue;
            }

            $tag = strtolower($m[1]);
            $key = '';
            $val = NULL;
            $exp = '=';
            $no_key = FALSE;

            // Values are compared against '' rather than empty() so "0" survives.
            if ($m[2] !== '') {
                $key = 'id';
                $val = $m[2];
            }
            if ($m[3] !== '') {
                $key = 'class';
                $val = $m[3];
            }
            if (!empty($m[4])) {
                $key = strtolower($m[4]);
            }
            if (!empty($m[5])) {
                $exp = $m[5];
            }
            if ($m[6] !== '') {
                $val = $m[6];
            }

            // "[!attr]" selects elements that do NOT have the attribute.
            if (isset($key[0]) && $key[0] === '!') {
                $key = substr($key, 1);
                $no_key = TRUE;
            }

            $result[] = [$tag, $key, $val, $exp, $no_key];

            if (trim($m[7]) === ',') {
                $selectors[] = $result;
                $result = [];
            }
        }

        if (count($result) > 0) {
            $selectors[] = $result;
        }
        return $selectors;
    }

    /**
     * Walk the tree depth-first and append every matching node to $this->_lFind.
     *
     * @param \DOMNode $search node currently visited
     * @param int|null $idx requested match index; the walk stops as soon as that
     *                      many matches have been collected
     * @param array $selectors steps of one selector group
     * @param int $level number of steps in $selectors
     * @param int $search_level depth of $search below the starting node
     * @return bool TRUE when the requested index was reached and the walk can stop
     */
    private function search($search, $idx, $selectors, $level, $search_level = 0) {
        // A node shallower than the selector chain cannot have enough parents to match.
        if ($search_level >= $level) {
            $rs = $this->seek($search, $selectors, $level - 1);
            if ($rs !== FALSE) {
                $reached = ($idx !== NULL && $idx == count($this->_lFind));
                $this->_lFind[] = new self($rs);
                if ($reached) {
                    return TRUE;
                }
            }
        }

        if (!empty($search->childNodes)) {
            foreach ($search->childNodes as $child) {
                if ($this->search($child, $idx, $selectors, $level, $search_level + 1)) {
                    return TRUE;
                }
            }
        }
        return FALSE;
    }

    /**
     * Return the text content of a node.
     *
     * @param \DOMNode $node
     * @return string
     */
    private function text($node) {
        return $node->textContent;
    }

    /**
     * Test a node against step $current of the selector chain, then its parent
     * against the previous step, and so on back to step 0.
     *
     * Original author's note: because matching runs from the last step
     * backwards, the cost is n + m*l, where n is the number of nodes, m the
     * number of nodes matching the last step and l the depth of the selector.
     *
     * @codeCoverageIgnore
     * @param \DOMNode $search
     * @param array $selectors
     * @param int $current index of the step to test against $search
     * @return boolean|\DOMNode $search when the whole chain matches, FALSE otherwise
     * @throws \InvalidArgumentException when the step is "*" without an attribute
     */
    private function seek($search, $selectors, $current) {
        if (!($search instanceof \DOMElement)) {
            return FALSE;
        }

        list ($tag, $key, $val, $exp, $no_key) = $selectors[$current];

        if ($tag === '*' && !$key) {
            // Raised as an exception so a bad selector cannot terminate the host process.
            throw new \InvalidArgumentException('tag为*时,key不能为空');
        }

        if ($tag && $tag !== '*' && $tag != $search->tagName) {
            return FALSE;
        }

        if ($key) {
            $hasAttribute = $search->hasAttribute($key);
            if ($no_key) {
                if ($hasAttribute) {
                    return FALSE;
                }
            } elseif ($key != "plaintext" && !$hasAttribute) {
                // "plaintext" is a pseudo attribute that stands for the node's text.
                return FALSE;
            }
        }

        if ($key && $val !== NULL && $val !== '' && $val !== '*') {
            if ($key == "plaintext") {
                $nodeKeyValue = $this->text($search);
            } else {
                $nodeKeyValue = $search->getAttribute($key);
            }

            $check = $this->match($exp, $val, $nodeKeyValue);
            // For class selectors, fall back to testing each class name on its own.
            if (!$check && strcasecmp($key, 'class') === 0) {
                foreach (preg_split('/\s+/', $nodeKeyValue, -1, PREG_SPLIT_NO_EMPTY) as $k) {
                    if ($this->match($exp, $val, $k)) {
                        $check = TRUE;
                        break;
                    }
                }
            }
            if (!$check) {
                return FALSE;
            }
        }

        if ($current <= 0) {
            return $search;
        }
        return $this->seek($this->getParent($search), $selectors, $current - 1) ? $search : FALSE;
    }

    /**
     * Return the parent of a node.
     *
     * @param \DOMNode $node
     * @return \DOMNode|null
     */
    private function getParent($node) {
        return $node->parentNode;
    }

}