You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
599 lines
18 KiB
599 lines
18 KiB
<?php |
|
require_once(dirname(__FILE__) . '/wfAPI.php'); |
|
require_once(dirname(__FILE__) . '/wfArray.php'); |
|
class wordfenceURLHoover { |
|
/** When true, dbg() forwards its messages to wordfence::status(). */
private $debug = false; |
|
/** Description of the most recent failure; getBaddies() sets it before returning false. */
public $errorMsg = false; |
|
/** wfArray queue of pending rows (owner, host, path, hostKey); created in the constructor and in __wakeup(). */
private $hostsToAdd = false; |
|
/** Name of the table that writeHosts() inserts into and getBaddies() reads from. */
private $table = ''; |
|
/** API key handed to wfAPI. */
private $apiKey = false; |
|
/** WordPress version handed to wfAPI. */
private $wordpressVersion = false; |
|
/** When true rows are stored in $table; otherwise they are kept in $hostKeys / $hostList. */
private $useDB = true; |
|
/** Hash prefixes collected by writeHosts() when $useDB is false. */
private $hostKeys = array(); |
|
/** Row records (owner, host, path, hostKey) collected by writeHosts() when $useDB is false. */
private $hostList = array(); |
|
/** Owner identifier of the item currently being processed by hoover(). */
public $currentHooverID = false; |
|
/** Number of URLs accepted by captureURL() during the current hoover() call. */
private $_foundSome = false; |
|
/** Hostnames that captureURL() skips for the current hoover() call. */
private $_excludedHosts = array(); |
|
/** wfAPI client used for the check_host_keys and check_bad_urls calls. */
private $api = false; |
|
/** Database helper: the instance injected into the constructor, or a new wfDB. */
private $db = false; |
|
|
|
/**
 * Lists the hostnames of this install's own home and site URLs so callers can
 * exclude them from URL checks. On multisite every blog's URLs are included.
 * The result is computed once per request and cached in a static variable.
 *
 * @return array Unique hostnames, in the order they were first seen.
 */
public static function standardExcludedHosts() {
	static $standardExcludedHosts = null;

	if ($standardExcludedHosts === null) {
		global $wpdb;
		$seen = array();

		if (!is_multisite()) {
			foreach (array(wfUtils::wpHomeURL(), wfUtils::wpSiteURL()) as $candidate) {
				$hostname = parse_url($candidate, PHP_URL_HOST);
				if ($hostname) {
					$seen[$hostname] = 1;
				}
			}
		}
		else {
			//Can't use wp_get_sites or get_sites because they return empty at 10k sites
			$siteIDs = $wpdb->get_col("SELECT blog_id FROM {$wpdb->blogs}");
			foreach ($siteIDs as $siteID) {
				foreach (array(get_home_url($siteID), get_site_url($siteID)) as $candidate) {
					$hostname = parse_url($candidate, PHP_URL_HOST);
					if ($hostname) {
						$seen[$hostname] = 1;
					}
				}
			}
		}

		$standardExcludedHosts = array_keys($seen);
	}

	return $standardExcludedHosts;
}
|
|
|
/**
 * Serialization hook. Flushes any queued rows through writeHosts() first, then
 * restricts the serialized state to the plain configuration fields.
 *
 * @return array Names of the properties to serialize.
 */
public function __sleep() {
	$persistedFields = array('debug', 'errorMsg', 'table', 'apiKey', 'wordpressVersion');
	$this->writeHosts();

	return $persistedFields;
}
|
|
|
/**
 * Unserialization hook. Recreates the collaborators that __sleep() does not
 * persist: the pending-row queue, the API client and the database helper.
 */
public function __wakeup() {
	$queueColumns = array('owner', 'host', 'path', 'hostKey');

	$this->hostsToAdd = new wfArray($queueColumns);
	$this->api = new wfAPI($this->apiKey, $this->wordpressVersion);
	$this->db = new wfDB();
}
|
|
|
/**
 * @param mixed $apiKey           API key passed to wfAPI.
 * @param mixed $wordpressVersion WordPress version passed to wfAPI.
 * @param mixed $db               Database helper to use; a new wfDB is created when falsy.
 * @param bool  $continuation     When falsy the hoover table is truncated via cleanup().
 */
public function __construct($apiKey, $wordpressVersion, $db = false, $continuation = false) {
	global $wpdb;

	$this->hostsToAdd = new wfArray(array('owner', 'host', 'path', 'hostKey'));
	$this->apiKey = $apiKey;
	$this->wordpressVersion = $wordpressVersion;
	$this->api = new wfAPI($apiKey, $wordpressVersion);
	$this->db = $db ? $db : new wfDB();

	//Fall back to the default table name when no $wpdb global is set
	$this->table = isset($wpdb) ? wfDB::networkTable('wfHoover') : 'wp_wfHoover';

	if ($continuation) {
		return;
	}
	$this->cleanup();
}
|
|
|
/**
 * Empties the hoover table by truncating it through the database helper.
 */
public function cleanup() {
	$hooverTable = $this->table;
	$this->db->truncate($hooverTable);
}
|
|
|
/**
 * Scans a blob of text for URLs and queues each accepted one for checking.
 *
 * @param mixed  $id            Owner identifier recorded against every URL found.
 * @param string $data          Text to scan.
 * @param array  $excludedHosts Hostnames whose URLs are ignored.
 * @return int Number of URLs accepted by captureURL() during this call.
 */
public function hoover($id, $data, $excludedHosts = array()) {
	$urlPattern = '/\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))/i';

	$this->currentHooverID = $id;
	$this->_foundSome = 0;
	$this->_excludedHosts = $excludedHosts;

	//Only the callback's side effects matter; the replaced text is discarded
	@preg_replace_callback($urlPattern, array($this, 'captureURL'), $data);

	//Flush whatever is still queued
	$this->writeHosts();

	return $this->_foundSome;
}
|
|
|
/**
 * Emits a level-4 status message, but only while the debug flag is on.
 *
 * @param string $msg Message to log.
 */
private function dbg($msg) {
	if (!$this->debug) {
		return;
	}

	wordfence::status(4, 'info', $msg);
}
|
|
|
/**
 * preg_replace_callback() handler used by hoover(). Validates one matched URL
 * and queues a row per generated hash prefix.
 *
 * A URL is ignored when it is not http/https, when its host is in the excluded
 * list, or when it fails FILTER_VALIDATE_URL.
 *
 * @param array $matches Regex match; element 0 is the full URL text.
 * @return void
 */
public function captureURL($matches) {
	$id = $this->currentHooverID;
	$url = $matches[0];
	$components = parse_url($url);
	if (!is_array($components) || !isset($components['scheme']) || !preg_match('/^https?$/i', $components['scheme'])) {
		return;
	}

	//A match such as "http:abc" has a scheme but no host, so the host must be
	//resolved defensively before it is compared against the excluded list.
	$host = (isset($components['host']) ? $components['host'] : '');
	foreach ($this->_excludedHosts as $excluded) {
		if (strcasecmp($excluded, $host) === 0) {
			return;
		}
	}

	if (!filter_var($url, FILTER_VALIDATE_URL)) {
		return;
	}

	$this->_foundSome++;

	$path = (isset($components['path']) && !empty($components['path']) ? $components['path'] : '/');
	$hashes = $this->_generateHashes($url);
	foreach ($hashes as $h) {
		//Only the first 4 bytes of each hash are stored and sent for the prefix check
		$this->hostsToAdd->push(array('owner' => $id, 'host' => $host, 'path' => $path, 'hostKey' => wfUtils::substr($h, 0, 4)));
	}

	//Flush periodically so the queue does not grow without bound
	if ($this->hostsToAdd->size() > 1000) {
		$this->writeHosts();
	}
}
|
|
|
/**
 * Drains the pending-row queue. In database mode all rows go out in a single
 * multi-row INSERT; otherwise they are appended to the in-memory key and
 * record lists. Does nothing when the queue is empty.
 */
private function writeHosts() {
	$queue = $this->hostsToAdd;
	if ($queue->size() < 1) {
		return;
	}

	if (!$this->useDB) {
		while ($row = $queue->shift()) {
			foreach (str_split($row['hostKey'], 4) as $chunk) {
				$this->hostKeys[] = $chunk;
			}
			$this->hostList[] = array(
				'owner' => $row['owner'],
				'host' => $row['host'],
				'path' => $row['path'],
				'hostKey' => $row['hostKey']
			);
		}
		$queue->collectGarbage();
		return;
	}

	$statement = "INSERT INTO " . $this->table . " (owner, host, path, hostKey) VALUES ";
	while ($row = $queue->shift()) {
		//This may be an issue for hyperDB or other abstraction layers, but leaving it for now.
		$owner = $this->db->realEscape($row['owner']);
		$host = $this->db->realEscape($row['host']);
		$path = $this->db->realEscape($row['path']);
		$hostKey = $this->db->realEscape($row['hostKey']);
		$statement .= sprintf("('%s', '%s', '%s', '%s'),", $owner, $host, $path, $hostKey);
	}
	$statement = rtrim($statement, ','); //Drop the comma left after the last tuple
	$this->db->queryWrite($statement);
	$queue->collectGarbage();
}
|
/**
 * Runs the two-stage check on everything collected so far.
 *
 * Stage 1 sends every distinct 4-byte hash prefix to the `check_host_keys`
 * endpoint; the response is a list of 2-byte big-endian indices into the list
 * that was sent. Stage 2 maps each flagged prefix back to the URLs that
 * produced it and sends those to `check_bad_urls` for a final verdict.
 *
 * @return array|false Confirmed results keyed the same way as the
 *                     check_bad_urls response, each value a list of
 *                     array('URL' => ..., 'badList' => ...); an empty array
 *                     when nothing is flagged; false on failure, with
 *                     $errorMsg describing the problem.
 */
public function getBaddies() {
	wordfence::status(4, 'info', "Gathering host keys.");
	$allHostKeys = '';
	if ($this->useDB) {
		global $wpdb;
		$dbh = $wpdb->dbh;
		$useMySQLi = (is_object($dbh) && $wpdb->use_mysqli && wfConfig::get('allowMySQLi', true) && WORDFENCE_ALLOW_DIRECT_MYSQLI);
		/* We limit to 100,000 prefixes since more than that cannot be reliably checked within the default max_execution_time */
		$keyQuery = "SELECT DISTINCT hostKey FROM {$this->table} ORDER BY hostKey ASC LIMIT 100000";
		if ($useMySQLi) { //If direct-access MySQLi is available, we use it to minimize the memory footprint instead of letting it fetch everything into an array first
			wordfence::status(4, 'info', "Using MySQLi directly.");
			$result = $dbh->query($keyQuery);
			if (!is_object($result)) {
				$this->errorMsg = "Unable to query database";
				$this->dbg($this->errorMsg);
				return false;
			}
			while ($row = $result->fetch_assoc()) {
				$allHostKeys .= $row['hostKey'];
			}
			$result->free(); //Release the result set now rather than holding it for the rest of the scan
		}
		else {
			$q1 = $this->db->querySelect($keyQuery);
			foreach ($q1 as $hRec) {
				$allHostKeys .= $hRec['hostKey'];
			}
		}
	}
	else {
		$allHostKeys = implode('', array_values(array_unique($this->hostKeys)));
	}

	/**
	 * Check hash prefixes first. Each one is a 4-byte binary prefix of a SHA-256 hash of the URL. The response will
	 * be a binary list of 2-byte indices into the prefixes sent; the full URL for each index should be sent in the
	 * secondary query to find the true good/bad status.
	 */
	$allCount = wfUtils::strlen($allHostKeys) / 4;
	if (!($allCount > 0)) {
		return array();
	}

	if ($this->debug) {
		$this->dbg("Checking {$allCount} hostkeys");
		for ($i = 0; $i < $allCount; $i++) {
			$key = wfUtils::substr($allHostKeys, $i * 4, 4);
			$this->dbg("Checking hostkey: " . bin2hex($key));
		}
	}

	wordfence::status(2, 'info', "Checking {$allCount} host keys against Wordfence scanning servers.");
	$resp = $this->api->binCall('check_host_keys', $allHostKeys);
	wordfence::status(2, 'info', "Done host key check.");
	$this->dbg("Done host key check");

	if (!($resp['code'] >= 200 && $resp['code'] <= 299)) {
		$this->errorMsg = "Wordfence server responded with an error. HTTP code " . $resp['code'] . " and data: " . $resp['data'];
		return false;
	}

	$badHostKeys = '';
	$this->dbg("Host key response: " . bin2hex($resp['data']));
	$dataLen = wfUtils::strlen($resp['data']);
	if ($dataLen > 0 && $dataLen % 2 == 0) {
		$this->dbg("Checking response indexes");
		for ($i = 0; $i < $dataLen; $i += 2) {
			$idx = wfUtils::array_first(unpack('n', wfUtils::substr($resp['data'], $i, 2)));
			$this->dbg("Checking index {$idx}");
			if ($idx < $allCount) {
				$prefix = wfUtils::substr($allHostKeys, $idx * 4, 4);
				$badHostKeys .= $prefix;
				$this->dbg("Got bad hostkey for record: " . bin2hex($prefix));
			}
			else {
				$this->dbg("Bad allHostKeys index: {$idx}");
				$this->errorMsg = "Bad allHostKeys index: {$idx}";
				return false;
			}
		}
	}
	else if ($dataLen > 0) {
		$this->errorMsg = "Invalid data length received from Wordfence server: " . $dataLen;
		$this->dbg($this->errorMsg);
		return false;
	}

	$badCount = wfUtils::strlen($badHostKeys) / 4;
	if (!($badCount > 0)) {
		return array();
	}

	$urlsToCheck = array(); //owner => list of URLs, sent to the server as-is
	$seenURLs = array(); //owner => URL => true, a constant-time duplicate check for $urlsToCheck
	$totalURLs = 0;

	//Reconcile flagged prefixes with their corresponding URLs
	for ($i = 0; $i < $badCount; $i++) {
		$prefix = wfUtils::substr($badHostKeys, $i * 4, 4);

		if ($this->useDB) {
			/**
			 * Putting a 10000 limit in here for sites that have a huge number of items with the same URL
			 * that repeats. This is an edge case. But if the URLs are malicious then presumably the admin
			 * will fix the malicious URLs and on subsequent scans the items (owners) that are above the
			 * 10000 limit will appear.
			 */
			$records = $this->db->querySelect("SELECT DISTINCT owner, host, path FROM {$this->table} WHERE hostKey = %s LIMIT 10000", $prefix);
		}
		else {
			$records = array();
			foreach ($this->hostList as $rec) {
				//Only matches aligned to a 4-byte boundary are real prefix hits
				$pos = wfUtils::strpos($rec['hostKey'], $prefix);
				if ($pos !== false && $pos % 4 == 0) {
					$records[] = $rec;
				}
			}
		}

		foreach ($records as $rec) {
			$owner = $rec['owner'];
			$url = 'http://' . $rec['host'] . $rec['path'];
			if (!isset($urlsToCheck[$owner])) {
				$urlsToCheck[$owner] = array();
			}
			if (!isset($seenURLs[$owner][$url])) {
				$seenURLs[$owner][$url] = true;
				$urlsToCheck[$owner][] = $url;
				$totalURLs++;
			}
		}

		if ($totalURLs > 10000) { break; }
	}
	unset($seenURLs);

	if (count($urlsToCheck) < 1) {
		return array();
	}

	wordfence::status(2, 'info', "Checking " . $totalURLs . " URLs from " . sizeof($urlsToCheck) . " sources.");
	$badURLs = $this->api->call('check_bad_urls', array(), array('toCheck' => json_encode($urlsToCheck)));
	wordfence::status(2, 'info', "Done URL check.");
	$this->dbg("Done URL check");

	if (!is_array($badURLs) || count($badURLs) < 1) {
		return array();
	}

	$finalResults = array();
	foreach ($badURLs as $file => $badSiteList) {
		$finalResults[$file] = array();
		foreach ($badSiteList as $badSite) {
			$finalResults[$file][] = array(
				'URL' => $badSite[0],
				'badList' => $badSite[1]
			);
		}
	}
	$this->dbg("Confirmed " . count($badURLs) . " bad URLs");
	return $finalResults;
}
|
|
|
/**
 * Builds the SHA-256 hashes for every host-suffix/path-prefix expression of a URL.
 *
 * The GSB specification requires generating and sending hash prefixes for a number of additional similar URLs.
 * See: https://developers.google.com/safe-browsing/v4/urls-hashing#suffixprefix-expressions
 *
 * @param string $url URL to expand.
 * @return array Raw binary SHA-256 digests keyed by the expression that was hashed
 *               (host alone, then host . path, most specific path first). Empty when
 *               the URL cannot be canonicalized.
 */
protected function _generateHashes($url) {
	$canonicalURL = $this->_canonicalizeURL($url);
	if (!is_string($canonicalURL)) { //Canonicalization failed; there is nothing meaningful to hash
		return array();
	}

	//Drop the scheme, it is not part of any expression
	if (preg_match('~^([a-z]+[a-z0-9+\.\-]*)://(.*)$~i', $canonicalURL, $matches)) {
		$canonicalURL = $matches[2];
	}

	//Separate URL and query string
	$query = '';
	if (preg_match('/^([^?]+)(\??.*)/', $canonicalURL, $matches)) {
		$canonicalURL = $matches[1];
		$query = $matches[2]; //Still carries its leading '?'
	}

	//Separate host and path
	$path = '';
	preg_match('~^(.*?)(?:(/.*)|$)~', $canonicalURL, $matches);
	$host = $matches[1];
	if (isset($matches[2])) {
		$path = $matches[2];
	}

	//Clean host
	$host = $this->_normalizeHost($host);
	if (!is_string($host) || $host === '') { //No usable host means no valid expressions
		return array();
	}

	//Generate hosts list
	$hosts = array();
	if (filter_var(trim($host, '[]'), FILTER_VALIDATE_IP)) {
		$hosts[] = $host;
	}
	else {
		$hostComponents = explode('.', $host);
		$componentCount = count($hostComponents);

		$firstSuffix = $componentCount - 7;
		if ($firstSuffix < 1) {
			$firstSuffix = 1;
		}

		$hosts[] = $host;
		for ($i = $firstSuffix; $i < $componentCount - 1; $i++) {
			$hosts[] = implode('.', array_slice($hostComponents, $i));
		}
	}

	//Generate paths list
	$paths = array('/');
	//Filter on length so that a literal "0" component is kept
	$pathComponents = array_values(array_filter(explode('/', $path), 'strlen'));

	$numComponents = min(count($pathComponents), 4);
	for ($i = 1; $i < $numComponents; $i++) {
		$paths[] = '/' . implode('/', array_slice($pathComponents, 0, $i)) . '/';
	}
	if ($path != '/') {
		$paths[] = $path;
	}
	if (strlen($query) > 0) {
		$paths[] = $path . $query; //$query already starts with '?'
	}
	$paths = array_reverse($paths); //So we start at the most specific and move to most generic

	//Generate hashes
	$hashes = array();
	foreach ($hosts as $h) {
		$hashes[$h] = hash('sha256', $h, true); //WFSB compatibility -- it uses hashes without the path
		foreach ($paths as $p) {
			$key = $h . $p;
			$hashes[$key] = hash('sha256', $key, true);
		}
	}

	return $hashes;
}
|
|
|
/**
 * Canonicalizes a URL before hashing.
 *
 * Based on https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization and Google's reference
 * implementation https://github.com/google/safebrowsing/blob/master/urls.go
 *
 * @param string $url URL to canonicalize.
 * @return string|false scheme://host/path[?query], or false when the escapes or
 *                      the host cannot be normalized.
 */
protected function _canonicalizeURL($url) {
	//Strip fragment
	$url = $this->_array_first(explode('#', $url));

	//Trim space
	$url = trim($url);

	//Remove tabs, CR, LF
	$url = preg_replace('/[\t\n\r]/', '', $url);

	//Normalize escapes
	$url = $this->_normalizeEscape($url);
	if ($url === false) { return false; }

	//Extract the scheme
	$scheme = 'http';
	if (preg_match('~^([a-z]+[a-z0-9+\.\-]*)://(.*)$~i', $url, $matches)) {
		$scheme = strtolower($matches[1]);
		$url = $matches[2];
	}

	//Separate URL and query string
	$query = '';
	if (preg_match('/^([^?]+)(\??.*)/', $url, $matches)) {
		$url = $matches[1];
		$query = $matches[2]; //Keeps its leading '?'
	}
	$endsWithSlash = substr($url, -1) == '/';

	//Separate host and path
	$path = '';
	preg_match('~^(.*?)(?:(/.*)|$)~', $url, $matches);
	$host = $matches[1];
	if (isset($matches[2])) {
		$path = $matches[2];
	}

	//Clean host
	$host = $this->_normalizeHost($host);
	if ($host === false) { return false; }

	//Clean path
	$path = preg_replace('~//+~', '/', $path); //Multiple slashes -> single slash
	do { //. path components removed; repeated because each match consumes the slash the next "./" would need
		$path = preg_replace('~(?:^|/)\.(?:$|/)~', '/', $path, -1, $dotSegments);
	} while ($dotSegments > 0);
	while (preg_match('~/(?!\.\./)[^/]+/\.\.(?:$|/)~', $path)) { //Resolve ..
		$path = preg_replace('~/(?!\.\./)[^/]+/\.\.(?:$|/)~', '/', $path, 1);
	}
	$path = preg_replace('~(?:^|/)\.\.(?:$|/)~', '/', $path); //Eliminate .. at the beginning
	$path = trim($path, '.');
	$path = preg_replace('/\.\.+/', '.', $path);

	if ($path == '.' || $path == '') {
		$path = '/';
	}
	else if ($endsWithSlash && substr($path, -1) != '/') { //Restore a trailing slash lost during cleanup
		$path .= '/';
	}

	return $scheme . '://' . $host . $path . $query;
}
|
|
|
/**
 * Repeatedly percent-decodes a URL until no escape sequences remain, then
 * re-escapes control characters, space, bytes 0x7f-0xff, '#' and '%'.
 *
 * @param string $url URL text to normalize.
 * @return string|false Normalized text, or false when more than 1024 decoding
 *                      passes would be required.
 */
protected function _normalizeEscape($url) {
	$escapeSequence = '/%([0-9a-f]{2})/i';
	$passLimit = 1024;

	for ($pass = 1; preg_match($escapeSequence, $url); $pass++) {
		$url = preg_replace_callback($escapeSequence, array($this, '_hex2binCallback'), $url);
		if ($pass > $passLimit) {
			return false;
		}
	}

	return preg_replace_callback('/[\x00-\x20\x7f-\xff#%]/', array($this, '_bin2hexCallback'), $url);
}
|
|
|
/**
 * Regex callback: converts the captured two-digit hex pair via wfUtils::hex2bin().
 *
 * @param array $matches Regex match; element 1 is the hex pair.
 * @return string
 */
protected function _hex2binCallback($matches) {
	$hexPair = $matches[1];

	return wfUtils::hex2bin($hexPair);
}
|
|
|
/**
 * Regex callback: percent-encodes the matched text as '%' followed by its hex form.
 *
 * @param array $matches Regex match; element 0 is the text to encode.
 * @return string
 */
protected function _bin2hexCallback($matches) {
	$raw = $matches[0];
	$hex = bin2hex($raw);

	return '%' . $hex;
}
|
|
|
/**
 * Normalizes the host portion of a URL: strips credentials and port, converts
 * non-ASCII names to their IDNA form when the intl extension is available,
 * collapses stray dots, canonicalizes IP-address notations and lowercases.
 *
 * @param string $host Host text, possibly with "user:pass@" and ":port".
 * @return string|false Normalized host, or false when the host is malformed
 *                      (unterminated IPv6 literal, or a name that cannot be
 *                      converted to IDNA).
 */
protected function _normalizeHost($host) {
	//Strip username:password
	$host = $this->_array_last(explode('@', $host));

	//IPv6 literal
	if (substr($host, 0, 1) == '[' && strpos($host, ']') === false) { //No closing bracket
		return false;
	}

	//Strip port
	$host = preg_replace('/:\d+$/', '', $host);

	//Unicode to IDNA
	$u = rawurldecode($host);
	if (preg_match('/[\x81-\xff]/', $u)) { //0x80 is technically Unicode, but the GSB canonicalization doesn't consider it one
		if (function_exists('idn_to_ascii')) { //Some PHP versions don't have this and we don't have a polyfill
			$ascii = idn_to_ascii($u);
			if ($ascii === false) { //Conversion failed; report it rather than continuing with an empty host
				return false;
			}
			$host = $ascii;
		}
	}

	//Remove extra dots
	$host = trim($host, '.');
	$host = preg_replace('/\.\.+/', '.', $host);

	//Canonicalize IP addresses
	if ($iphost = $this->_parseIP($host)) {
		return $iphost;
	}

	return strtolower($host);
}
|
|
|
/**
 * Tries to read a host as an IPv4 address written with one to four numeric
 * components and renders it as dotted decimal.
 *
 * @param string $host Host text.
 * @return string|false Dotted-decimal address; '' when a component cannot be
 *                      rendered; false when the host does not look like an IP.
 */
protected function _parseIP($host) {
	// The Windows resolver allows a 4-part dotted decimal IP address to have a
	// space followed by any old rubbish, so long as the total length of the
	// string doesn't get above 15 characters. So, "10.192.95.89 xy" is
	// resolved to 10.192.95.89. If the string length is greater than 15
	// characters, e.g. "10.192.95.89 xy.wildcard.example.com", it will be
	// resolved through DNS.
	if (strlen($host) <= 15) {
		$host = $this->_array_first(explode(' ', $host));
	}

	$looksNumeric = preg_match('/^((?:0x[0-9a-f]+|[0-9\.])+)$/i', $host);
	if (!$looksNumeric) {
		return false;
	}

	$components = explode('.', $host);
	$total = count($components);
	if ($total > 4) {
		return false;
	}

	$rendered = array();
	$finalIndex = $total - 1;
	foreach ($components as $index => $component) {
		//The last component fills every remaining byte of the address
		$byteWidth = ($index == $finalIndex) ? 5 - $total : 1;
		$piece = $this->_canonicalNum($component, $byteWidth);
		if ($piece == '') {
			return '';
		}
		$rendered[] = $piece;
	}

	return implode('.', $rendered);
}
|
|
|
/**
 * Renders one numeric IP component as $n dot-separated decimal bytes.
 *
 * Accepts decimal, hexadecimal ("0x..") and octal (leading "0") notation.
 *
 * @param string $part Component text.
 * @param int    $n    Number of bytes to produce (1 to 4).
 * @return string Dotted decimal bytes, or '' when $n is out of range or the
 *                component has a leading zero but is not valid octal.
 */
protected function _canonicalNum($part, $n) {
	if ($n <= 0 || $n > 4) {
		return '';
	}

	if (preg_match('/^0x([0-9a-f]+)$/i', $part, $matches)) { //hex, including the a-f digits
		$value = hexdec($matches[1]);
	}
	else if (preg_match('/^0([0-7]+)$/', $part, $matches)) { //octal
		$value = octdec($matches[1]);
	}
	else if (preg_match('/^0\d+$/', $part)) { //Leading zero with an 8 or 9: not a parsable number
		return '';
	}
	else {
		$value = (int) $part;
	}

	//Peel off the low byte each round, filling the result from right to left
	$bytes = array_fill(0, $n, '');
	for ($i = $n - 1; $i >= 0; $i--) {
		$bytes[$i] = (string) ($value & 0xff);
		$value = $value >> 8;
	}
	return implode('.', $bytes);
}
|
|
|
/**
 * Returns the element at index 0 of a list, or null when the list is empty.
 *
 * @param array $array Zero-indexed list.
 * @return mixed|null
 */
protected function _array_first($array) {
	return empty($array) ? null : $array[0];
}
|
|
|
/**
 * Returns the element at the highest index of a zero-indexed list, or null
 * when the list is empty.
 *
 * @param array $array Zero-indexed list.
 * @return mixed|null
 */
protected function _array_last($array) {
	if (!empty($array)) {
		$finalIndex = count($array) - 1;
		return $array[$finalIndex];
	}

	return null;
}
|
}
|
|
|