这是PHP网络蜘蛛提取Email地址脚本系列的最后一章,我将展示如何把收集到的资料储存在MySQL数据库中。您可以把从多个网站收集到的资料储存在同一个数据库里。
首先在您的电脑运行XAMPP,并确保已启动Apache及MySQL。在浏览器输入URL "http://localhost/phpmyadmin/"。在顶部菜单栏选择"Database",输入新数据库的名称(在本教程里我们的数据库名为"email_collection"),然后按"Create"按钮,即可看到如下图所示的界面。
您可在此下载PHP cURL邮箱地址提取脚本。
Note: Check out the sample code at bottom of this article.
进入"email_collection"数据库,可看到空库。选择"SQL", 把以下查询置入空白地方,按"Go"。
-- Table that stores one row per scraped contact.
CREATE TABLE IF NOT EXISTS `contact_info` (
  -- Row identifier, assigned automatically on insert.
  `id` int(12) NOT NULL AUTO_INCREMENT,
  `name` varchar(128) NOT NULL,
  `email` varchar(128) NOT NULL,
  `phone` varchar(128) NOT NULL,
  PRIMARY KEY (`id`),
  -- Each e-mail address may appear only once in the table.
  UNIQUE KEY `email` (`email`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
点击进入“contact_info”,内有4个数据列。在这里,每当一个新的条目(相关的“name”、“email”和“phone”)添加到数据库表中,“id”列会自动递增。“email”数据列被定义为“UNIQUE KEY”,因此相同的电子邮件不会在数据库中重复存储。例如,当您在其他网站上运行脚本时,可能会在不同网站发现相同的电子邮件。在这种情况下,我们只保留第一次储存的资料,而忽略之后的重复项。
<?php
/*
 * Website-specific matching patterns (PCRE, "~" delimited).
 * TARGET_BLOCK isolates one contact block; NAME, EMAIL and PHONE are applied
 * inside such a block; LASTPAGE captures the page number of the "last" pager
 * link. PARSE_CONTENT is the flag passed to HttpCurl::get() to run the parser.
 */
define('TARGET_BLOCK','~<div class="negotiators-wrapper">(.*?)</div>(\r\n)</div>~s');
define('NAME', '~<div class="negotiators-name"><a href="/negotiator/(.*?)">(.*?)</a></div>~');
define('EMAIL', '~<div class="negotiators-email">(.*?)</div>~');
define('PHONE', '~<div class="negotiators-phone">(.*?)</div>~');
define('LASTPAGE', '~<li class="pager-last last"><a href="/negotiators\?page=(.*?)"~');
define('PARSE_CONTENT', TRUE);

/**
 * Storage target that accepts one scraped record at a time.
 */
interface MySQLTable
{
    /**
     * @param array $info Record with the keys 'name', 'email' and 'phone'.
     */
    public function addData($info);
}

/**
 * MySQL connection that stores scraped contacts in a table
 * (default table: contact_info).
 */
class EmailDatabase extends mysqli implements MySQLTable
{
    private $_table = 'contact_info';

    /**
     * Connect to the database. The defaults are the values this constructor
     * previously hard-coded, so `new EmailDatabase` behaves as before.
     *
     * @param string $host
     * @param string $user
     * @param string $pass
     * @param string $dbname
     */
    public function __construct($host = 'localhost', $user = 'root', $pass = '', $dbname = 'email_collection')
    {
        parent::__construct($host, $user, $pass, $dbname);
    }

    /**
     * Switch to another table.
     *
     * @param string $name Table name. It is concatenated into the SQL text
     *                     as-is, so it must never come from untrusted input.
     */
    public function setTableName($name)
    {
        $this->_table = $name;
    }

    /**
     * Insert one contact. INSERT IGNORE is used so that a row whose e-mail is
     * already stored is skipped instead of raising a duplicate-key error.
     *
     * The values are scraped from third-party pages, so they are bound as
     * statement parameters rather than concatenated into the SQL text.
     *
     * @param array $info Record with the keys 'name', 'email' and 'phone';
     *                    missing or null entries are stored as ''.
     * @return bool TRUE on success, FALSE on failure.
     */
    public function addData($info)
    {
        $name  = isset($info['name'])  ? (string) $info['name']  : '';
        $email = isset($info['email']) ? (string) $info['email'] : '';
        $phone = isset($info['phone']) ? (string) $info['phone'] : '';

        $sql  = 'INSERT IGNORE INTO ' . $this->_table . ' (name, email, phone) VALUES (?, ?, ?)';
        $stmt = $this->prepare($sql);
        if ($stmt === false) {
            return false;
        }

        $stmt->bind_param('sss', $name, $email, $phone);
        $ok = $stmt->execute();
        $stmt->close();

        return $ok;
    }

    /**
     * Run a query after pinging the server.
     *
     * NOTE(review): mysqli::ping() is deprecated in recent PHP releases;
     * confirm whether the ping is still wanted before upgrading.
     *
     * @param string $query
     * @param int    $mode
     * @return mysqli_result|bool
     */
    #[\ReturnTypeWillChange]
    public function query($query, $mode = MYSQLI_STORE_RESULT)
    {
        $this->ping();

        return parent::query($query, $mode);
    }
}

/**
 * Parser that HttpCurl can hand a downloaded page to.
 */
interface HttpScraper
{
    /**
     * @param string|false $body Page content.
     * @param int          $head HTTP status code of the response.
     */
    public function parse($body, $head);
}

/**
 * Extracts name / e-mail / phone records from a page and, when a table is
 * attached, stores each record in it.
 */
class Scraper implements HttpScraper
{
    private $_table;

    /**
     * @param MySQLTable|null $t Optional storage target.
     */
    public function __construct($t = null)
    {
        $this->setTable($t);
    }

    public function __destruct()
    {
        $this->_table = null;
    }

    /**
     * Attach or detach the storage target. Values that are neither null nor
     * a MySQLTable are ignored and the current target is kept.
     *
     * @param MySQLTable|null $t
     */
    public function setTable($t)
    {
        if ($t === null || $t instanceof MySQLTable) {
            $this->_table = $t;
        }
    }

    /**
     * @return MySQLTable|null
     */
    public function getTable()
    {
        return $this->_table;
    }

    /**
     * Extract every contact block from a page.
     *
     * @param string|false $body Page content.
     * @param int          $head HTTP status code; only 200 is parsed.
     * @return array List of records, each with the keys 'name', 'email' and
     *               'phone' (null where the pattern did not match). Empty
     *               when the status is not 200 or no block was found.
     */
    public function parse($body, $head)
    {
        $agents = array();

        if ($head != 200 || !preg_match_all(TARGET_BLOCK, $body, $blocks)) {
            return $agents;
        }

        foreach ($blocks[0] as $block) {
            $agent = array(
                'name'  => $this->matchPattern(NAME, $block, 2),
                'email' => $this->matchPattern(EMAIL, $block, 1),
                'phone' => $this->matchPattern(PHONE, $block, 1),
            );
            $agents[] = $agent;

            // A Scraper may be built without a table; only store when one is attached.
            if ($this->_table !== null) {
                $this->_table->addData($agent);
            }
        }

        return $agents;
    }

    /**
     * Return one capture group of the first match of $pattern in $content.
     *
     * @param string $pattern PCRE pattern.
     * @param string $content Text to search.
     * @param int    $pos     Index of the capture group to return.
     * @return string|null The captured text, or null when there is no match.
     */
    public function matchPattern($pattern, $content, $pos)
    {
        if (preg_match($pattern, $content, $match) && isset($match[$pos])) {
            return $match[$pos];
        }

        return null;
    }
}

/**
 * Small cURL wrapper that downloads a page and optionally passes it to a parser.
 */
class HttpCurl
{
    protected $_cookie, $_parser, $_timeout;
    private $_ch, $_info, $_body, $_error;

    /**
     * @param HttpScraper|callable|null $p Optional parser.
     * @throws Exception When the cURL extension is not available.
     */
    public function __construct($p = null)
    {
        if (!function_exists('curl_init')) {
            throw new Exception('cURL not enabled!');
        }
        $this->setParser($p);
    }

    /**
     * Download $url; when $status is TRUE also run the parser on the result.
     *
     * @param string $url
     * @param bool   $status Pass PARSE_CONTENT (TRUE) to run the parser.
     * @return mixed Whatever the parser returned, or null when it was not run.
     */
    public function get($url, $status = FALSE)
    {
        $this->request($url);

        if ($status === TRUE) {
            return $this->runParser($this->_body, $this->getStatus());
        }

        return null;
    }

    /**
     * Fetch $url with cURL (following up to 5 redirects) and remember the
     * body, the transfer info and the error text.
     *
     * @param string $url
     */
    protected function request($url)
    {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($ch, CURLOPT_URL, $url);
        $this->_body = curl_exec($ch);
        $this->_info = curl_getinfo($ch);
        $this->_error = curl_error($ch);
        curl_close($ch);
    }

    /**
     * @return int HTTP status code of the last request, or 0 when no
     *             request has been made yet.
     */
    public function getStatus()
    {
        return isset($this->_info['http_code']) ? $this->_info['http_code'] : 0;
    }

    /**
     * @return array|null Transfer info of the last request as returned by curl_getinfo().
     */
    public function getHeader()
    {
        return $this->_info;
    }

    /**
     * @return string|false|null Body of the last request.
     */
    public function getBody()
    {
        return $this->_body;
    }

    public function __destruct()
    {
    }

    /**
     * Set the parser: an HttpScraper, a callable, or null for none.
     * Any other value is ignored and the current parser is kept.
     *
     * @param HttpScraper|callable|null $p
     */
    public function setParser($p)
    {
        if ($p === null || $p instanceof HttpScraper || is_callable($p)) {
            $this->_parser = $p;
        }
    }

    /**
     * Hand content and status code to the configured parser.
     *
     * @param string|false $content
     * @param int          $header HTTP status code.
     * @return mixed The parser's return value, or null when no parser is set.
     */
    public function runParser($content, $header)
    {
        if ($this->_parser === null) {
            return null;
        }

        if ($this->_parser instanceof HttpScraper) {
            return $this->_parser->parse($content, $header);
        }

        return call_user_func($this->_parser, $content, $header);
    }
}
?>
类"EmailDatabase"的构造函数通过用户名及密码连接MySQL。在XAMPP环境里,host是localhost, username 是 root 及password为空,不需要密码。
/**
 * Connect to the MySQL database.
 *
 * The defaults are the values that used to be hard-coded here, so calling
 * `new EmailDatabase` without arguments still connects the same way; other
 * environments can now pass their own settings.
 *
 * @param string $host
 * @param string $user
 * @param string $pass
 * @param string $dbname
 */
public function __construct($host = 'localhost', $user = 'root', $pass = '', $dbname = 'email_collection')
{
    parent::__construct($host, $user, $pass, $dbname);
}
/**
 * Insert one contact. INSERT IGNORE is used so that a row whose e-mail is
 * already stored is skipped instead of raising a duplicate-key error.
 *
 * The values are scraped from third-party pages, so they are bound as
 * statement parameters rather than concatenated into the SQL text. The
 * array keys are quoted strings; bare words such as $info[name] are
 * undefined constants and fail on PHP 8.
 *
 * @param array $info Record with the keys 'name', 'email' and 'phone';
 *                    missing or null entries are stored as ''.
 * @return bool TRUE on success, FALSE on failure.
 */
public function addData($info)
{
    $name  = isset($info['name'])  ? (string) $info['name']  : '';
    $email = isset($info['email']) ? (string) $info['email'] : '';
    $phone = isset($info['phone']) ? (string) $info['phone'] : '';

    // The table name is set by the developer, never by scraped input.
    $sql  = 'INSERT IGNORE INTO ' . $this->_table . ' (name, email, phone) VALUES (?, ?, ?)';
    $stmt = $this->prepare($sql);
    if ($stmt === false) {
        return false;
    }

    $stmt->bind_param('sss', $name, $email, $phone);
    $ok = $stmt->execute();
    $stmt->close();

    return $ok;
}

/**
 * Run a query after pinging the server.
 *
 * NOTE(review): mysqli::ping() is deprecated in recent PHP releases;
 * confirm whether the ping is still wanted before upgrading.
 *
 * @param string $query
 * @param int    $mode
 * @return mysqli_result|bool
 */
#[\ReturnTypeWillChange]
public function query($query, $mode = MYSQLI_STORE_RESULT)
{
    $this->ping();

    return parent::query($query, $mode);
}
/** Storage target for parsed records, or null when none is attached. */
private $_table;

/**
 * Create the scraper, optionally attaching a storage target.
 */
public function __construct($t = null)
{
    $this->setTable($t);
}

/**
 * Drop the reference to the storage target.
 */
public function __destruct()
{
    $this->_table = null;
}

/**
 * Attach or detach the storage target. Anything that is neither null nor
 * a MySQLTable is ignored and the current target is kept.
 */
public function setTable($t)
{
    if ($t !== null && !($t instanceof MySQLTable)) {
        return;
    }

    $this->_table = $t;
}
函数parse()也有变动:加入了对函数addData()的调用,取得资料后写入数据库表。
我们的 test.php:
<?php
include 'httpcurl.php';

// Listing URL without the page number; the number is appended per request.
$target = "http://<domain name>/negotiators?page=";
$startPage = $target . "1";

// Fetch the first page only to find out how many pages there are.
$scrapeContent = new Scraper;
$firstPage = new HttpCurl();
$firstPage->get($startPage);

// Start from zero pages so the loop below is skipped, rather than reading an
// undefined variable, when the first page cannot be fetched.
$lastPage = 0;
if ($firstPage->getStatus() === 200) {
    $match = $scrapeContent->matchPattern(LASTPAGE, $firstPage->getBody(), 1);
    // No "last" pager link on a page that loaded fine: scrape just that page.
    $lastPage = ($match === null) ? 1 : (int) $match;
}

$db = new EmailDatabase;
$scrapeContent = new Scraper($db);
$pages = new HttpCurl($scrapeContent);

// Download every page and let the scraper store what it finds.
for ($i = 1; $i <= $lastPage; $i++) {
    $targetPage = $target . $i;
    $pages->get($targetPage, PARSE_CONTENT);
}
?>
只有小小的变动:用类EmailDatabase创建对象$db,然后把它传给$scrapeContent。
// Open the MySQL connection that will receive the scraped contacts.
$db = new EmailDatabase;
// Give the scraper the database so parse() can store each contact it finds.
$scrapeContent = new Scraper($db);
// Fetcher that passes every downloaded page to the scraper.
$pages = new HttpCurl($scrapeContent);
// Website-specific PCRE patterns; they must be adapted to each target site.
// One whole contact block (the "s" modifier lets "." span line breaks).
define('TARGET_BLOCK','~<div class="negotiators-wrapper">(.*?)</div>(\r\n)</div>~s');
// Contact name: capture group 2 holds the link text.
define('NAME', '~<div class="negotiators-name"><a href="/negotiator/(.*?)">(.*?)</a></div>~');
// E-mail address: capture group 1.
define('EMAIL', '~<div class="negotiators-email">(.*?)</div>~');
// Phone number: capture group 1.
define('PHONE', '~<div class="negotiators-phone">(.*?)</div>~');
// Page number carried by the "last" pager link: capture group 1.
define('LASTPAGE', '~<li class="pager-last last"><a href="/negotiators\?page=(.*?)"~');
// Flag passed to HttpCurl::get() to run the parser on the downloaded page.
define('PARSE_CONTENT', TRUE);
这就是我们的PHP email提取脚本。记得,不要用来发垃圾邮件!
1. httpcurl.php
<?php
/********************************************************
 * These are website specific matching patterns         *
 * Change these matching patterns for each website      *
 * Else you will not get any results                    *
 ********************************************************/
define('TARGET_BLOCK','~<div class="negotiators-wrapper">(.*?)</div>(\r\n)</div>~s');
define('NAME', '~<div class="negotiators-name"><a href="/negotiator/(.*?)">(.*?)</a></div>~');
define('EMAIL', '~<div class="negotiators-email">(.*?)</div>~');
define('PHONE', '~<div class="negotiators-phone">(.*?)</div>~');
define('LASTPAGE', '~<li class="pager-last last"><a href="/negotiators\?page=(.*?)"~');
define('PARSE_CONTENT', TRUE);

/**
 * Interface MySQLTable: storage target that accepts one scraped record at a time.
 */
interface MySQLTable
{
    /**
     * @param array $info Record with the keys 'name', 'email' and 'phone'.
     */
    public function addData($info);
}

// Class EmailDatabase
// Use the code below to create the table
/*****************************************************
CREATE TABLE IF NOT EXISTS `contact_info` (
  `id` int(12) NOT NULL AUTO_INCREMENT,
  `name` varchar(128) NOT NULL,
  `email` varchar(128) NOT NULL,
  `phone` varchar(128) NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `email` (`email`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
*******************************************************/
class EmailDatabase extends mysqli implements MySQLTable
{
    private $_table = 'contact_info'; // set default table

    /**
     * Connect to the database. The defaults are the values this constructor
     * previously hard-coded, so `new EmailDatabase` behaves as before.
     *
     * @param string $host
     * @param string $user
     * @param string $pass
     * @param string $dbname
     */
    public function __construct($host = 'localhost', $user = 'root', $pass = '', $dbname = 'email_collection')
    {
        parent::__construct($host, $user, $pass, $dbname);
    }

    /**
     * Use this function to change to another table.
     *
     * @param string $name Table name. It is concatenated into the SQL text
     *                     as-is, so it must never come from untrusted input.
     */
    public function setTableName($name)
    {
        $this->_table = $name;
    }

    /**
     * Write one contact to the table. INSERT IGNORE is used so that a row
     * whose e-mail is already stored is skipped instead of raising a
     * duplicate-key error.
     *
     * The values are scraped from third-party pages, so they are bound as
     * statement parameters rather than concatenated into the SQL text.
     *
     * @param array $info Record with the keys 'name', 'email' and 'phone';
     *                    missing or null entries are stored as ''.
     * @return bool TRUE on success, FALSE on failure.
     */
    public function addData($info)
    {
        $name  = isset($info['name'])  ? (string) $info['name']  : '';
        $email = isset($info['email']) ? (string) $info['email'] : '';
        $phone = isset($info['phone']) ? (string) $info['phone'] : '';

        $sql  = 'INSERT IGNORE INTO ' . $this->_table . ' (name, email, phone) VALUES (?, ?, ?)';
        $stmt = $this->prepare($sql);
        if ($stmt === false) {
            return false;
        }

        $stmt->bind_param('sss', $name, $email, $phone);
        $ok = $stmt->execute();
        $stmt->close();

        return $ok;
    }

    /**
     * Execute a MySQL query after pinging the server.
     *
     * NOTE(review): mysqli::ping() is deprecated in recent PHP releases;
     * confirm whether the ping is still wanted before upgrading.
     *
     * @param string $query
     * @param int    $mode
     * @return mysqli_result|bool
     */
    #[\ReturnTypeWillChange]
    public function query($query, $mode = MYSQLI_STORE_RESULT)
    {
        $this->ping();

        return parent::query($query, $mode);
    }
}

/**
 * Interface HttpScraper: parser that HttpCurl can hand a downloaded page to.
 */
interface HttpScraper
{
    /**
     * @param string|false $body Page content.
     * @param int          $head HTTP status code of the response.
     */
    public function parse($body, $head);
}

/**
 * Class Scraper: extracts name / e-mail / phone records from a page and,
 * when a table is attached, stores each record in it.
 */
class Scraper implements HttpScraper
{
    private $_table; // Store MySQL table if want to write to database.

    /**
     * @param MySQLTable|null $t Optional storage target.
     */
    public function __construct($t = null)
    {
        $this->setTable($t);
    }

    // Delete table info at destructor
    public function __destruct()
    {
        $this->_table = null;
    }

    /**
     * Set table info to private variable $_table. Values that are neither
     * null nor a MySQLTable are ignored and the current target is kept.
     *
     * @param MySQLTable|null $t
     */
    public function setTable($t)
    {
        if ($t === null || $t instanceof MySQLTable) {
            $this->_table = $t;
        }
    }

    /**
     * Get table info.
     *
     * @return MySQLTable|null
     */
    public function getTable()
    {
        return $this->_table;
    }

    /**
     * Parse function: extract every contact block from a page.
     *
     * @param string|false $body Page content.
     * @param int          $head HTTP status code; only 200 is parsed.
     * @return array List of records, each with the keys 'name', 'email' and
     *               'phone' (null where the pattern did not match). Empty
     *               when the status is not 200 or no block was found.
     */
    public function parse($body, $head)
    {
        $agents = array();

        if ($head != 200 || !preg_match_all(TARGET_BLOCK, $body, $blocks)) {
            return $agents;
        }

        foreach ($blocks[0] as $block) {
            $agent = array(
                'name'  => $this->matchPattern(NAME, $block, 2),
                'email' => $this->matchPattern(EMAIL, $block, 1),
                'phone' => $this->matchPattern(PHONE, $block, 1),
            );
            $agents[] = $agent;

            // A Scraper may be built without a table; only store when one is attached.
            if ($this->_table !== null) {
                $this->_table->addData($agent);
            }
        }

        return $agents;
    }

    /**
     * Return matched info: one capture group of the first match of $pattern.
     *
     * @param string $pattern PCRE pattern.
     * @param string $content Text to search.
     * @param int    $pos     Index of the capture group to return.
     * @return string|null The captured text, or null when there is no match.
     */
    public function matchPattern($pattern, $content, $pos)
    {
        if (preg_match($pattern, $content, $match) && isset($match[$pos])) {
            return $match[$pos];
        }

        return null;
    }
}

/**
 * Class HttpCurl: small cURL wrapper that downloads a page and optionally
 * passes it to a parser.
 */
class HttpCurl
{
    protected $_cookie, $_parser, $_timeout;
    private $_ch, $_info, $_body, $_error;

    /**
     * Check curl activated and set the parser as well.
     *
     * @param HttpScraper|callable|null $p Optional parser.
     * @throws Exception When the cURL extension is not available.
     */
    public function __construct($p = null)
    {
        if (!function_exists('curl_init')) {
            throw new Exception('cURL not enabled!');
        }
        $this->setParser($p);
    }

    /**
     * Get web page and, when $status is TRUE, run the parser on it.
     *
     * @param string $url
     * @param bool   $status Pass PARSE_CONTENT (TRUE) to run the parser.
     * @return mixed Whatever the parser returned, or null when it was not run.
     */
    public function get($url, $status = FALSE)
    {
        $this->request($url);

        if ($status === TRUE) {
            return $this->runParser($this->_body, $this->getStatus());
        }

        return null;
    }

    /**
     * Run cURL to get the web page source (following up to 5 redirects) and
     * remember the body, the transfer info and the error text.
     *
     * @param string $url
     */
    protected function request($url)
    {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($ch, CURLOPT_URL, $url);
        $this->_body = curl_exec($ch);
        $this->_info = curl_getinfo($ch);
        $this->_error = curl_error($ch);
        curl_close($ch);
    }

    /**
     * Get http_code.
     *
     * @return int HTTP status code of the last request, or 0 when no
     *             request has been made yet.
     */
    public function getStatus()
    {
        return isset($this->_info['http_code']) ? $this->_info['http_code'] : 0;
    }

    /**
     * Get web page header information.
     *
     * @return array|null Transfer info of the last request as returned by curl_getinfo().
     */
    public function getHeader()
    {
        return $this->_info;
    }

    /**
     * Get web page content.
     *
     * @return string|false|null Body of the last request.
     */
    public function getBody()
    {
        return $this->_body;
    }

    public function __destruct()
    {
    }

    /**
     * Set parser, either object or callback function (or null for none).
     * Any other value is ignored and the current parser is kept.
     *
     * @param HttpScraper|callable|null $p
     */
    public function setParser($p)
    {
        if ($p === null || $p instanceof HttpScraper || is_callable($p)) {
            $this->_parser = $p;
        }
    }

    /**
     * Execute parser.
     *
     * @param string|false $content
     * @param int          $header HTTP status code.
     * @return mixed The parser's return value, or null when no parser is set.
     */
    public function runParser($content, $header)
    {
        if ($this->_parser === null) {
            return null;
        }

        if ($this->_parser instanceof HttpScraper) {
            return $this->_parser->parse($content, $header);
        }

        return call_user_func($this->_parser, $content, $header);
    }
}
?>
2. test.php
<?php
include 'httpcurl.php'; // include lib file

// Set our target's url, remember not to include the number in pagination
$target = "http://<website domain>/negotiators?page=";
$startPage = $target . "1"; // Set first page

$scrapeContent = new Scraper;
$firstPage = new HttpCurl();
$firstPage->get($startPage); // get first page content

// Start from zero pages so the loop below is skipped, rather than reading an
// undefined variable, when the first page cannot be fetched.
$lastPage = 0;
if ($firstPage->getStatus() === 200) {
    // get total page info from first page
    $match = $scrapeContent->matchPattern(LASTPAGE, $firstPage->getBody(), 1);
    // No "last" pager link on a page that loaded fine: scrape just that page.
    $lastPage = ($match === null) ? 1 : (int) $match;
}

$db = new EmailDatabase; // can be excluded if do not want to write to database
$scrapeContent = new Scraper($db); // can be excluded as well
$pages = new HttpCurl($scrapeContent);

// Loop from the first page to the last and parse every page into the database
for ($i = 1; $i <= $lastPage; $i++) {
    $targetPage = $target . $i;
    $pages->get($targetPage, PARSE_CONTENT);
}
?>