-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
165 additions
and
92 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
*.log | ||
*.lock | ||
*.pid | ||
*.sh | ||
.idea | ||
.buildpath | ||
.project | ||
.settings | ||
.DS_Store | ||
logs | ||
vendor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
## 简介 | ||
|
||
webman的爬山虎插件,[PHPCreeper | 爬山虎](http://www.phpcreeper.com):让爬取工作变得更加简单。 | ||
webman的爬山虎插件,[PHPCreeper | 爬山虎](https://github.com/blogdaren/PHPCreeper):让爬取工作变得更加简单。 | ||
|
||
|
||
## 安装 | ||
|
@@ -9,14 +9,16 @@ composer require blogdaren/webman-phpcreeper | |
``` | ||
|
||
## 使用说明 | ||
* 首先要明确一个概念:爬山虎有三种容器分别是:生产器、下载器、解析器。 | ||
* 编写一个爬虫非常简单: 配置搞定以后,只需要在对应容器内的`onXXXX`回调方法内编写业务逻辑即可。 | ||
* 由于爬虫应用相对WEB应用而言比较独立,所以app内的爬虫目录结构请自行部署。 | ||
* 首先在自己的app项目下手动创建有效的爬虫目录。 | ||
* 在爬虫目录内创建相应的容器【生产器、下载器和解析器】句柄类Handler。 | ||
* 由于爬虫应用相对WEB应用而言比较独立,所以app内的爬虫目录结构建议自行独立部署。 | ||
* 首先在自己的app项目下手动创建有效的爬虫目录, 比如:app/spider。 | ||
* 然后在爬虫目录内(app/spider)创建相应的容器句柄类Handler。 | ||
* 最后在对应容器内的`onXXXX`回调方法内编写业务逻辑。 | ||
|
||
## 举个栗子 | ||
## 举个例子 | ||
|
||
> 模拟需求是抓取未来7天内的天气预报 | ||
> 模拟需求是抓取未来7天内北京的天气预报 | ||
1、创建爬虫目录:app/spider | ||
|
||
|
@@ -27,40 +29,67 @@ composer require blogdaren/webman-phpcreeper | |
* @script Myproducer.php | ||
* @brief 生产器Handler | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.0 | ||
* @modify 2022-04-01 | ||
*/ | ||
namespace app\spider; | ||
use Workerman\Timer; | ||
use PHPCreeper\Timer; | ||
use PHPCreeper\Crontab; | ||
class Myproducer extends \Webman\PHPCreeper\Producer | ||
{ | ||
/** | ||
* @brief 抓取未来7天内的天气预报DEMO | ||
* @brief 抓取未来7天内北京的天气预报 | ||
* | ||
* @return mixed | ||
*/ | ||
public function makeTask() | ||
{ | ||
//Create One Task | ||
//注意:这里说的版本并不是爬山虎插件的版本,而是爬山虎引擎的版本. | ||
//注意:这里说的版本并不是爬山虎插件的版本,而是爬山虎引擎的版本. | ||
//注意:这里说的版本并不是爬山虎插件的版本,而是爬山虎引擎的版本. | ||
//在v1.6.0之前,爬山虎主要使用OOP风格的API来创建任务: | ||
//$producer->newTaskMan()->setXXX()->setXXX()->createTask() | ||
//$producer->newTaskMan()->setXXX()->setXXX()->createTask($task) | ||
//$producer->newTaskMan()->setXXX()->setXXX()->createMultiTask() | ||
//$producer->newTaskMan()->setXXX()->setXXX()->createMultiTask($task) | ||
//自v1.6.0开始,爬山虎提供了更加短小便捷的API来创建任务, 而且参数类型更加丰富: | ||
//注意:仅仅只是扩展,原有的API依然可以正常使用,提倡扩展就是为了保持向下兼容。 | ||
//1. 单任务API:$task参数类型可支持:[字符串 | 一维数组] | ||
//1. 单任务API:$producer->createTask($task); | ||
//2. 多任务API:$task参数类型可支持:[字符串 | 一维数组 | 二维数组] | ||
//2. 多任务API:$producer->createMultiTask($task); | ||
//使用字符串:不推荐使用,配置受限,需要自行处理抓取结果 | ||
//$task = "http://www.weather.com.cn/weather/101010100.shtml"; | ||
//$producer->createTask($task); | ||
//$producer->createMultiTask($task); | ||
$task = array( | ||
'url' => 'http://www.weather.com.cn/weather/101010100.shtml', | ||
'rule' => array( | ||
'time' => ['div#7d ul.t.clearfix h1', 'text'], | ||
'active' => true, //是否激活当前任务,只有配置为false才会冻结任务,默认true | ||
'url' => 'http://www.weather.com.cn/weather/101010100.shtml', | ||
"rule" => array( //如果该字段留空默认将返回原始下载数据 | ||
'time' => ['div#7d ul.t.clearfix h1', 'text', [], 'function($field_name, $data){ | ||
return "具体日子: " . $data; | ||
}'], //关于回调字符串的用法务必详看官方手册 | ||
'wea' => ['div#7d ul.t.clearfix p.wea', 'text'], | ||
'tem' => ['div#7d ul.t.clearfix p.tem', 'text'], | ||
'wind' => ['div#7d ul.t.clearfix p.win i', 'text'], | ||
), | ||
'context' => array( | ||
'cache_enabled' => true, | ||
'cache_directory' => '/tmp/DownloadCache4PHPCreeper/download/', | ||
'allow_url_repeat' => true, | ||
), | ||
), | ||
'rule_name' => '', //如果留空将使用md5($task_id)作为规则名 | ||
'refer' => '', | ||
'type' => 'text', //可以自由设定类型 | ||
'method' => 'get', | ||
'context' => $context??[], //任务私有context,其上下文成员与全局context完全相同,最终会采用合并覆盖策略 | ||
); | ||
$this->newTaskMan()->createTask($task); | ||
$this->createTask($task); | ||
} | ||
/** | ||
|
@@ -72,8 +101,15 @@ class Myproducer extends \Webman\PHPCreeper\Producer | |
*/ | ||
public function onProducerStart($producer) | ||
{ | ||
//$this->makeTask(); | ||
Timer::add(2, [$this, "makeTask"], [], true); | ||
$this->makeTask(); | ||
//使用Timer定时器创建任务 | ||
//Timer::add(5, [$this, "makeTask"], [], true); | ||
//使用Crontab定时器创建任务 | ||
//new Crontab('*/5 * * * * *', function(){ | ||
//$this->makeTask(); | ||
//}); | ||
} | ||
/** | ||
|
@@ -99,6 +135,7 @@ class Myproducer extends \Webman\PHPCreeper\Producer | |
} | ||
} | ||
``` | ||
|
||
3、创建下载器句柄类文件 app/spider/Mydownloader.php | ||
|
@@ -108,8 +145,7 @@ class Myproducer extends \Webman\PHPCreeper\Producer | |
* @script Mydownloader.php | ||
* @brief 下载器Handler | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.0 | ||
* @modify 2022-04-01 | ||
* @create 2022-04-01 | ||
*/ | ||
namespace app\spider; | ||
|
@@ -216,8 +252,7 @@ class Mydownloader extends \Webman\PHPCreeper\Downloader | |
* @script Myparser.php | ||
* @brief 解析器Handler | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.0 | ||
* @modify 2022-04-01 | ||
* @create 2022-04-01 | ||
*/ | ||
namespace app\spider; | ||
|
@@ -268,13 +303,7 @@ class Myparser extends \Webman\PHPCreeper\Parser | |
*/ | ||
public function onParserMessage($parser, $connection, $download_data) | ||
{ | ||
/* | ||
*$rule = array( | ||
* 'hotline' => ['div.qxfw-body > p:eq(1)', 'text'], | ||
*); | ||
*$data = $parser->extractor->setHtml($download_data)->setRule($rule)->extract(); | ||
*pprint($data, __METHOD__); | ||
*/ | ||
//pprint(strlen($download_data), __METHOD__); | ||
} | ||
/** | ||
|
@@ -341,7 +370,7 @@ return [ | |
``` | ||
|
||
## 注意事项 | ||
* 爬虫自有的配置文件要保持相对独立; | ||
* 爬虫应用自有的配置文件要保持相对独立; | ||
* process配置内的关于进程构造函数的配置一般不要动; | ||
* 目前需要手动设置下载器的$downloader->setClientSocketAddress([]); | ||
* 依赖redis服务,所以务必启动redis-server; | ||
|
@@ -354,6 +383,6 @@ return [ | |
* 爬山虎中文官方网站:[http://www.phpcreeper.com](http://www.phpcreeper.com) | ||
* 中文开发文档主节点:[http://www.blogdaren.com/docs/](http://www.blogdaren.com/docs/) | ||
* 中文开发文档备节点:[http://www.phpcreeper.com/docs/](http://www.phpcreeper.com/docs/) | ||
* 爬山虎内核项目地址:[https://github.com/blogdaren/PHPCreeper](https://github.com/blogdaren/PHPCreeper) | ||
* 爬山虎开源项目地址:[https://github.com/blogdaren/PHPCreeper](https://github.com/blogdaren/PHPCreeper) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,7 @@ | |
* @script Downloader.php | ||
* @brief wrapper for PHPCreeper.Downloader | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.1 | ||
* @modify 2022-04-08 | ||
* @create 2022-04-08 | ||
*/ | ||
|
||
namespace Webman\PHPCreeper; | ||
|
@@ -16,10 +15,11 @@ class Downloader extends \PHPCreeper\Downloader | |
/** | ||
* @brief downloader callback | ||
*/ | ||
const CALLBACK_MAPS = [ | ||
public const CALLBACK_MAPS = [ | ||
'onBeforeDownload', | ||
'onStartDownload', | ||
'onAfterDownload', | ||
'onFailDownload', | ||
'onDownloaderMessage', | ||
'onDownloaderReload', | ||
'onDownloaderStart', | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,7 @@ | |
* @script Parser.php | ||
* @brief wrapper for PHPCreeper.Parser | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.1 | ||
* @modify 2022-04-08 | ||
* @create 2022-04-08 | ||
*/ | ||
|
||
namespace Webman\PHPCreeper; | ||
|
@@ -16,10 +15,11 @@ class Parser extends \PHPCreeper\Parser | |
/** | ||
* @brief parser callback | ||
*/ | ||
const CALLBACK_MAPS = [ | ||
public const CALLBACK_MAPS = [ | ||
'onParserStart', | ||
'onParserStop', | ||
'onParserReload', | ||
'onParserConnect', | ||
'onParserExtractField', | ||
'onParserFindUrl', | ||
'onParserMessage', | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,7 @@ | |
* @script Producer.php | ||
* @brief wrapper for PHPCreeper.Producer | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.1 | ||
* @modify 2022-04-08 | ||
* @create 2022-04-08 | ||
*/ | ||
|
||
namespace Webman\PHPCreeper; | ||
|
@@ -16,7 +15,7 @@ class Producer extends \PHPCreeper\Producer | |
/** | ||
* @brief producer callback | ||
*/ | ||
const CALLBACK_MAPS = [ | ||
public const CALLBACK_MAPS = [ | ||
'onProducerStart', | ||
'onProducerStop', | ||
'onProducerReload', | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,18 @@ | ||
<?php | ||
//务必参照爬山虎插件文档来运行模拟DEMO: | ||
//1. 首先在自己的app项目下手动创建有效的爬虫目录; | ||
//2. 在爬虫目录内创建相应的生产器、下载器和解析器Handler | ||
/** | ||
* @script process.php | ||
* @brief 自定义进程配置 | ||
* | ||
* 务必参照爬山虎插件文档来运行DEMO | ||
* | ||
* 1. 首先在自己的应用项目下手动创建有效的爬虫目录, 比如: app/spider | ||
* 2. 在爬虫目录(app/spider)内创建相应的生产器、下载器和解析器Handler | ||
* | ||
* @author blogdaren<[email protected]> | ||
* @link http://www.phpcreeper.com | ||
* @create 2022-04-08 | ||
*/ | ||
|
||
use app\spider\Myproducer; | ||
use app\spider\Mydownloader; | ||
use app\spider\Myparser; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,16 +3,22 @@ | |
* @script database.php | ||
* @brief 爬虫独立数据库配置文件: 内置支持Medoo | ||
* @author blogdaren<[email protected]> | ||
* @version 1.0.0 | ||
* @modify 2022-04-01 | ||
* @link http://www.phpcreeper.com | ||
* @create 2022-04-08 | ||
*/ | ||
|
||
|
||
return array( | ||
'redis' => array( | ||
'prefix' => 'Demo', | ||
'host' => '127.0.0.1', | ||
'port' => 6379, | ||
'database' => 0, | ||
'host' => '127.0.0.1', | ||
'port' => 6379, | ||
'database' => '0', | ||
'auth' => false, | ||
'pass' => 'guest', | ||
'prefix' => 'PHPCreeper', | ||
'connection_timeout' => 5, | ||
'read_write_timeout' => 0, | ||
//'use_red_lock' => true, //默认使用更安全的分布式红锁 | ||
), | ||
'dbo' => array( | ||
'test' => array( | ||
|
Oops, something went wrong.