使用phpQuery采集图片示例-php采集方式之一

下载地址:http://code.google.com/p/phpquery/

链接：https://pan.baidu.com/s/16uaFFwKtcdo6xBAqGl-_cw
提取码：xm9f
复制这段内容后打开百度网盘手机App，操作更方便哦

phpQuery是一个开源项目，用法跟Python的BeautifulSoup很像

phpQuery是一个基于PHP的服务端开源项目，它可以让PHP开发人员轻松处理DOM文档内容。更有意思的是，它采用了jQuery的思想，使得可以像使用jQuery一样处理页面内容，获取想要的页面信息。

一个网上的phpQuery使用示例：

<?php
// Demo script: load one listing page with phpQuery, collect the lazy-loaded
// image URLs and hand them to FileUtil for concurrent download.
//
// NOTE: the published listing used typographic quotes (“ ” ‘ ’) around every
// string, which PHP cannot parse; they are restored to plain ASCII quotes here.

// Declare the response encoding so the Chinese status messages render correctly.
header("Content-Type: text/html;charset=utf-8");
// Load the phpQuery core library.
require 'phpQuery/phpQuery.php';
// Load the batch download helper.
require 'phpQuery/FileUtil.php';

// Directory the images are stored in.
$base_path = dirname(__FILE__).'/photo';
// The downloader opens its target files directly, so the directory has to
// exist before any download starts.
if (!is_dir($base_path) && !mkdir($base_path, 0777, true) && !is_dir($base_path)) {
    exit("Unable to create image directory: ".$base_path."\n");
}

// Fetch the page over the network. The pq() calls below do not receive the
// document explicitly, so they presumably operate on this most recently
// loaded document — confirm against the phpQuery documentation.
phpQuery::newDocumentFile("http://www.doutula.com/article/list/?page=1");

// Extract the page title.
$title = pq("title")->text();
print("标题".$title);
print("\n");

// Extract the image addresses; the real URL is read from the "data-original"
// attribute of each element carrying the "lazy" class.
$content = pq(".lazy");
$i = 0;
$download_config = array();
foreach ($content as $li) {
    // Count every matched element so file numbering follows page order.
    $i++;
    $url = pq($li)->attr("data-original");
    // Skip elements without a usable URL instead of queueing an empty request.
    if (!$url) {
        continue;
    }
    // Each entry is array(source URL, target file path).
    $download_config[] = array($url, $base_path.'/'.$i.'.jpg');
}

print("数据获取完毕，本页一共".count($download_config)."个图片，正在下载");
print("\n");

// 2 concurrent transfers per batch, 10 second timeout.
$obj = new FileUtil($download_config, 2, 10);
$handle_num = $obj->download();
print "下载完成数目：".$handle_num;

print("\n");
?>

FileUtil.php

<?php
/**
 * Batch file downloader that fetches several URLs concurrently inside one PHP
 * process using the curl_multi_* API.
 *
 * @author 黑小马
 *
 * public  download  run every configured download, batch by batch
 * public  process   download one batch concurrently
 * private to_log    append the outcome of one batch to the log file
 *
 * NOTE: the published listing used typographic quotes (“ ” ‘ ’) in place of
 * ASCII quotes, which PHP cannot parse; they are restored here.
 */
class FileUtil {

    // Pending downloads: list of array(0 => source URL, 1 => target file path).
    private $download_config = array();

    // Maximum number of simultaneous transfers per batch.
    private $max_process_num = 10;

    // Timeout in seconds applied to each transfer.
    private $timeout = 10;

    // Path of the log file.
    private $logfile = null;

    /**
     * Set up the downloader.
     *
     * @param Array  $download_config List of array(url, target path) entries
     * @param Int    $max_process_num Maximum simultaneous transfers (values below 1 are treated as 1)
     * @param Int    $timeout         Per-transfer timeout in seconds (0 disables the limit)
     * @param String $logfile         Log file path; defaults to a dated file next to this class
     */
    public function __construct($download_config, $max_process_num=10, $timeout=10, $logfile=''){
        $this->download_config = $download_config;
        // A batch size below 1 would make download() remove nothing from the
        // queue on each pass and therefore never terminate.
        $this->max_process_num = max(1, (int)$max_process_num);
        $this->timeout = max(0, (int)$timeout);

        if ($logfile) {
            $this->logfile = $logfile;
        } else {
            $this->logfile = dirname(__FILE__).'/batch_download_'.date('Ymd').'.log';
        }
    }

    /**
     * Run all configured downloads, at most $max_process_num at a time.
     *
     * @return Int Number of entries processed (successful or not); per-file
     *             outcomes are written to the log file.
     */
    public function download(){

        $handle_num = 0;

        while (count($this->download_config) > 0) {

            // Never take more than the configured batch size.
            $process_num = min($this->max_process_num, count($this->download_config));

            // Remove the next batch from the front of the queue.
            $batch = array_splice($this->download_config, 0, $process_num);

            $result = $this->process($batch);

            $this->to_log($batch, $result);

            $handle_num += count($result);
        }

        return $handle_num;
    }

    /**
     * Download one batch of files concurrently.
     *
     * @param Array $download_config Entries of array(url, target path) for this batch
     * @return Array Map of entry key => bool, true when the transfer finished without a curl error
     */
    public function process($download_config){

        // Open target file handles, keyed like $download_config.
        $fp = array();

        // curl easy handles, keyed like $download_config.
        $ch = array();

        // curl result code per entry, filled once a transfer has finished.
        $codes = array();

        $result = array();

        $mh = curl_multi_init();

        foreach ($download_config as $k => $config) {
            // Make sure the target directory exists before opening the file.
            $dir = dirname($config[1]);
            if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
                continue;
            }

            // 'wb' instead of 'a': re-running a download must replace the
            // file, not append a second copy of the bytes to it.
            $handle = fopen($config[1], 'wb');
            if ($handle === false) {
                // Entry stays without a curl handle and is reported as failed below.
                continue;
            }

            $fp[$k] = $handle;
            $ch[$k] = curl_init();

            curl_setopt($ch[$k], CURLOPT_URL, $config[0]);
            // Stream the body straight into the file. CURLOPT_RETURNTRANSFER is
            // deliberately not set: it would override CURLOPT_FILE and buffer
            // the whole body in memory.
            curl_setopt($ch[$k], CURLOPT_FILE, $fp[$k]);
            curl_setopt($ch[$k], CURLOPT_HEADER, 0);
            // Treat HTTP status >= 400 as a failed transfer so error pages are
            // not saved as if they were the requested file.
            curl_setopt($ch[$k], CURLOPT_FAILONERROR, true);
            curl_setopt($ch[$k], CURLOPT_CONNECTTIMEOUT, $this->timeout);
            curl_setopt($ch[$k], CURLOPT_TIMEOUT, $this->timeout);
            curl_setopt($ch[$k], CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');

            curl_multi_add_handle($mh, $ch[$k]);
        }

        $active = null;

        do {
            do {
                $status = curl_multi_exec($mh, $active);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            // Block until there is socket activity instead of spinning the CPU.
            // A return of -1 means select could not wait; back off briefly.
            if ($active && $status === CURLM_OK && curl_multi_select($mh, 1.0) === -1) {
                usleep(1000);
            }
        } while ($active && $status === CURLM_OK);

        // Collect the per-transfer result codes reported by the multi handle.
        while (($info = curl_multi_info_read($mh)) !== false) {
            foreach ($ch as $k => $handle) {
                if ($handle === $info['handle']) {
                    $codes[$k] = $info['result'];
                    break;
                }
            }
        }

        // Release handles and files, and decide the outcome of every entry.
        foreach ($download_config as $k => $config) {
            if (!isset($ch[$k])) {
                $result[$k] = false;
                continue;
            }

            curl_multi_remove_handle($mh, $ch[$k]);
            fclose($fp[$k]);

            $ok = isset($codes[$k]) && $codes[$k] === CURLE_OK;

            // Do not leave an empty or partial file behind for a failed transfer.
            if (!$ok && file_exists($config[1])) {
                unlink($config[1]);
            }

            $result[$k] = $ok;
        }

        curl_multi_close($mh);

        return $result;
    }

    /**
     * Append one line per processed entry to the log file.
     *
     * @param Array $data Batch entries of array(url, target path)
     * @param Array $flag Map of entry key => bool download outcome
     */
    private function to_log($data, $flag){

        $tmp_log = '';

        foreach ($data as $k => $v) {
            // Print an explicit 1/0: concatenating a boolean false would
            // produce an empty status field.
            $status = empty($flag[$k]) ? '0' : '1';
            $tmp_log .= '['.date('Y-m-d H:i:s').'] url:'.$v[0].' file:'.$v[1].' status:'.$status.PHP_EOL;
        }

        // Create the log directory when it does not exist yet.
        if (!is_dir(dirname($this->logfile))) {
            mkdir(dirname($this->logfile), 0777, true);
        }

        file_put_contents($this->logfile, $tmp_log, FILE_APPEND);
    }

}
?>

猜你喜欢