PHP 网页爬虫:一个简单的类

/**
 * Minimal web page fetcher built on the cURL extension.
 *
 * Downloads the page at a given URL and extracts a few values (title,
 * charset) from the raw HTML with regular expressions.
 */
class PhpSpiders
{
    /** @var string User-Agent header sent with every request. */
    public $useragent;

    /** @var string|null Page title; filled in by getTitle() when one is found. */
    public $title;

    /** @var array|null preg_match() result of the charset lookup; index 1 holds the charset name. */
    public $encoding;

    /** @var int|null HTTP status code reported by cURL for the last request made by getData(). */
    public $status;

    /** @var string URL to download. */
    public $url;

    /** @var mixed Page text; not used by any method of this class. */
    public $text;

    /** @var string|false|null Raw response body; false after a failed transfer, null before any request. */
    public $content;

    /** @var array Regular expressions keyed by the value they extract. */
    public $pattern_key = array(
        // Non-greedy so a later </title> (e.g. inside inline SVG) is not swallowed.
        // No "u" modifier: with it preg_match() fails outright on pages that are not valid UTF-8.
        'title' => '/<title\b[^>]*>(.*?)<\/title>/is',
        // Meta description; [dD] instead of [d|D], which also accepted a literal "|".
        'descriptions' => '/<meta +name="[dD]escription" +content="(.*)" +\/>/',
        // charset=... with an optional single or double quote before the name.
        'charset' => '/charset=["\']?([\w-]+)/i',
    );

    /**
     * @param string $url URL of the page to download.
     */
    public function __construct($url)
    {
        $this->url = $url;
        $this->useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36";
    }

    /**
     * Downloads $this->url and stores the response body in $this->content.
     *
     * Also records the HTTP status code in $this->status. On a transfer
     * failure the cURL error message is echoed and $this->content is false.
     *
     * @return bool true when the transfer succeeded, false otherwise.
     */
    public function getData()
    {
        $curl = curl_init();
        if ($curl === false) {
            return false;
        }

        curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        // Return the response as a string instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_USERAGENT, $this->useragent);
        curl_setopt($curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
        // NOTE(review): peer certificate verification is switched off, so HTTPS
        // responses are not authenticated. Kept for backward compatibility;
        // enable it if the fetched content is trusted for anything.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($curl, CURLOPT_URL, $this->url);

        // curl_exec() signals failure by returning false; it does not throw.
        $this->content = curl_exec($curl);
        $this->status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        $error = ($this->content === false) ? curl_error($curl) : null;
        curl_close($curl);

        if ($error !== null) {
            echo $error;
            return false;
        }

        return true;
    }

    /**
     * Looks up the charset declared in the already-downloaded content.
     *
     * Does not download anything itself; call getData() first.
     *
     * @return string|null Charset name, or null when no content is loaded or
     *                     no charset declaration is found.
     */
    public function getEncoding()
    {
        if (!$this->content) {
            return null;
        }

        // The match array is stored on the object, as callers may read it directly.
        preg_match($this->pattern_key['charset'], $this->content, $this->encoding);

        return isset($this->encoding[1]) ? $this->encoding[1] : null;
    }

    /**
     * Returns the text between the page's <title> tags.
     *
     * Downloads the page first when no content has been loaded yet; content
     * that is already present is reused rather than fetched again.
     *
     * @return string|null Title text exactly as found (not trimmed), or null
     *                     when the download fails or the page has no title.
     */
    public function getTitle()
    {
        if (!$this->content && $this->getData() === false) {
            return null;
        }

        if (!preg_match($this->pattern_key['title'], (string) $this->content, $keyContent)) {
            return null;
        }

        $this->title = $keyContent[1];

        return $this->title;
    }
}
// Example usage: download the curl_setopt manual page and print its <title>.
$url = "http://php.net/manual/en/function.curl-setopt.php";
$spider = new PhpSpiders($url);
// getTitle() downloads the page itself when needed, so a separate getData()
// call here would only fetch the same URL a second time.
print_r($spider->getTitle());
微信公众号
手机浏览(小程序)
0
分享到:
没有账号? 忘记密码?