/**
 * Minimal web-page spider.
 *
 * Downloads a single URL with cURL and extracts pieces of metadata (title,
 * charset) from the returned markup using the regular expressions stored in
 * $pattern_key.
 */
class PhpSpiders
{
    /** @var string User-Agent header sent with every request. */
    public $useragent;

    /** @var string|null Page title; populated by getTitle(). */
    public $title;

    /** @var string|null Charset name found in the page; populated by getEncoding(). */
    public $encoding;

    /** @var int|null HTTP status code of the last completed transfer; populated by getData(). */
    public $status;

    /** @var string URL to download. */
    public $url;

    /** @var string|null Page text; not populated by this class. */
    public $text;

    /** @var string|false|null Raw response body; null before any fetch, false after a failed fetch. */
    public $content;

    /** @var string|null cURL error message of the last failed fetch, null otherwise. */
    public $error;

    /**
     * Extraction rules, keyed by the piece of metadata they capture (group 1).
     *
     * No "u" modifier is used on purpose: with it, preg_match() fails outright
     * on pages that are not valid UTF-8, which a spider meets routinely.
     *
     * @var array<string, string>
     */
    public $pattern_key = [
        // First <title> element only (non-greedy), with or without attributes.
        'title' => '/<title\b[^>]*>(.*?)<\/title>/is',
        // <meta name="description" content="...">, self-closing or not.
        'descriptions' => '/<meta\s+name="description"\s+content="(.*?)"\s*\/?>/is',
        // charset=... as found in <meta> tags, with optional quoting.
        'charset' => '/charset=["\']?([\w-]+)/i',
    ];

    /** @var string|null URL the current $content was downloaded from. */
    private $fetchedUrl;

    /**
     * @param string $url URL of the page to download.
     */
    public function __construct($url)
    {
        $this->url = $url;
        $this->useragent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36';
    }

    /**
     * Downloads $url and stores the response body in $content.
     *
     * Also records the HTTP status code in $status. On a transport failure
     * $content is set to false and the cURL message is kept in $error.
     *
     * @return bool true when a response body was received, false otherwise.
     */
    public function getData()
    {
        $this->error = null;
        $this->status = null;
        $this->fetchedUrl = $this->url;

        $curl = curl_init();
        if ($curl === false) {
            $this->content = false;
            $this->error = 'Unable to initialise a cURL handle';

            return false;
        }

        curl_setopt_array($curl, [
            CURLOPT_URL => $this->url,
            CURLOPT_USERAGENT => $this->useragent,
            // Return the body as a string instead of printing it.
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_CONNECTTIMEOUT => 10,
            // AUTOREFERER only has an effect when redirects are followed, so
            // follow them (bounded) to reach the final page.
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_AUTOREFERER => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_0,
            // NOTE(security): peer certificate verification is disabled, as in
            // the original code. Enable it when a CA bundle is available.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);

        // curl_exec() signals failure by returning false; it does not throw.
        $body = curl_exec($curl);
        if ($body === false) {
            $this->error = curl_error($curl);
        } else {
            $this->status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
        }
        curl_close($curl);

        $this->content = $body;

        return $body !== false;
    }

    /**
     * Extracts the charset declared in the downloaded page.
     *
     * Works on the already downloaded $content; it does not fetch anything.
     *
     * @return string|null Charset name, or null when there is no content or
     *                     no charset declaration was found.
     */
    public function getEncoding()
    {
        if (!$this->content) {
            return null;
        }

        if (preg_match($this->pattern_key['charset'], $this->content, $match) !== 1) {
            return null;
        }

        $this->encoding = $match[1];

        return $this->encoding;
    }

    /**
     * Returns the text of the page's first <title> element.
     *
     * Downloads the page first when no usable content is available yet, or
     * when $url was changed since the last download.
     *
     * @return string|null Title with surrounding whitespace removed, or null
     *                     when the page could not be fetched or has no title.
     */
    public function getTitle()
    {
        $stale = $this->fetchedUrl !== null && $this->fetchedUrl !== $this->url;
        if (!is_string($this->content) || $stale) {
            $this->getData();
        }

        if (!is_string($this->content) || $this->content === '') {
            return null;
        }

        if (preg_match($this->pattern_key['title'], $this->content, $match) !== 1) {
            return null;
        }

        $this->title = trim($match[1]);

        return $this->title;
    }
}

// Demo: fetch a manual page and print its title.
$url = 'http://php.net/manual/en/function.curl-setopt.php';
$spider = new PhpSpiders($url);
$spider->getData();
print_r($spider->getTitle());
php网页爬虫-简单的类
微信公众号
手机浏览(小程序)