关键代码如下,下面这段代码已成功采集过数据。
运行方式:
scrapy crawl bch -o items.json
import scrapy
import re
from bs4 import BeautifulSoup
import js2xml
from lxml import etree
import requests
from bch.items import BchItem


class BchSpider(scrapy.Spider):
    """Scrape listing pages and yield one ``BchItem`` per linked detail page.

    Every ``<a class="video_img">`` link found on a listing page is fetched,
    and the item's ``title``, ``status`` and ``link`` fields are filled from
    the detail page's ``<title>`` and from an inline ``<script>`` in its body.

    The ``'xxx'`` strings are redacted URL placeholders carried over from the
    original snippet and must be replaced with the real site URLs.
    """

    name = "bch"
    allowed_domains = ["www.xxxx.cn"]
    # First listing page followed by pages 2..16 ('xxx<N>.html').
    start_urls = ['xxx'] + ['xxx' + str(i) + '.html' for i in range(2, 17)]

    # Position, among the <script> tags inside <body>, of the inline script
    # that carries the 'f' property read in parse(). This is tied to the
    # target site's markup and will need adjusting if the layout changes.
    player_script_index = 11
    # Seconds to wait for a detail page; without a timeout requests.get()
    # can block forever.
    detail_timeout = 10

    def parse(self, response):
        """Yield a ``BchItem`` for each detail link on a listing page.

        Args:
            response: the Scrapy response for one listing page.

        Yields:
            BchItem: one item per detail page that could be fully parsed.
            Detail pages that cannot be fetched, or that lack the expected
            elements, are logged and skipped so the remaining links are
            still processed.
        """
        # Alternative selector kept from the original (regex match on href):
        # response.xpath('//a[re:test(@href,"\xxx")]/@href').extract()
        links = response.xpath('//a[@class="video_img"]/@href').extract()
        self.logger.debug('found %d detail links on %s', len(links), response.url)

        for url in links:
            detail_url = 'xxx' + url
            # NOTE(review): requests.get() is a blocking call made from
            # inside a Scrapy callback; yielding scrapy.Request objects with
            # a second callback would let Scrapy schedule these fetches.
            try:
                res = requests.get(detail_url, timeout=self.detail_timeout)
            except requests.RequestException as exc:
                self.logger.warning('skipping %s: %s', detail_url, exc)
                continue
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, "lxml")

            titles = soup.select('title')
            scripts = soup.select("body script")
            if not titles or len(scripts) <= self.player_script_index:
                self.logger.warning(
                    'skipping %s: missing <title> or fewer than %d body scripts',
                    detail_url, self.player_script_index + 1)
                continue

            # .string is None for an external or empty <script> tag.
            src = scripts[self.player_script_index].string
            if not src:
                self.logger.warning('skipping %s: script has no inline code',
                                    detail_url)
                continue

            # js2xml.parse() returns an lxml tree, so it can be queried
            # directly instead of being serialized and parsed a second time.
            js_tree = js2xml.parse(src, debug=False)
            paths = js_tree.xpath("//property[@name = 'f']/string/text()")
            if not paths:
                self.logger.warning("skipping %s: no 'f' property in script",
                                    detail_url)
                continue

            page_title = titles[0].text.strip()
            item = BchItem()
            item['title'] = page_title
            # NOTE(review): 'status' receives the same text as 'title', as in
            # the original code; confirm whether a different source was meant.
            item['status'] = page_title
            item['link'] = 'xxx' + paths[0]
            yield item
参考:https://www.cnblogs.com/sthu/p/8319072.html