1. Spider file
import re

import scrapy


class MeiShiSpider(scrapy.Spider):
    name = 'meishi'
    allowed_domains = ['baidu.com']
    start_urls = ['https://tieba.baidu.com/f?kw=美食']

    def parse(self, response):
        # Narrow the response down to the block of HTML that holds the thread list
        data = re.findall(r'(<ul id="thread_list".*?)<div class="thread_list_bottom clearfix">', response.text, re.S)[0]
        # Collect every video URL on the page
        video_urls = re.findall(r'data-video="(.*?)"', data)
        for url in video_urls:
            item = {}
            # Derive the file name (with extension) from the video URL
            item['name'] = url.split('?')[0].split('/')[-1]
            item['url'] = url  # video URL
            yield item
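Before running the spider, the two regular expressions can be sanity-checked against a saved copy of the page. The snippet below is a minimal sketch; sample_html is an invented stand-in for the real Tieba markup, not actual page source.

import re

# Invented fragment that mimics the structure the spider's regexes expect
sample_html = (
    '<ul id="thread_list">'
    '<li><div data-video="https://example.com/video/abc123.mp4?from=tieba"></div></li>'
    '</ul>'
    '<div class="thread_list_bottom clearfix">'
)

data = re.findall(r'(<ul id="thread_list".*?)<div class="thread_list_bottom clearfix">', sample_html, re.S)[0]
video_urls = re.findall(r'data-video="(.*?)"', data)
print(video_urls)  # ['https://example.com/video/abc123.mp4?from=tieba']
print(video_urls[0].split('?')[0].split('/')[-1])  # 'abc123.mp4'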
2. Pipeline file
import scrapy
from scrapy.pipelines.files import FilesPipeline


# Custom pipeline class that inherits from the built-in FilesPipeline
class VideoDownloadPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Request each video URL in turn; meta carries the file name along
        yield scrapy.Request(url=item['url'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        filename = request.meta['name']  # retrieve the video file name
        return filename  # name under which the downloaded video is saved

    def item_completed(self, results, item, info):
        return item
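As written, item_completed passes every item through even when a download fails. A stricter variant, sketched below under the assumption that items with no successful download should be discarded, inspects the (success, file_info_or_failure) tuples that FilesPipeline hands to item_completed and raises DropItem on failure.

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline


class VideoDownloadPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['url'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['name']

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples
        if not any(success for success, _ in results):
            raise DropItem(f"video download failed: {item['url']}")
        return item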
3. Settings file
from fake_useragent import UserAgent

USER_AGENT = UserAgent().random  # random User-Agent (evaluated once when settings load, so all requests share it)
LOG_LEVEL = 'WARNING'  # log warnings and above only
FILES_STORE = r'G:\视频'  # folder where the downloaded videos are saved
CONCURRENT_REQUESTS = 3  # limit the number of concurrent requests
DOWNLOAD_DELAY = 1  # delay (in seconds) between requests
COOKIES_ENABLED = False  # disable cookies
ITEM_PIPELINES = {
    'video.pipelines.VideoDownloadPipeline': 300,  # enable the custom pipeline
}
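With all three files in place, the spider is normally launched with scrapy crawl meishi from the project root. It can also be driven from a plain Python script, as in the sketch below; the module path video.spiders.meishi is an assumption based on the project and pipeline names above.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from video.spiders.meishi import MeiShiSpider  # assumed module path

# Run from the project root so get_project_settings() can find settings.py
process = CrawlerProcess(get_project_settings())
process.crawl(MeiShiSpider)
process.start()  # blocks until the crawl finishes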