Straight to the code.
This is the sanitized Python scraping script I use myself: it pulls article links out of a source page, mirrors pages together with their image/JS/CSS assets into a local directory tree, and can walk a sitemap for page URLs.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Andy
@file: xxx.py
@time: 05:50 PM
@desc: Scrape article data into the blog
"""
import os
import re
import time
import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import RequestException
from hashlib import md5
from urllib.parse import urlparse
import urllib.request
import urllib.error

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def get_content():
    url = 'http://ask.xxxx.com/question/xxxx'  # source URL
    response = requests.get(url, headers=headers).text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(response, 'lxml')
    # div = soup.select('#aw-mod-body ueditor-p-reset')
    pattern = re.compile(r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)
    p = soup.find_all('a')
    for item in p:
        # print(str(item))
        result = re.findall(pattern, str(item))
        if result:
            # print(result)
            for i in result:
                url, name = i
                # print(i)
                yield {
                    'url': url,
                    'name': name
                }


def mkdir(path):
    # Strip leading/trailing whitespace and any trailing backslash
    path = path.strip()
    path = path.rstrip("\\")
    # Check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        # Create the directory (and any missing parents)
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        # The directory already exists, nothing to do
        print(path + ' already exists')
        return False


def getUrl(html):
    # patterncss = '<link href="(.*?)"'
    patternjs = '<script src="(.*?)"'
    patternimg = '<img src="(.*?)"'
    # href = re.compile(patterncss, re.S).findall(html)
    href = re.compile(patternimg, re.S).findall(html)
    href += re.compile(patternjs, re.S).findall(html)
    return href


def getCssUrl(html):
    patterncss = '<link href="(.*?)"'
    href = re.compile(patterncss, re.S).findall(html)
    return href


# Download a page, rewrite its domain references, and fetch its static assets
def download_html(root_path, url):
    a = urlparse(url)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if file_suffix != '.html':
        file_name_real = file_name + '.html'
    else:
        file_name_real = file_name
    file_path_real = file_path.replace(file_name, '')
    file_path_reals = file_path_real.replace('/', "\\")
    all_file_path_real = root_path + file_path_reals + file_name_real
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Named "resp" rather than "re" so it does not shadow the regex module
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    itemurl = getUrl(resp.text)
    for item1 in itemurl:
        download_commonimgjs(root_path, item1)
    itemcssurl = getCssUrl(resp.text)
    for item2 in itemcssurl:
        download_css(root_path, item2)
    new_text = resp.text.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    # Make sure the target directory exists before writing the page
    mkdir(root_path + file_path_reals)
    with open(all_file_path_real, "w+", encoding="utf-8") as html_file:
        html_file.write(new_texts)


def download_commonimgjs(root_path, url):
    # Resolve site-relative paths against the source domain
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Skip files that have already been downloaded
    if os.path.isfile(newmkpath + file_name):
        return
    # Create the target directory
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error')


def download_img(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Create the target directory
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)


# JS assets are fetched exactly the same way as images; the original
# download_js was a line-for-line copy of download_img, so alias it
download_js = download_img


def download_css(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # Only handle stylesheets; the <link href> pattern also matches favicons etc.
    if file_suffix != '.css':
        return
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    # Create the target directory
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error')


def get_xml():
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Write the regex to match your own article URL format; yours may differ from
    # mine (note that the lazy \S*? matches as few characters as possible)
    r = re.compile(r'https://www.xxxxxx.com/\S*?')
    big = re.findall(r, res.text)
    for i in big:
        print(i)


def main():
    # get_content()
    # url = r'https://www.xxxxxx.com/news/xxxx-proje-20711498'
    url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # Local root directory to mirror into
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    # download_img(root_path, url)
    # htmlurl = r'https://www.xxxxxx.com/3d-clear-led-dmx-ball'
    # download_html(root_path, htmlurl)
    cssurl = r'https://www.xxxxxx.com/images/m184/black/style.css'
    # download_css(root_path, cssurl)
    # demourl = 'https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404'
    # demo(demourl)
    get_xml()


def demo(url):
    a = urlparse(url)
    file_path = a.path
    print(a.scheme)
    print(a.hostname)
    print('a.file_path=' + file_path)
    file_name = os.path.basename(file_path)
    print('file_name=' + file_name)
    _, file_suffix = os.path.splitext(file_name)
    print('a.file_suffix=' + file_suffix)


if __name__ == '__main__':
    main()
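For completeness, here is roughly how the sitemap walk and the page mirroring could be wired together into one driver. This is a minimal sketch, not part of the original script: mirror_site is a hypothetical helper, and it assumes get_xml is modified to return its matched URLs (i.e. return big instead of printing them).

def mirror_site(root_path):
    # Hypothetical driver (not in the original script): mirror every page
    # listed in the sitemap. Assumes get_xml() returns its URL list.
    for page_url in get_xml():
        try:
            download_html(root_path, page_url)
        except RequestException as err:
            # Skip pages that fail to download instead of aborting the run
            print('failed: ' + page_url + ' (' + str(err) + ')')
        time.sleep(1)  # throttle requests to be polite to the source server

# mirror_site("F:\\Project-cz\\shanhubei\\3cinno")

Both RequestException and time are already imported at the top of the script, so this drops in without extra imports.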