Python 爬取单个网页所需要加载的URL地址和CSS、JS文件地址

直接上代码：
以下是脱敏后自用的 Python 采集代码。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:Andy
@file:xxx.py
@time:下午05:50
@desc:采集的文章数据进博客
"""
import os
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
from hashlib import md5
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import RequestException

# Default request headers (a desktop Chrome User-Agent); used by get_content().
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def get_content():
    """Yield the attachment links found on the hard-coded question page.

    The page is fetched with the module-level ``headers``, the paperclip icon
    markup is stripped, and every ``<a>`` element is scanned for an ``href``
    that starts with ``http://ask.apelearn.com/file``.

    Yields:
        dict: ``{'url': <href>, 'name': <inner markup of the link>}`` for
        each matching link.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the timeout expires).
    """
    url = 'http://ask.xxxx.com/question/xxxx'  # url
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=30)
    page = response.text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(page, 'lxml')
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    pattern = re.compile(
        r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)
    for anchor in soup.find_all('a'):
        for link_url, name in pattern.findall(str(anchor)):
            yield {
                'url': link_url,
                'name': name
            }


def mkdir(path):
    """Create directory *path* (and any missing parents) if it is absent.

    Surrounding whitespace and trailing backslashes are stripped from *path*
    before it is used. A status message is printed in either case.

    Args:
        path: Directory path to create.

    Returns:
        bool: True if this call created the directory, False if something
        already existed at that path.

    Raises:
        OSError: if creation fails and nothing exists at *path* afterwards.
    """
    path = path.strip().rstrip("\\")
    try:
        # Attempt the creation directly instead of checking first: a separate
        # exists() test can race with another process creating the directory.
        os.makedirs(path)
    except OSError:
        # Any failure on a path that exists is reported as "already there",
        # matching what the former exists() pre-check returned.
        if not os.path.exists(path):
            raise
        print(path+' 目录已存在')
        return False
    print(path+' 创建成功')
    return True

def getUrl(html):
    """Collect image and script source URLs from an HTML string.

    Only tags written exactly as ``<img src="..."`` or ``<script src="..."``
    (``src`` as the first attribute, double-quoted) are recognised.

    Args:
        html: HTML text to scan.

    Returns:
        list: every image source in document order, followed by every script
        source in document order.
    """
    image_sources = re.findall('<img src="(.*?)"', html, re.S)
    script_sources = re.findall('<script src="(.*?)"', html, re.S)
    return image_sources + script_sources

def getCssUrl(html):
    """Collect the ``href`` of every ``<link href="...">`` tag in *html*.

    Only tags whose first attribute is a double-quoted ``href`` are matched.

    Args:
        html: HTML text to scan.

    Returns:
        list: the matched ``href`` values in document order.
    """
    return re.findall('<link href="(.*?)"', html, re.S)

# 下载网页
def download_html(root_path, url):
    """Mirror one HTML page, and the assets it references, under *root_path*.

    The page is saved at the location given by the URL path, with ``.html``
    appended when the last path component does not already end in it. Every
    image/script URL found by ``getUrl`` is passed to
    ``download_commonimgjs`` and every link found by ``getCssUrl`` is passed
    to ``download_css``. Before saving, the original site's address is
    rewritten to the mirror's address.

    Args:
        root_path: Local directory that acts as the mirror root.
        url: Absolute URL of the page to download.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the timeout expires).
        OSError: if the target file or its directory cannot be written.
    """
    url_path = urlparse(url).path
    file_name = os.path.basename(url_path)
    # Strip only the trailing file name; str.replace() would also delete the
    # same text wherever it appears earlier in the path.
    url_dir = url_path[:len(url_path) - len(file_name)]
    if not file_name:
        # A URL ending in '/' has no file name; avoid a file called '.html'.
        file_name = 'index'
    if os.path.splitext(file_name)[1] != '.html':
        file_name += '.html'
    # Drop empty and dot segments so the page cannot land outside root_path.
    segments = [part for part in url_dir.split('/')
                if part not in ('', '.', '..')]
    local_dir = os.path.join(root_path, *segments)
    target = os.path.join(local_dir, file_name)

    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Named 'response' so the regex module 're' is not shadowed; the timeout
    # keeps an unresponsive server from blocking forever.
    response = requests.get(url, headers=request_headers, timeout=30)
    response.encoding = "utf-8"
    page = response.text  # decode the body once and reuse it

    for asset_url in getUrl(page):
        download_commonimgjs(root_path, asset_url)

    for css_url in getCssUrl(page):
        download_css(root_path, css_url)

    # Point absolute links at the mirror host instead of the original site.
    new_text = page.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    # The directory may not exist yet if no asset shared it.
    os.makedirs(local_dir, exist_ok=True)
    with open(target, "w", encoding="utf-8") as html_file:
        html_file.write(new_texts)

# Site that relative asset URLs are resolved against.
_ASSET_BASE_URL = "https://www.xxxxxx.com"
# Browser-like User-Agent sent with every asset download.
_ASSET_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
)


def _download_asset(root_path, url, *, required_suffix=None,
                    skip_existing=False, ignore_url_errors=False):
    """Download one asset into the mirror tree rooted at *root_path*.

    The local location mirrors the URL path, e.g. ``/images/a.png`` is saved
    as ``<root_path>/images/a.png``.

    Args:
        root_path: Local directory that acts as the mirror root.
        url: Absolute, root-relative (``/x``), protocol-relative (``//host/x``)
            or relative URL. Relative forms are resolved against the site
            root, since the referring page's URL is not known here.
        required_suffix: If given, silently ignore URLs whose file name does
            not have exactly this extension.
        skip_existing: If True, do nothing when the target file exists.
        ignore_url_errors: If True, report a failed download with a printed
            message instead of raising.

    Raises:
        urllib.error.URLError: on download failure, unless
            *ignore_url_errors* is True.
    """
    # urljoin leaves absolute URLs untouched and also handles '//host/path',
    # which plain string prefixing would turn into a bogus URL.
    asset_url = urllib.parse.urljoin(_ASSET_BASE_URL, url)
    url_path = urlparse(asset_url).path
    file_name = os.path.basename(url_path)
    if (required_suffix is not None
            and os.path.splitext(file_name)[1] != required_suffix):
        return
    if file_name in ('', '.', '..'):
        # Nothing that can be stored as a file (empty src, directory URL).
        print('skip (no file name in URL): ' + asset_url)
        return
    # Strip only the trailing file name; str.replace() would also delete the
    # same text wherever it appears earlier in the path.
    url_dir = url_path[:len(url_path) - len(file_name)]
    # Drop empty and dot segments so a crafted URL cannot escape root_path.
    segments = [part for part in url_dir.split('/')
                if part not in ('', '.', '..')]
    local_dir = os.path.join(root_path, *segments)
    target = os.path.join(local_dir, file_name)
    if skip_existing and os.path.isfile(target):
        return
    mkdir(local_dir)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', _ASSET_USER_AGENT)]
    # urlretrieve() only honours headers of the globally installed opener.
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlretrieve(asset_url, target)
    except urllib.error.URLError as exc:
        # HTTPError is a subclass of URLError, so HTTP status errors and
        # connection-level failures are both covered here.
        if not ignore_url_errors:
            raise
        print('error', asset_url, exc)


def download_commonimgjs(root_path, url):
    """Download an image or script unless it has already been mirrored.

    Failed downloads are reported with a printed message and otherwise
    ignored, so one broken asset does not abort the page mirror.

    Args:
        root_path: Local directory that acts as the mirror root.
        url: URL of the image or script.
    """
    _download_asset(root_path, url, skip_existing=True, ignore_url_errors=True)


def download_img(root_path, url):
    """Download an image, overwriting any existing local copy.

    Args:
        root_path: Local directory that acts as the mirror root.
        url: URL of the image.

    Raises:
        urllib.error.URLError: if the download fails.
    """
    _download_asset(root_path, url)


def download_js(root_path, url):
    """Download a script, overwriting any existing local copy.

    Args:
        root_path: Local directory that acts as the mirror root.
        url: URL of the script.

    Raises:
        urllib.error.URLError: if the download fails.
    """
    _download_asset(root_path, url)


def download_css(root_path, url):
    """Download a stylesheet unless it has already been mirrored.

    URLs whose file name does not end in ``.css`` are ignored, because
    ``<link href>`` also carries icons and other non-stylesheet resources.
    Failed downloads are reported with a printed message and otherwise
    ignored.

    Args:
        root_path: Local directory that acts as the mirror root.
        url: URL taken from a ``<link href="...">`` tag.
    """
    _download_asset(root_path, url, required_suffix='.css',
                    skip_existing=True, ignore_url_errors=True)

def get_xml():
    """Print every site URL that appears in the hard-coded sitemap.

    Raises:
        requests.exceptions.RequestException: if the sitemap cannot be
            fetched (including when the timeout expires).
    """
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # A timeout keeps an unresponsive server from blocking forever.
    res = requests.get(url, headers=request_headers, timeout=30)
    res.encoding = "utf-8"
    # Adapt this pattern to your own article URL format; it may differ.
    # The URL runs up to the next whitespace, quote or '<'. A trailing lazy
    # '\S*?' would always match the empty string and capture only the prefix.
    link_pattern = re.compile(r'''https://www\.xxxxxx\.com/[^\s<"']*''')
    for link in link_pattern.findall(res.text):
        print(link)


def main():
    """Script entry point; as written it only prints the sitemap URLs.

    The commented-out calls are alternative actions that can be enabled by
    hand; the values they would use are defined next to them.
    """
    # Local directory the mirror would be written to.
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"

    # get_content()

    # Single image download:
    # image_url = r'https://www.xxxxxx.com/news/xxxx-proje-20711498'
    image_url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # download_img(root_path, image_url)

    # Whole page download:
    # html_url = r'https://www.xxxxxx.com/3d-clear-led-dmx-ball'
    # download_html(root_path, html_url)

    # Single stylesheet download:
    css_url = r'https://www.xxxxxx.com/images/m184/black/style.css'
    # download_css(root_path, css_url)

    # URL parsing demo:
    # demo_url = 'https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404'
    # demo(demo_url)

    get_xml()


def demo(url):
    """Print the components of *url* that the download helpers rely on.

    Printed, one per line: scheme, hostname, path, the path's base name and
    that base name's extension.

    Args:
        url: URL string to take apart.
    """
    parts = urlparse(url)
    base_name = os.path.basename(parts.path)
    suffix = os.path.splitext(base_name)[1]
    print(parts.scheme)
    print(parts.hostname)
    print(f'a.file_path={parts.path}')
    print(f'file_name={base_name}')
    print(f'a.file_suffix={suffix}')





# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
猜你喜欢