A Python Script for Site-Wide Link Validity Checking

A Python script that crawls an entire site and checks whether its links are still valid; it can be run from a Jenkins continuous-integration job to scan a site for broken links.

Required modules: BeautifulSoup (beautifulsoup4) and httplib2. The script targets Python 3 (it imports urllib.parse).
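Both can be installed with pip (for example: pip install beautifulsoup4 httplib2); the script below also attempts to install them automatically if the imports fail.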

If the site requires login, adjust the login request in getSession() and main() to match your own login API.

GitHub: https://github.com/TronGeek/CheckLinks-Python

The full script:

#!/usr/bin/env python
#coding=utf-8

#Todo: Page link validity check
#Author: 归根落叶
#Blog: http://this.ispenn.com

import os,sys
try:
    import httplib2  
except ImportError as e:
    os.system('pip install -U httplib2')
    import httplib2
try:
    from bs4 import BeautifulSoup
except ImportError as e:
    os.system('pip install -U beautifulsoup4')
    from bs4 import BeautifulSoup
from urllib.parse import urlencode  
import re
import logging

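#Log results to result/checkLinks.csv and mirror every message to the console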
os.makedirs(os.path.join(os.getcwd(),'result'),exist_ok=True)
log_file = os.path.join(os.getcwd(),'result/checkLinks.csv')
log_format = '[%(asctime)s] [%(levelname)s] %(message)s'
logging.basicConfig(format=log_format,filename=log_file,filemode='w',level=logging.DEBUG)
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
formatter = logging.Formatter(log_format)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

#Collect the lists of links on a page
def getURL(url,session=None):
    urlLinks = []
    imgLinks = []
    jsLinks = []
    cssLinks = []
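    #urlParse[0] is the scheme ('http:' or 'https:'); rootURL is scheme + host, e.g. 'http://www.example.com'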
    urlParse = url.split('/')
    rootURL = urlParse[0] + '//' + urlParse[2]
    if session is None:
        headers = {'contentType':'text/html;charset=UTF-8',
                   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.4 Safari/537.36'}
    else:
        headers = {'contentType':'text/html;charset=UTF-8',
                   'Cookie':'session=' + session,
                   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.4 Safari/537.36'}
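    #httplib2 caches HTTP responses under the local .cache directory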
    http = httplib2.Http('.cache')
    response, content = http.request(url, 'GET', headers=headers)
    if response.status == 200:
        soup = BeautifulSoup(content,'html.parser',from_encoding='utf-8')
        #Collect all anchor (page) links
        for links in soup.find_all('a'):
            if links is not None:
                link = links.get('href')
                if link is not None and link != '/' and not link.find('?t_=') > 0:
                    if re.search(r'^(\\\'|\\")',link):
                        link = link[2:-2]
                    if re.search(r'/$',link):
                        link = link[:-1]
                    if re.search(r'^(http|https)',link):
                        urlLinks.append(link)
                    elif re.search(r'^(//)',link):
                        link = urlParse[0] + link
                        urlLinks.append(link)
                    elif re.search(r'^/',link):
                        link = rootURL + link
                        urlLinks.append(link)
                    elif not re.search(r'^(javascript|mailto|#|\\)',link):
                        link = url + '/' + link
                        urlLinks.append(link)
        #Collect all image links
        for links in soup.find_all('img'):
            if links is not None:
                link = links.get('src')
                if link is not None and link != '/':
                    if re.search(r'^(\\\'|\\")',link):
                        link = link[2:-2]
                    if re.search(r'/$',link):
                        link = link[:-1]
                    if re.search(r'^(http|https)',link):
                        imgLinks.append(link)
                    elif re.search(r'^(//)',link):
                        link = urlParse[0] + link
                        imgLinks.append(link)
                    elif re.search(r'^/',link):
                        link = rootURL + link
                        imgLinks.append(link)
                    else:
                        link = url + '/' + link
                        imgLinks.append(link) 
        #Collect all JavaScript links
        for links in soup.find_all('script'):
            if links is not None:
                link = links.get('src')
                if link is not None and link != '/':
                    if re.search(r'^(\\\'|\\")',link):
                        link = link[2:-2]
                    if re.search(r'/$',link):
                        link = link[:-1]
                    if re.search(r'^(http|https)',link):
                        jsLinks.append(link)
                    elif re.search(r'^(//)',link):
                        link = urlParse[0] + link
                        jsLinks.append(link)
                    elif re.search(r'^/',link):
                        link = rootURL + link
                        jsLinks.append(link)
                    else:
                        link = url + '/' + link
                        jsLinks.append(link) 
        #Collect all CSS (<link>) references
        for links in soup.find_all('link'):
            if links is not None:
                link = links.get('href')
                if link is not None and link != '/':
                    if re.search(r'^(\\\'|\\")',link):
                        link = link[2:-2]
                    if re.search(r'/$',link):
                        link = link[:-1]
                    if re.search(r'^(http|https)',link):
                        cssLinks.append(link)
                    elif re.search(r'^(//)',link):
                        link = urlParse[0] + link
                        cssLinks.append(link)
                    elif re.search(r'^/',link):
                        link = rootURL + link
                        cssLinks.append(link)
                    else:
                        link = url + '/' + link
                        cssLinks.append(link) 
        return response.status,(urlLinks,imgLinks,jsLinks,cssLinks)
    return response.status,url

#Check a single link
def checkLink(url):
    headers = {'contentType':'text/html;charset=UTF-8',
               'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.4 Safari/537.36'}
    http = httplib2.Http('.cache')
    try:
        response, content = http.request(url, 'GET', headers=headers)
    except Exception as e:
        #Treat unreachable links as broken instead of aborting the scan
        logging.error('0, ' + url + ' (' + str(e) + ')')
        return 0,url
    if response.status == 200:
        logging.info(str(response.status) + ', ' + url)
    else:
        logging.error(str(response.status) + ', ' + url)
    return response.status,url

#Classify links and filter out off-site links
def classifyLinks(urlList,baseURL,checkList,checkedList,checkNext):
    for i in range(len(urlList)):
        if len(urlList[i]) > 0:
            for link in urlList[i]:
                if link.find(baseURL) > 0 and link not in checkList and link not in checkedList:
                    checkList.append(link)
                    if i == 0:
                        checkNext.append(link)
                    print(link)
    return checkList,checkedList,checkNext

#Obtain a login session
def getSession(url, postData):
    headers = {'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
               'X-Requested-With':'XMLHttpRequest',
               'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.4 Safari/537.36'}
    http = httplib2.Http('.cache')
    response, content = http.request(url, 'POST', urlencode(postData), headers=headers)
    if response.status == 200:
        match = re.search(r'true,"message":"(\w*)"',str(content))
        if match is not None:
            session = match.group(1)
            return response.status,session
        else:
            return 0,str(content)
    else:
        return response.status,str(content)

def main():
    homePage = 'http://www.example.com' #home page URL
    urlParse = homePage.split('/')
    baseURL = urlParse[2][len(urlParse[2].split('.')[0])+1:] #extract the root domain, e.g. example.com
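    #checkList: links to verify in the current layer
    #checkNext: pages whose links will make up the next layer
    #checkedList: links that have already been verified
    #errorLinks: (status, url) pairs for links that did not return 200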
    checkList = []
    checkedList = []
    checkNext = []
    errorLinks = []
    pageNum = 0
    ifLogin = 1 #login switch: 1 = log in before crawling, 0 = skip login
    session = None
    if ifLogin:
        url = homePage + '/admin/user/login'
        postData = {'username':'username@yunlai.cn',
                    'password':'password',
                    'remeber':'0'}
        status,session = getSession(url,postData)
        if status != 200:
            logging.error(session)
            session = None
    status,urlList = getURL(homePage,session)
    if status == 200:
        checkList,checkedList,checkNext = classifyLinks(urlList,baseURL,checkList,checkedList,checkNext)
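        #Breadth-first crawl: verify every link in the current layer, then fetch the next layer of pages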
        while True:
            if len(checkList) > 0:
                pageNum += 1
                logging.info('Checking layer ' + str(pageNum) + ' of links')
                for link in checkList:
                    status,url = checkLink(link)
                    if status != 200:
                        errorLinks.append((status,url))
                    checkedList.append(link)
                del checkList[:]
            if len(checkNext) > 0:
                checkNextN = []
                for link in checkNext:
                    status,urlList = getURL(link,session)
                    if status == 200:
                        checkList,checkedList,checkNextN = classifyLinks(urlList,baseURL,checkList,checkedList,checkNextN)
                checkNext = checkNextN
            else:
                logging.info('Link check finished: ' + str(len(checkedList)) + ' links checked, ' + str(len(errorLinks)) + ' broken links found')
                break
        for link in errorLinks:
            print(link)
    else:
        logging.error('[ ' + str(status) + ' ] ' + urlList)
    
if __name__ == '__main__':
    main()
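
Besides the full crawl in main(), the helper functions can also be used on their own for a quick spot check of a single page. A minimal sketch, assuming the script above is saved as checkLinks.py and the page needs no login:

from checkLinks import getURL, checkLink

status, links = getURL('http://www.example.com')
if status == 200:
    #getURL returns a tuple of lists: (urlLinks, imgLinks, jsLinks, cssLinks)
    for link in links[0] + links[1] + links[2] + links[3]:
        checkLink(link)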

2 replies to "A Python Script for Site-Wide Link Validity Checking"

  1. Python 2.7.8
    Traceback (most recent call last):
    File "checkLinks.py", line 21, in
    from urllib.parse import urlencode
    ImportError: No module named parse
