小爬虫实验

最近在学python,一直想用python来做个爬虫,趁着这个周末没事就玩玩吧。

话说最近股市跌宕起伏,那就爬点股市信息来看看好了。

实验脚本目的:抓取所有股票的行业评级以及风险评估信息。

下面就贴代码了:

#coding:utf-8
import httplib2
import re
from bs4 import BeautifulSoup
import sqlite3

def getPrice(stockCode):
    """Scrape one stock's diagnosis page and store the result in SQLite.

    Downloads ``http://www.icaifu.com/stock/doctora/<stockCode>.shtml``,
    extracts the fields listed below, prints the resulting dict and inserts
    it as one row into the ``stockDoctora`` table of ``stockDoctora.db``.

    Stored fields: ``stockName``, ``stockCode``, ``nowPrice``, ``upDown``,
    ``lowPrice``, ``highPrice``, ``guzhi`` (third whitespace-separated token
    of the span following the "估值结果显示" text) and ``fengx`` (taken from
    the span following the "潜在风险" text). ``guzhi`` and ``fengx`` fall
    back to ``'--'`` when their span is missing or too short.

    Args:
        stockCode: Stock identifier as it appears in the site's URLs; it is
            concatenated into the request URL, so it must be a ``str``.

    Raises:
        AttributeError, IndexError, TypeError: The element lookups are
            chained without ``None`` checks, so a page lacking one of the
            expected elements fails here, before anything is written to
            the database.
    """
    url = 'http://www.icaifu.com/stock/doctora/' + stockCode + '.shtml'
    headers = {'contentType':'text/html;charset=UTF-8',
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.4 Safari/537.36',
                'Referer':'http://www.icaifu.com/'}
    http = httplib2.Http('.cache')
    response, content = http.request(url, 'GET', headers=headers)
    soup = BeautifulSoup(content.decode('utf-8'))

    # Valuation: the span's markup is stringified and split on whitespace;
    # the verdict is expected to be the third token.
    guzhi = str(soup.find('div', {'class': 'picL_nav'})
                .find_next(text=re.compile('估值结果显示'))
                .find_next_sibling('span')).split()
    fengx = soup.find(text=re.compile('潜在风险')).find_next_sibling('span')
    fengx = str(list(fengx)[0]).split()[0] if fengx is not None else '--'

    # Each of these lookups feeds two fields, so resolve them once.
    price_span = soup.find('div', 'up_2_nav').find_next(
        'span', text=re.compile(r'^[+-][\d.]*'))
    range_tokens = str(
        soup.find('div', 'picL_top').find_next(text=re.compile('-'))).split()

    stock = {'stockName': str(list(soup.find('div', 'up_l_nav').find_next('a'))[0]),
             'stockCode': stockCode,
             'nowPrice': str(list(price_span)[0]),
             'upDown': str(list(price_span.find_next_sibling('span'))[0]).split()[0],
             'lowPrice': range_tokens[0],
             'highPrice': range_tokens[2],
             # Index 2 is only valid with at least three tokens.
             'guzhi': guzhi[2] if len(guzhi) > 2 else '--',
             'fengx': fengx}
    print(stock)

    conn = sqlite3.connect('stockDoctora.db')
    try:
        cu = conn.cursor()
        # Bound parameters instead of string concatenation: scraped text may
        # contain quotes, which would otherwise break or alter the statement.
        cu.execute(
            'INSERT INTO `stockDoctora` (`stockName`,`stockCode`,`nowPrice`,'
            '`upDown`,`lowPrice`,`highPrice`,`guzhi`,`fengx`) '
            'VALUES (?,?,?,?,?,?,?,?)',
            (stock['stockName'], stock['stockCode'], stock['nowPrice'],
             stock['upDown'], stock['lowPrice'], stock['highPrice'],
             stock['guzhi'], stock['fengx']))
        conn.commit()
        cu.close()
    finally:
        # Release the database handle even if the insert fails.
        conn.close()
    
def getCode(where,totalPage):
    """Collect the stock codes linked from the site's market listing pages.

    Requests listing pages 1..``totalPage`` and gathers the code part of
    every ``/stock/quotes/<code>.shtml`` link found on them. Crawling stops
    at the first page that does not answer with HTTP 200, and whatever was
    collected up to that point is returned.

    Args:
        where: String inserted into the listing URL right after
            ``marketcentera-``. Its meaning is defined by the site; the
            caller in this file passes ``'0'``.
        totalPage: Number of listing pages to request. Values below 1
            request nothing.

    Returns:
        A list of unique stock codes in first-seen order.
    """
    if totalPage < 1:
        # Nothing to crawl; avoid setting up the HTTP client at all.
        return []
    headers = {'contentType':'text/html;charset=UTF-8',
               'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.4 Safari/537.36',
               'Referer':'http://www.icaifu.com/'}
    # The dot is escaped and at least one code character is required, so
    # only genuine quote links match and an empty code can never be captured.
    link_pattern = re.compile(r'/stock/quotes/([a-z0-9]+)\.shtml')
    codeList = []
    http = httplib2.Http('.cache')
    for x in range(1,totalPage+1):
        url = 'http://www.icaifu.com/stock/marketcentera-' + where + '-0-0-0-0-0----------4--' + str(x) + '.shtml'
        response, content = http.request(url, 'GET', headers=headers)
        if response.status != 200:
            break
        soup = BeautifulSoup(content.decode('utf-8'))
        # find_all already filtered on this pattern, so search() always hits.
        for link in soup.find_all('a', href=link_pattern):
            codeList.append(link_pattern.search(link['href']).group(1))
        print('Page ' + str(x) + ' is ok!')
    # De-duplicate while keeping a stable, first-seen crawl order.
    return list(dict.fromkeys(codeList))

def initdb():
    """(Re)create an empty ``stockDoctora`` table in ``stockDoctora.db``.

    An existing table of that name is dropped first, so rows stored by an
    earlier run are discarded. The database file is opened relative to the
    current working directory.
    """
    sql = '''CREATE TABLE `stockDoctora` (
    `id` INTEGER PRIMARY KEY AUTOINCREMENT,
    `stockName` varchar(20) NULL,
    `stockCode` varchar(20) NULL,
    `nowPrice` varchar(20) NULL,
    `upDown` varchar(20) NULL,
    `lowPrice` varchar(20) NULL,
    `highPrice` varchar(20) NULL,
    `guzhi` varchar(20) NULL,
    `fengx` varchar(20) NULL
    )'''
    conn = sqlite3.connect('stockDoctora.db')
    try:
        cu = conn.cursor()
        cu.execute('DROP TABLE IF EXISTS stockDoctora')
        cu.execute(sql)
        conn.commit()
        cu.close()
    finally:
        # Release the database handle even if a statement fails.
        conn.close()
    
def getResult(sql, params=()):
    """Run a query against ``stockDoctora.db`` and return all result rows.

    Args:
        sql: SQL statement to execute. It may contain ``?`` placeholders.
        params: Optional sequence of values bound to the placeholders in
            ``sql``. Defaults to no parameters, which executes ``sql``
            as given.

    Returns:
        A list of row tuples as produced by ``cursor.fetchall()``.
    """
    conn = sqlite3.connect('stockDoctora.db')
    try:
        cu = conn.cursor()
        cu.execute(sql, params)
        result = cu.fetchall()
        cu.close()
    finally:
        # Release the database handle even if the query fails.
        conn.close()
    return result

def main():
    """Rebuild the database and crawl every stock found on the listing pages.

    Recreates the ``stockDoctora`` table, collects stock codes from the
    first 30 listing pages of market ``'0'``, then scrapes and stores each
    stock in turn. A stock whose page cannot be parsed is reported and
    skipped so that it does not abort the rest of the crawl.
    """
    initdb()
    codes = getCode('0',30)
    print('Total ' + str(len(codes)) + ' stock is get!')
    for stockCode in codes:
        try:
            getPrice(stockCode)
        except (AttributeError, IndexError, TypeError) as err:
            # getPrice chains its element lookups without None checks, so a
            # page missing an expected element surfaces as one of these.
            print('Skip ' + stockCode + ': ' + repr(err))

if __name__ == '__main__':
    # The triple-quoted string below is a disabled snippet, not live code: as
    # a bare string expression it is evaluated and discarded. It holds an
    # example query (run through getResult) over rows already stored in the
    # database. Only main() is actually executed when the script is run.
    '''sql = 'SELECT stockName,stockCode,guzhi,nowPrice,upDown,highPrice,fengx FROM stockDoctora WHERE guzhi="相对低估" AND highPrice <> "--"'
    for i in getResult(sql):
        print(i)
    print(len(getResult(sql)))'''
    main()

难点在于解析HTML页面:怎么精确获取自己想要的内容比较费劲,还好在网上了解到BeautifulSoup模块的用法,事情就变得容易多了。

结果存在sqlite数据库中,以下是结果:

sql语句:

SELECT stockName,stockCode,guzhi,nowPrice,upDown,highPrice,fengx FROM stockDoctora WHERE guzhi="相对低估" AND highPrice <> "--"

执行结果:

('特锐德', 'sz300001', '相对低估', '+20.78', '-10.0%', '25.5', '高')
('福星晓程', 'sz300139', '相对低估', '+21.29', '-9.98%', '46.8', '偏低')
('闽发铝业', 'sz002578', '相对低估', '+11.55', '-9.98%', '16.88', '偏高')
('长城汽车', 'sh601633', '相对低估', '+42.76', '-7.08%', '57.0', '偏高')
('拓日新能', 'sz002218', '相对低估', '+12.6', '-10.0%', '35.83', '偏高')
('键桥通讯', 'sz002316', '相对低估', '+11.01', '+', '16.96', '偏低')
('康缘药业', 'sh600557', '相对低估', '+27.64', '-10.0%', '34.46', '中')
('中原内配', 'sz002448', '相对低估', '+14.52', '-9.98%', '19.72', '偏高')
('长高集团', 'sz002452', '相对低估', '+10.99', '-9.99%', '20.62', '中')
('云煤能源', 'sh600792', '相对低估', '+7.96', '-9.95%', '12.76', '高')
('海螺水泥', 'sh600585', '相对低估', '+20.28', '-8.4%', '24.62', '偏高')
('风范股份', 'sh601700', '相对低估', '+11.42', '-10.01%', '23.7', '偏高')
('中环股份', 'sz002129', '相对低估', '+22.36', '-9.98%', '30.14', '偏低')
('恒瑞医药', 'sh600276', '相对低估', '+40.05', '-5.83%', '48.83', '偏低')
('江苏旷达', 'sz002516', '相对低估', '+13.95', '-10.0%', '20.46', '偏低')
('开山股份', 'sz300257', '相对低估', '+25.25', '-10.01%', '54.44', '偏低')
('江山股份', 'sh600389', '相对低估', '+33.4', '-10.0%', '119.36', '偏低')
('三聚环保', 'sz300072', '相对低估', '+30.43', '-8.97%', '50.38', '偏低')
('龙源技术', 'sz300105', '相对低估', '+16.12', '-9.99%', '20.85', '中')
('华峰氨纶', 'sz002064', '相对低估', '+9.3', '-9.97%', '36.79', '偏高')
('齐翔腾达', 'sz002408', '相对低估', '+17.22', '-9.98%', '25.34', '中')
('康美药业', 'sh600518', '相对低估', '+17.9', '-8.21%', '28.83', '中')
('长安汽车', 'sz000625', '相对低估', '+19.24', '-8.07%', '34.53', '中')
('银禧科技', 'sz300221', '相对低估', '+13.91', '-9.97%', '39.68', '中')
('齐星铁塔', 'sz002359', '相对低估', '+6.38', '+', '9.12', '偏低')
('潞安环能', 'sh601699', '相对低估', '+9.41', '-9.95%', '13.09', '高')
('恒泰艾普', 'sz300157', '相对低估', '+14.45', '-10.02%', '27.3', '偏高')
('老板电器', 'sz002508', '相对低估', '+37.35', '-10.0%', '46.21', '中')
('天士力', 'sh600535', '相对低估', '+44.87', '-9.99%', '59.48', '偏低')
('浙报传媒', 'sh600633', '相对低估', '+20.16', '-10.0%', '24.64', '中')
('伊利股份', 'sh600887', '相对低估', '+17.62', '-5.98%', '28.2', '中')
('省广股份', 'sz002400', '相对低估', '+25.52', '-9.98%', '30.12', '偏低')
('金力泰', 'sz300225', '相对低估', '+9.5', '-10.04%', '12.49', '中')
('庞大集团', 'sh601258', '相对低估', '+5.61', '-9.95%', '16.56', '高')
('达实智能', 'sz002421', '相对低估', '+21.65', '-10.02%', '37.39', '偏低')
('奥克股份', 'sz300082', '相对低估', '+10.29', '-9.97%', '17.49', '偏低')
('置信电气', 'sh600517', '相对低估', '+16.21', '-9.99%', '30.07', '中')
('恒邦股份', 'sz002237', '相对低估', '+12.02', '-10.03%', '19.22', '中')
('中银绒业', 'sz000982', '相对低估', '+4.64', '-2.93%', '26.26', '低')
('世纪瑞尔', 'sz300150', '相对低估', '+13.46', '-9.97%', '15.14', '中')
('神州泰岳', 'sz300002', '相对低估', '+16.47', '-10.0%', '19.9', '中')
('海立美达', 'sz002537', '相对低估', '+16.07', '-9.97%', '28.61', '偏低')
('华峰超纤', 'sz300180', '相对低估', '+14.56', '-10.01%', '25.29', '偏低')
('立讯精密', 'sz002475', '相对低估', '+32.0', '-9.58%', '59.37', '偏高')
('联建光电', 'sz300269', '相对低估', '+32.01', '-10.01%', '50.67', '偏低')
('中国太保', 'sh601601', '相对低估', '+28.06', '-5.36%', '36.47', '高')
('格林美', 'sz002340', '相对低估', '+14.44', '-9.98%', '20.46', '偏高')
('峨眉山A', 'sz000888', '相对低估', '+14.05', '-9.99%', '23.78', '偏低')
('华夏幸福', 'sh600340', '相对低估', '+26.99', '-10.0%', '42.63', '偏高')
('东华科技', 'sz002140', '相对低估', '+22.64', '-9.98%', '28.95', '中')
('雅化集团', 'sz002497', '相对低估', '+8.6', '-10.04%', '16.97', '偏高')
('上海家化', 'sh600315', '相对低估', '+42.41', '-9.71%', '61.11', '中')
('*ST乐电', 'sh600644', '相对低估', '+11.76', '-5.01%', '16.45', '偏低')
('德赛电池', 'sz000049', '相对低估', '+43.06', '-9.99%', '63.53', '偏低')
('北京利尔', 'sz002392', '相对低估', '+9.32', '-9.95%', '17.82', '中')
('金利科技', 'sz002464', '相对低估', '+16.14', '-1.22%', '29.47', '中')
('新纶科技', 'sz002341', '相对低估', '+20.53', '-10.0%', '26.46', '偏低')
('白云山', 'sh600332', '相对低估', '+33.7', '-9.94%', '47.92', '偏低')
('华泽钴镍', 'sz000693', '相对低估', '+22.86', '-10.0%', '29.11', '偏高')
('三星电气', 'sh601567', '相对低估', '+14.38', '-10.01%', '16.4', '中')
('湖北宜化', 'sz000422', '相对低估', '+9.41', '-9.95%', '22.72', '偏高')
('欧菲光', 'sz002456', '相对低估', '+34.08', '-10.01%', '43.82', '中')
('抚顺特钢', 'sh600399', '相对低估', '+12.56', '-10.03%', '16.75', '偏高')
('东方雨虹', 'sz002271', '相对低估', '+26.53', '-2.75%', '34.33', '中')
('欣旺达', 'sz300207', '相对低估', '+20.02', '-8.92%', '55.2', '中')
('锌业股份', 'sz000751', '相对低估', '+9.29', '-9.81%', '21.97', '中')
('国新能源', 'sh600617', '相对低估', '+19.12', '-9.98%', '35.0', '偏低')
('银江股份', 'sz300020', '相对低估', '+28.69', '-10.01%', '35.27', '偏低')
('爱康科技', 'sz002610', '相对低估', '+14.5', '-9.99%', '18.94', '中')
('铁岭新城', 'sz000809', '相对低估', '+12.11', '-10.03%', '69.35', '中')
('承德露露', 'sz000848', '相对低估', '+18.16', '-10.01%', '49.29', '中')
('安诺其', 'sz300067', '相对低估', '+11.83', '-9.97%', '27.89', '中')
('沧州明珠', 'sz002108', '相对低估', '+13.83', '-8.77%', '17.08', '中')
('康得新', 'sz002450', '相对低估', '+30.6', '-10.0%', '45.71', '中')
('亿帆鑫富', 'sz002019', '相对低估', '+35.61', '-10.01%', '86.57', '偏低')
('云南旅游', 'sz002059', '相对低估', '+11.57', '-9.96%', '15.94', '中')
('退市长油', 'sh600087', '相对低估', '+0.83', '-100.0%', '1.59', '低')
('华邦颖泰', 'sz002004', '相对低估', '+13.5', '-10.0%', '26.57', '中')
('龙净环保', 'sh600388', '相对低估', '+19.12', '-9.98%', '34.89', '中')
('厦门钨业', 'sh600549', '相对低估', '+25.52', '-9.98%', '32.16', '偏高')
('江粉磁材', 'sz002600', '相对低估', '+9.86', '-9.95%', '33.6', '中')
('金种子酒', 'sh600199', '相对低估', '+12.77', '-9.82%', '16.31', '偏高')
('川投能源', 'sh600674', '相对低估', '+13.21', '-9.33%', '38.27', '高')
('东富龙', 'sz300171', '相对低估', '+29.68', '-10.01%', '39.27', '高')
('浙江医药', 'sh600216', '相对低估', '+13.24', '-9.81%', '17.33', '中')
('浪潮信息', 'sz000977', '相对低估', '+30.72', '-9.99%', '49.6', '中')
('章源钨业', 'sz002378', '相对低估', '+17.97', '-10.02%', '23.93', '偏高')
('兴森科技', 'sz002436', '相对低估', '+19.07', '-10.0%', '40.44', '中')

发表评论

电子邮件地址不会被公开。 必填项已用*标注