小爬虫实验

最近在学 Python,一直想用 Python 写个爬虫,趁着这个周末没事就来玩玩吧。

话说最近股市跌宕起伏,那就想着爬点股市信息来好了。

实验脚本目的:抓取所有股票信息的行业评级以及风险评估。

下面就贴代码了:


# coding: utf-8
"""Scrape per-stock valuation and risk ratings from icaifu.com into SQLite.

Workflow (see ``main``): recreate the ``stockDoctora`` table, collect stock
codes from the paginated market listing, then fetch each stock's page and
store one row per stock.
"""
import re
import sqlite3
from contextlib import closing

import httplib2
from bs4 import BeautifulSoup

DB_PATH = 'stockDoctora.db'
CACHE_DIR = '.cache'

# Request headers shared by every HTTP call.
HEADERS = {
    'contentType': 'text/html;charset=UTF-8',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/44.0.2403.4 Safari/537.36'),
    'Referer': 'http://www.icaifu.com/',
}

# Column order used for both the dict produced by _parseStock and the INSERT.
_STOCK_COLUMNS = ('stockName', 'stockCode', 'nowPrice', 'upDown',
                  'lowPrice', 'highPrice', 'guzhi', 'fengx')

# Placeholders keep scraped text out of the SQL statement itself.
_INSERT_SQL = ('INSERT INTO stockDoctora (' + ','.join(_STOCK_COLUMNS) + ') '
               'VALUES (' + ','.join('?' * len(_STOCK_COLUMNS)) + ')')

_CREATE_TABLE_SQL = '''CREATE TABLE stockDoctora (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    stockName varchar(20) NULL,
    stockCode varchar(20) NULL,
    nowPrice varchar(20) NULL,
    upDown varchar(20) NULL,
    lowPrice varchar(20) NULL,
    highPrice varchar(20) NULL,
    guzhi varchar(20) NULL,
    fengx varchar(20) NULL
)'''

# Text that starts with a sign followed by digits/dots (the price span).
_SIGNED_NUMBER_RE = re.compile(r'^[+-][\d.]*')
# Link to a stock quote page; group 1 is the stock code.
_QUOTE_HREF_RE = re.compile(r'/stock/quotes/([a-z0-9]*)\.shtml')


def _parseStock(soup, stockCode):
    """Extract the fields stored for one stock from its parsed page.

    Returns a dict keyed by the names in ``_STOCK_COLUMNS``; all values are
    plain strings. Raises AttributeError / IndexError / TypeError when the
    page does not contain the expected elements.
    """
    # Whitespace-separated tokens of the <span> following the valuation label.
    guzhi = str(
        soup.find('div', {'class': 'picL_nav'})
        .find_next(text=re.compile('估值结果显示'))
        .find_next_sibling('span')
    ).split()

    riskSpan = soup.find(text=re.compile('潜在风险')).find_next_sibling('span')
    fengx = str(list(riskSpan)[0]).split()[0] if riskSpan is not None else '--'

    # NOTE(review): the class names 'up_l_nav' (letter l) and 'up_2_nav'
    # (digit 2) are kept exactly as originally written -- confirm against
    # the live page markup.
    priceSpan = soup.find('div', 'up_2_nav').find_next(
        'span', text=_SIGNED_NUMBER_RE)
    # Tokens 0 and 2 of the first text containing '-' are stored as the
    # low and high price.
    priceRange = str(
        soup.find('div', 'picL_top').find_next(text=re.compile('-'))
    ).split()

    return {
        'stockName': str(list(soup.find('div', 'up_l_nav').find_next('a'))[0]),
        'stockCode': stockCode,
        'nowPrice': str(list(priceSpan)[0]),
        'upDown': str(list(priceSpan.find_next_sibling('span'))[0]).split()[0],
        'lowPrice': priceRange[0],
        'highPrice': priceRange[2],
        # Index 2 is only valid when there are at least three tokens.
        'guzhi': guzhi[2] if len(guzhi) > 2 else '--',
        'fengx': fengx,
    }


def _saveStock(stock):
    """Insert one parsed stock dict as a row of the stockDoctora table."""
    with closing(sqlite3.connect(DB_PATH)) as conn:
        with conn:  # commit on success, roll back on error
            conn.execute(_INSERT_SQL, [stock[col] for col in _STOCK_COLUMNS])


def getPrice(stockCode):
    """Fetch the page for ``stockCode``, print the parsed record and store it.

    ``stockCode`` is the code as it appears in the site's URLs (the values
    returned by ``getCode``).
    """
    url = 'http://www.icaifu.com/stock/doctora/' + stockCode + '.shtml'
    http = httplib2.Http(CACHE_DIR)
    _response, content = http.request(url, 'GET', headers=HEADERS)
    # NOTE(review): no parser is named, so bs4 chooses whichever one is
    # installed; left as-is so parse results do not change.
    soup = BeautifulSoup(content.decode('utf-8'))
    stock = _parseStock(soup, stockCode)
    print(stock)
    _saveStock(stock)


def getCode(where, totalPage):
    """Collect the distinct stock codes linked from the market listing pages.

    Pages 1..``totalPage`` of the listing selected by ``where`` (a string
    inserted into the URL) are fetched in order. Fetching stops at the first
    response whose status is not 200; codes gathered so far are returned.
    The returned list has no duplicates and no defined order.
    """
    codes = set()
    http = httplib2.Http(CACHE_DIR)
    for page in range(1, totalPage + 1):
        url = ('http://www.icaifu.com/stock/marketcentera-' + where +
               '-0-0-0-0-0----------4--' + str(page) + '.shtml')
        response, content = http.request(url, 'GET', headers=HEADERS)
        if response.status != 200:
            break
        soup = BeautifulSoup(content.decode('utf-8'))
        for link in soup.find_all('a', href=_QUOTE_HREF_RE):
            # find_all already filtered on this pattern, so search() matches.
            codes.add(_QUOTE_HREF_RE.search(link['href']).group(1))
        print('Page ' + str(page) + ' is ok!')
    return list(codes)


def initdb():
    """Drop the stockDoctora table if it exists and create it empty."""
    with closing(sqlite3.connect(DB_PATH)) as conn:
        with conn:
            conn.execute('DROP TABLE IF EXISTS stockDoctora')
            conn.execute(_CREATE_TABLE_SQL)


def getResult(sql, params=()):
    """Run ``sql`` against the database and return all rows as a list.

    ``params`` optionally supplies values for ``?`` placeholders in ``sql``.
    """
    with closing(sqlite3.connect(DB_PATH)) as conn:
        return conn.execute(sql, params).fetchall()


def main():
    """Rebuild the table, then scrape and store every listed stock."""
    initdb()
    gc = getCode('0', 30)
    print('Total ' + str(len(gc)) + ' stock is get!')
    for code in gc:
        try:
            getPrice(code)
        except (AttributeError, IndexError, TypeError) as err:
            # A page lacking an expected element should not abort the crawl.
            print('Skip ' + code + ': ' + str(err))


if __name__ == '__main__':
    # To inspect stored rows afterwards, call getResult() with a SELECT
    # statement and print the rows it returns.
    main()

难点在于解析 HTML 页面时,如何精确获取自己想要的内容,这一点比较费劲。还好在网上了解到 BeautifulSoup 模块的用法,事情变得容易多了。

结果存在sqlite数据库中,以下是结果:

sql语句:

SELECT stockName,stockCode,guzhi,nowPrice,upDown,highPrice,fengx FROM stockDoctora WHERE guzhi="相对低估" AND highPrice <> "--"

执行结果:

('特锐德', 'sz300001', '相对低估', '+20.78', '-10.0%', '25.5', '高')
('福星晓程', 'sz300139', '相对低估', '+21.29', '-9.98%', '46.8', '偏低')
('闽发铝业', 'sz002578', '相对低估', '+11.55', '-9.98%', '16.88', '偏高')
('长城汽车', 'sh601633', '相对低估', '+42.76', '-7.08%', '57.0', '偏高')
('拓日新能', 'sz002218', '相对低估', '+12.6', '-10.0%', '35.83', '偏高')
('键桥通讯', 'sz002316', '相对低估', '+11.01', '+', '16.96', '偏低')
('康缘药业', 'sh600557', '相对低估', '+27.64', '-10.0%', '34.46', '中')
('中原内配', 'sz002448', '相对低估', '+14.52', '-9.98%', '19.72', '偏高')
('长高集团', 'sz002452', '相对低估', '+10.99', '-9.99%', '20.62', '中')
('云煤能源', 'sh600792', '相对低估', '+7.96', '-9.95%', '12.76', '高')
('海螺水泥', 'sh600585', '相对低估', '+20.28', '-8.4%', '24.62', '偏高')
('风范股份', 'sh601700', '相对低估', '+11.42', '-10.01%', '23.7', '偏高')
('中环股份', 'sz002129', '相对低估', '+22.36', '-9.98%', '30.14', '偏低')
('恒瑞医药', 'sh600276', '相对低估', '+40.05', '-5.83%', '48.83', '偏低')
('江苏旷达', 'sz002516', '相对低估', '+13.95', '-10.0%', '20.46', '偏低')
('开山股份', 'sz300257', '相对低估', '+25.25', '-10.01%', '54.44', '偏低')
('江山股份', 'sh600389', '相对低估', '+33.4', '-10.0%', '119.36', '偏低')
('三聚环保', 'sz300072', '相对低估', '+30.43', '-8.97%', '50.38', '偏低')
('龙源技术', 'sz300105', '相对低估', '+16.12', '-9.99%', '20.85', '中')
('华峰氨纶', 'sz002064', '相对低估', '+9.3', '-9.97%', '36.79', '偏高')
('齐翔腾达', 'sz002408', '相对低估', '+17.22', '-9.98%', '25.34', '中')
('康美药业', 'sh600518', '相对低估', '+17.9', '-8.21%', '28.83', '中')
('长安汽车', 'sz000625', '相对低估', '+19.24', '-8.07%', '34.53', '中')
('银禧科技', 'sz300221', '相对低估', '+13.91', '-9.97%', '39.68', '中')
('齐星铁塔', 'sz002359', '相对低估', '+6.38', '+', '9.12', '偏低')
('潞安环能', 'sh601699', '相对低估', '+9.41', '-9.95%', '13.09', '高')
('恒泰艾普', 'sz300157', '相对低估', '+14.45', '-10.02%', '27.3', '偏高')
('老板电器', 'sz002508', '相对低估', '+37.35', '-10.0%', '46.21', '中')
('天士力', 'sh600535', '相对低估', '+44.87', '-9.99%', '59.48', '偏低')
('浙报传媒', 'sh600633', '相对低估', '+20.16', '-10.0%', '24.64', '中')
('伊利股份', 'sh600887', '相对低估', '+17.62', '-5.98%', '28.2', '中')
('省广股份', 'sz002400', '相对低估', '+25.52', '-9.98%', '30.12', '偏低')
('金力泰', 'sz300225', '相对低估', '+9.5', '-10.04%', '12.49', '中')
('庞大集团', 'sh601258', '相对低估', '+5.61', '-9.95%', '16.56', '高')
('达实智能', 'sz002421', '相对低估', '+21.65', '-10.02%', '37.39', '偏低')
('奥克股份', 'sz300082', '相对低估', '+10.29', '-9.97%', '17.49', '偏低')
('置信电气', 'sh600517', '相对低估', '+16.21', '-9.99%', '30.07', '中')
('恒邦股份', 'sz002237', '相对低估', '+12.02', '-10.03%', '19.22', '中')
('中银绒业', 'sz000982', '相对低估', '+4.64', '-2.93%', '26.26', '低')
('世纪瑞尔', 'sz300150', '相对低估', '+13.46', '-9.97%', '15.14', '中')
('神州泰岳', 'sz300002', '相对低估', '+16.47', '-10.0%', '19.9', '中')
('海立美达', 'sz002537', '相对低估', '+16.07', '-9.97%', '28.61', '偏低')
('华峰超纤', 'sz300180', '相对低估', '+14.56', '-10.01%', '25.29', '偏低')
('立讯精密', 'sz002475', '相对低估', '+32.0', '-9.58%', '59.37', '偏高')
('联建光电', 'sz300269', '相对低估', '+32.01', '-10.01%', '50.67', '偏低')
('中国太保', 'sh601601', '相对低估', '+28.06', '-5.36%', '36.47', '高')
('格林美', 'sz002340', '相对低估', '+14.44', '-9.98%', '20.46', '偏高')
('峨眉山A', 'sz000888', '相对低估', '+14.05', '-9.99%', '23.78', '偏低')
('华夏幸福', 'sh600340', '相对低估', '+26.99', '-10.0%', '42.63', '偏高')
('东华科技', 'sz002140', '相对低估', '+22.64', '-9.98%', '28.95', '中')
('雅化集团', 'sz002497', '相对低估', '+8.6', '-10.04%', '16.97', '偏高')
('上海家化', 'sh600315', '相对低估', '+42.41', '-9.71%', '61.11', '中')
('*ST乐电', 'sh600644', '相对低估', '+11.76', '-5.01%', '16.45', '偏低')
('德赛电池', 'sz000049', '相对低估', '+43.06', '-9.99%', '63.53', '偏低')
('北京利尔', 'sz002392', '相对低估', '+9.32', '-9.95%', '17.82', '中')
('金利科技', 'sz002464', '相对低估', '+16.14', '-1.22%', '29.47', '中')
('新纶科技', 'sz002341', '相对低估', '+20.53', '-10.0%', '26.46', '偏低')
('白云山', 'sh600332', '相对低估', '+33.7', '-9.94%', '47.92', '偏低')
('华泽钴镍', 'sz000693', '相对低估', '+22.86', '-10.0%', '29.11', '偏高')
('三星电气', 'sh601567', '相对低估', '+14.38', '-10.01%', '16.4', '中')
('湖北宜化', 'sz000422', '相对低估', '+9.41', '-9.95%', '22.72', '偏高')
('欧菲光', 'sz002456', '相对低估', '+34.08', '-10.01%', '43.82', '中')
('抚顺特钢', 'sh600399', '相对低估', '+12.56', '-10.03%', '16.75', '偏高')
('东方雨虹', 'sz002271', '相对低估', '+26.53', '-2.75%', '34.33', '中')
('欣旺达', 'sz300207', '相对低估', '+20.02', '-8.92%', '55.2', '中')
('锌业股份', 'sz000751', '相对低估', '+9.29', '-9.81%', '21.97', '中')
('国新能源', 'sh600617', '相对低估', '+19.12', '-9.98%', '35.0', '偏低')
('银江股份', 'sz300020', '相对低估', '+28.69', '-10.01%', '35.27', '偏低')
('爱康科技', 'sz002610', '相对低估', '+14.5', '-9.99%', '18.94', '中')
('铁岭新城', 'sz000809', '相对低估', '+12.11', '-10.03%', '69.35', '中')
('承德露露', 'sz000848', '相对低估', '+18.16', '-10.01%', '49.29', '中')
('安诺其', 'sz300067', '相对低估', '+11.83', '-9.97%', '27.89', '中')
('沧州明珠', 'sz002108', '相对低估', '+13.83', '-8.77%', '17.08', '中')
('康得新', 'sz002450', '相对低估', '+30.6', '-10.0%', '45.71', '中')
('亿帆鑫富', 'sz002019', '相对低估', '+35.61', '-10.01%', '86.57', '偏低')
('云南旅游', 'sz002059', '相对低估', '+11.57', '-9.96%', '15.94', '中')
('退市长油', 'sh600087', '相对低估', '+0.83', '-100.0%', '1.59', '低')
('华邦颖泰', 'sz002004', '相对低估', '+13.5', '-10.0%', '26.57', '中')
('龙净环保', 'sh600388', '相对低估', '+19.12', '-9.98%', '34.89', '中')
('厦门钨业', 'sh600549', '相对低估', '+25.52', '-9.98%', '32.16', '偏高')
('江粉磁材', 'sz002600', '相对低估', '+9.86', '-9.95%', '33.6', '中')
('金种子酒', 'sh600199', '相对低估', '+12.77', '-9.82%', '16.31', '偏高')
('川投能源', 'sh600674', '相对低估', '+13.21', '-9.33%', '38.27', '高')
('东富龙', 'sz300171', '相对低估', '+29.68', '-10.01%', '39.27', '高')
('浙江医药', 'sh600216', '相对低估', '+13.24', '-9.81%', '17.33', '中')
('浪潮信息', 'sz000977', '相对低估', '+30.72', '-9.99%', '49.6', '中')
('章源钨业', 'sz002378', '相对低估', '+17.97', '-10.02%', '23.93', '偏高')
('兴森科技', 'sz002436', '相对低估', '+19.07', '-10.0%', '40.44', '中')