#!/usr/bin/python
# -*- coding: utf-8 -*-
#---------------------------------------------
# 抓個股集保資料
# Version : 0.1
# Author : Amin white
# Release Date : 2012-01-01
# Python version : 2.7.2
#---------------------------------------------
#引用函式庫
import urllib, csv, datetime, os, codecs, time
from datetime import date
from sgmllib import SGMLParser
def GetTdccinfo(stockid, stockname, workdir='D:\\Stock_Concentration\\'):
    """Download one stock's TDCC distribution data and save it as a CSV file.

    For every month of the previous and the current calendar year, the
    weekdays among the first 14 days are queried one by one through
    ``TdccData``.  Every date that yields data becomes one CSV row made of the
    date, the stock code, the stock name and the 45 scraped values (15
    share-count brackets with three values each).

    Args:
        stockid: Stock code (byte string); used in the query URL and as the
            CSV file name.
        stockname: Stock name (unicode); UTF-8 encoded for the console output
            and for the CSV.
        workdir: Directory that receives ``<stockid>.csv``.  It is created
            when missing.  Defaults to the location that used to be
            hard-coded.

    Returns:
        None.  Progress is printed and the CSV file is (re)written.
    """
    # Share-count brackets.  Every bracket owns three consecutive CSV
    # columns, which is why each label appears three times in the header.
    brackets = [
        u'1-999', u'1,000-5,000', u'5,001-10,000', u'10,001-15,000',
        u'15,001-20,000', u'20,001-30,000', u'30,001-40,000',
        u'40,001-50,000', u'50,001-100,000', u'100,001-200,000',
        u'200,001-400,000', u'400,001-600,000', u'600,001-800,000',
        u'800,001-1,000,000', u'1,000,001以上',
    ]
    values_per_bracket = 3
    expected_cells = values_per_bracket * len(brackets)  # 45

    today = date.today()

    # Create the output directory on first use.
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    savefile = os.path.join(workdir, stockid + '.csv')
    print("寫入" + stockid + " " + stockname.encode('utf8') + " 集保資料到 " + savefile + " 開始...")

    startdate = ""
    enddate = ""
    DataCount = 0

    # The ``with`` block guarantees the file is closed even when a download
    # or a write raises half-way through.
    with open(savefile, 'wb') as writefile:
        # Leading BOM so that spreadsheet tools detect the file as UTF-8.
        writefile.write(codecs.BOM_UTF8)
        writer = csv.writer(writefile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

        header = [u'日期', u'股票代號', u'股票名稱']
        for label in brackets:
            header.extend([label] * values_per_bracket)
        writer.writerow([title.encode('utf8') for title in header])

        for y in (today.year - 1, today.year):
            for m in range(1, 13):
                for d in range(1, 15):
                    # 2011-05-02 (a Monday) is excluded explicitly.
                    # NOTE(review): presumably a market holiday that would
                    # otherwise trigger the "no data" break below - confirm.
                    if (y, m, d) == (2011, 5, 2):
                        continue
                    day = date(y, m, d)
                    # Skip Saturdays, Sundays and dates later than today.
                    if day.isoweekday() >= 6 or day > today:
                        continue
                    datestring = "%d%02d%02d" % (y, m, d)
                    url = "http://www.tdcc.com.tw/smWeb/QryStock.jsp?SCA_DATE=" + datestring + "&SqlMethod=StockNo&StockNo=" + stockid + "&StockName=&sub=%ACd%B8%DF"
                    cell = TdccData(url)
                    # A weekday without any data ends the scan of this month
                    # (behaviour kept from the original code).
                    if not cell:
                        break
                    # An incomplete table cannot fill all columns; skip the
                    # date instead of failing with an IndexError.
                    if len(cell) < expected_cells:
                        print(stockid + " " + datestring + " 集保資料欄位不足(" + str(len(cell)) + "/" + str(expected_cells) + "),略過")
                        continue
                    # Remember the first date that produced a row.
                    if startdate == "":
                        startdate = datestring
                    row = [datestring.encode('utf8'), stockid.encode('utf8'), stockname.encode('utf8')]
                    row.extend(value.encode('utf8') for value in cell[:expected_cells])
                    writer.writerow(row)
                    DataCount += 1
                    # Remember the most recent date that produced a row.
                    enddate = datestring

    print(stockname.encode('utf8') + " 集保資料 " + startdate + '~' + enddate + "寫入 " + savefile + " 中,共完成 " + str(DataCount) + " 筆寫入\n")
def TdccData(URL):
    """Fetch one TDCC query page and return the values scraped from it.

    Args:
        URL: Complete query URL of the page to download.

    Returns:
        The list of cell strings collected by ``ParseWebData``.  The list is
        empty when the HTTP status is not 200 or when the page contains no
        matching cells; a list is returned on every path so that callers can
        safely apply ``len()`` or a truth test to the result.
    """
    webcode = urllib.urlopen(URL)
    try:
        if webcode.code != 200:
            return []
        html = webcode.read()
    finally:
        # Release the connection whether or not the download succeeded.
        webcode.close()

    Tdcc = ParseWebData()
    # parse() feeds the document and closes the parser itself.
    Tdcc.parse(html)
    return Tdcc.cell
class ParseWebData(SGMLParser):
#初始化class等同constructor
def __init__(self):
SGMLParser.__init__(self)
#初始化變數數值
def reset(self):
SGMLParser.reset(self)
self.headname = False
self.center = False
self.right = False
self.cell = []
self.centercount = 0
self.rightcount = 0
#解析網頁
def parse(self,data):
self.feed(data)
self.close()
#解析網頁標籤為td的內容
def start_td(self, attrs):
for name,value in attrs:
if name =='class' and value == 'wuc9':
self.headname = True
if name =='align' and value == 'center':
self.center = True
self.centercount+=1
if name =='align' and value == 'right':
self.right = True
self.rightcount+=1
#開始讀取集保資料到暫存list中
def handle_data(self, text):
#print text
if self.headname:
#print text
self.headname = False
if self.center:
if self.centercount == 1:
#print "center : " + text
self.center = False
if self.centercount == 2:
#print "center : " + text
self.center = False
self.centercount = 0
if self.right :
if self.rightcount == 1:
#print "right : " + text
self.cell.append(text.strip().replace(",", ""))
self.right = False
if self.rightcount == 2:
#print "right : " + text
self.cell.append(text.strip().replace(",", ""))
self.right = False
if self.rightcount == 3:
#print "right : " + text
self.cell.append(text.strip())
self.rightcount = 0
self.right = False
class ParsestrModeWeb(SGMLParser):
#初始化變數數值
def reset(self):
SGMLParser.reset(self)
self.stockinfo = False
self.cellno = 0
self.stockid = []
self.stockname = []
self.cell1 = ""
#解析網頁標籤為td的內容
def start_td(self, attrs):
if len(attrs) == 1:
if attrs[0][0] == 'bgcolor' and attrs[0][1] == '#FAFAD2':
self.stockinfo = True
self.cellno+=1
self.cellno%=7
#開始讀取股票代碼
def handle_data(self, text):
if self.stockinfo:
if self.cellno == 1:
self.cell1 = text.strip()
elif self.cellno == 6 and text.strip() == 'ESVUFR':
data = self.cell1.strip().split(' \xa1@')
self.stockid.append(data[0].strip())
self.stockname.append(data[1].strip().decode('cp950'))
elif self.cellno == 7:
self.cellno = 0
self.stockinfo = False
def main():
    """Download the TDCC data of every stock found on the listing pages.

    The stock-code listing is fetched once per ``strMode`` value (2 for
    listed stocks, 4 for OTC stocks), parsed with ``ParsestrModeWeb``, and
    ``GetTdccinfo`` is then called for every stock found.  A listing page
    that does not answer with HTTP status 200 is skipped.
    """
    for mode in (2, 4):
        url = "http://brk.twse.com.tw:8000/isin/C_public.jsp?strMode=" + str(mode)
        webcode = urllib.urlopen(url)
        html = None
        try:
            if webcode.code == 200:
                html = webcode.read()
        finally:
            # Release the connection before the long per-stock downloads.
            webcode.close()
        if html is None:
            continue

        stock = ParsestrModeWeb()
        stock.feed(html)
        stock.close()

        # stockid and stockname are filled pairwise by the parser.
        for stockid, stockname in zip(stock.stockid, stock.stockname):
            GetTdccinfo(stockid, stockname)
# Run the download only when this file is executed as a script.
if __name__ == "__main__":
    main()
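# (Caption from the original article) Screenshot of the script's console output.
# (Caption from the original article) The contents of the saved CSV file are shown below.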