2012年8月14日 星期二

Python抓集保資料

使用Python透過解析網頁元素來取得個股集保資訊

#!/usr/bin/python
# -*- coding: utf-8 -*-

#---------------------------------------------
#   抓個股集保資料
#   Version : 0.1
#   Author : Amin white
#   Release Date : 2012-01-01
#   Python version : 2.7.2
#---------------------------------------------

#引用函式庫
import urllib, csv, datetime, os, codecs, time
from datetime import date
from sgmllib import SGMLParser

def GetTdccinfo(stockid, stockname, workdir='D:\\Stock_Concentration\\'):
    """Download TDCC query results for one stock and save them as a CSV file.

    For every month of the previous and the current calendar year, days
    1 through 14 are probed in order. Saturdays, Sundays, dates after today
    and 2011-05-02 are skipped. Every remaining date is queried through
    TdccData(); an empty result stops probing the rest of that month,
    otherwise one CSV row (date, stock id, stock name, 45 values) is written.

    Args:
        stockid: Stock code; used in the query URL and as the file name.
            Expected to be an ASCII byte string (Python 2 str).
        stockname: Stock name; ``.encode('utf8')`` is called on it, so it is
            expected to be a unicode string.
        workdir: Directory that receives ``<stockid>.csv``. It is created
            when missing. Defaults to the path the script originally
            hard-coded.

    Raises:
        IndexError: if a non-empty result holds fewer than 45 values.
    """
    # Bracket labels for the header row; each bracket owns three consecutive
    # columns, matching the 45 values read from every parsed page.
    brackets = [u'1-999', u'1,000-5,000', u'5,001-10,000', u'10,001-15,000',
                u'15,001-20,000', u'20,001-30,000', u'30,001-40,000',
                u'40,001-50,000', u'50,001-100,000', u'100,001-200,000',
                u'200,001-400,000', u'400,001-600,000', u'600,001-800,000',
                u'800,001-1,000,000', u'1,000,001以上']
    values_per_row = 3 * len(brackets)

    today = date.today()
    # The original script excludes this single date explicitly.
    skipped_day = date(2011, 5, 2)

    # Create the output directory on first use.
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    savefile = os.path.join(workdir, stockid + '.csv')

    print("寫入" + stockid + " " + stockname.encode('utf8') + " 集保資料到 " + savefile + " 開始...")

    startdate = ""
    enddate = ""
    DataCount = 0

    # The with-block guarantees the file is closed even when a download or a
    # row write raises part-way through.
    with open(savefile, 'wb') as writefile:
        # A UTF-8 byte-order mark is written first, ahead of the CSV content.
        writefile.write(codecs.BOM_UTF8)

        writer = csv.writer(writefile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

        # Header: three fixed columns, then every bracket label three times.
        header = [u'日期', u'股票代號', u'股票名稱']
        for label in brackets:
            header.extend([label] * 3)
        writer.writerow([title.encode('utf8') for title in header])

        for y in (today.year - 1, today.year):
            for m in range(1, 13):
                for d in range(1, 15):
                    current = date(y, m, d)

                    if current == skipped_day:
                        continue

                    # Skip weekends and dates that have not happened yet.
                    if current.isoweekday() >= 6 or current > today:
                        continue

                    datestring = "%d%02d%02d" % (y, m, d)

                    url = "http://www.tdcc.com.tw/smWeb/QryStock.jsp?SCA_DATE=" + datestring + "&SqlMethod=StockNo&StockNo=" + stockid + "&StockName=&sub=%ACd%B8%DF"
                    cell = TdccData(url)
                    if len(cell) == 0:
                        # No data for this date: give up on the rest of the month.
                        break

                    # Remember the first date that produced data.
                    if startdate == "":
                        startdate = datestring

                    row = [datestring.encode('utf8'), stockid.encode('utf8'), stockname.encode('utf8')]
                    # Indexing (not slicing) keeps the IndexError on short results.
                    row.extend(cell[i].encode('utf8') for i in range(values_per_row))
                    writer.writerow(row)
                    DataCount += 1

                    # Remember the most recent date that produced data.
                    enddate = datestring

    print(stockname.encode('utf8') + " 集保資料 " + startdate + '~' + enddate + "寫入 " + savefile + " 中,共完成 " + str(DataCount) + " 筆寫入\n")
    

def TdccData(URL):
    """Fetch one TDCC query page and return the values parsed from it.

    Args:
        URL: Address of the page to download.

    Returns:
        The list collected by ParseWebData (``.cell``), or an empty list
        when the server does not answer with HTTP status 200 or the page
        holds no matching cells.
    """
    webcode = urllib.urlopen(URL)
    try:
        # Without this guard the parser object would never be created and
        # the code below would fail with a NameError on a non-200 reply.
        if webcode.code != 200:
            return []

        Tdcc = ParseWebData()
        # parse() feeds the data and closes the parser itself.
        Tdcc.parse(webcode.read())
    finally:
        # Always release the HTTP response handle.
        webcode.close()

    return Tdcc.cell
        
    
class ParseWebData(SGMLParser):

    #初始化class等同constructor
    def __init__(self):
        SGMLParser.__init__(self)

    #初始化變數數值
    def reset(self):
        SGMLParser.reset(self)
        self.headname = False
        self.center = False
        self.right = False
        self.cell = []
        self.centercount = 0
        self.rightcount = 0

    #解析網頁    
    def parse(self,data):
        self.feed(data)
        self.close()

    #解析網頁標籤為td的內容    
    def start_td(self, attrs):
        for name,value in attrs:
            if name =='class' and value == 'wuc9':
                self.headname = True
                
            if name =='align' and value == 'center':
                self.center = True
                self.centercount+=1
 
            if name =='align' and value == 'right':
                self.right = True
                self.rightcount+=1

    #開始讀取集保資料到暫存list中            
    def handle_data(self, text):
        #print text
        if self.headname:
            #print text
            self.headname = False
            
        if self.center:
            if self.centercount == 1:
                #print "center : " + text
                self.center = False

            if self.centercount == 2:
                #print "center : " + text
                self.center = False
                self.centercount = 0

        if self.right :
            if self.rightcount == 1:
                #print "right : " + text
                self.cell.append(text.strip().replace(",", ""))
                self.right = False

            if self.rightcount == 2:
                #print "right : " + text
                self.cell.append(text.strip().replace(",", ""))
                self.right = False

            if self.rightcount == 3:
                #print "right : "  + text
                self.cell.append(text.strip())
                self.rightcount = 0
                self.right = False


class ParsestrModeWeb(SGMLParser):

    #初始化變數數值
    def reset(self):
        SGMLParser.reset(self)
        self.stockinfo = False
        self.cellno = 0
        self.stockid = []
        self.stockname = []
        self.cell1 = ""

    #解析網頁標籤為td的內容    
    def start_td(self, attrs):
        if len(attrs) == 1:
            if attrs[0][0] == 'bgcolor' and attrs[0][1] == '#FAFAD2':
                self.stockinfo = True
                self.cellno+=1
                self.cellno%=7

    #開始讀取股票代碼          
    def handle_data(self, text):
            if self.stockinfo:
                if self.cellno == 1:
                    self.cell1 = text.strip()
                elif self.cellno == 6 and text.strip() == 'ESVUFR':
                    data = self.cell1.strip().split('      \xa1@')
                    self.stockid.append(data[0].strip())                    
                    self.stockname.append(data[1].strip().decode('cp950'))                    
                elif self.cellno == 7:
                    self.cellno = 0
                self.stockinfo = False
                
def main():
    """Download the stock lists and fetch TDCC data for every stock found.

    Each value in ``stocktype`` is sent as the ``strMode`` query parameter
    of the listing page (per the original comment: listed and OTC stocks).
    A list whose request does not return HTTP status 200 is skipped.
    """
    stocktype = [2, 4]

    for mode in stocktype:
        url = "http://brk.twse.com.tw:8000/isin/C_public.jsp?strMode=" + str(mode)

        webcode = urllib.urlopen(url)
        try:
            # Skip this list on failure. Otherwise `stock` would be unbound
            # (first pass) or still hold the previous list (later passes).
            if webcode.code != 200:
                continue

            stock = ParsestrModeWeb()
            stock.feed(webcode.read())
            stock.close()
        finally:
            # Always release the HTTP response handle.
            webcode.close()

        # Fetch and store the data of every stock on this list.
        for stockid, stockname in zip(stock.stockid, stock.stockname):
            GetTdccinfo(stockid, stockname)
                    
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Python執行結果畫面

CSV檔案儲存內容如下