2012年10月5日 星期五

Python 抓取上櫃交易明細(1)

使用 Python 以解析網頁的方式，抓取上櫃每日交易明細，再將資料存入 CSV 檔。
#!/usr/bin/python
# -*- coding: utf-8 -*-
#---------------------------------------------
#   抓上櫃交易明細
#   Version : 1.1
#   Author : Amin white
#   Release Date : 2012-06-27
#   Python version : 2.7.2
#---------------------------------------------

import csv, os, datetime, urllib, codecs, time
from sgmllib import SGMLParser

#每日上櫃交易資訊
class brokerBS_otc(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)

    #初始化變數
    def reset(self):
        SGMLParser.reset(self)
        self.webexist = False
        self.broker = False        
        self.brokerid = []        
        self.pricevol = False
        self.price_vol = 0
        self.count = 0
        self.price = []
        self.sellvol = []
        self.buyvol = []

    #解析網頁元素
    def parse(self,data):
        self.feed(data)
        self.close()

    #對網頁表格控制
    def start_td(self, attrs):
        if len(attrs) == 2:
            if attrs[0][1] == 'table-mainbody-left' and attrs[1][1] == '2':
                self.webexist = True
        elif len(attrs) == 1:
            if attrs[0][1] == 'table-body-left' or attrs[0][1] == 'page_table-body-LEFT':
                self.broker = True
            elif attrs[0][1] == 'table-body-right' or attrs[0][1] == 'page_table-body-RIGHT':
                self.pricevol = True
                self.price_vol += 1
                self.price_vol %= 3

    #處理網頁資料
    def handle_data(self, text):

        #取得劵商代碼
        if self.broker:
            data = text.split(" ")
            self.brokerid.append(data[0])
            self.broker = False
            self.count += 1

        #取得買買交易明細
        if self.pricevol:            
            #取得買賣價格
            if self.price_vol == 1:                
                self.price.append(text)
                self.pricevol = False

            #取得買的數量   
            elif self.price_vol == 2:
                self.buyvol.append(text.replace(',', ''))
                self.pricevol = False

            #取得賣的數量 
            elif self.price_vol == 0:
                self.sellvol.append(text.replace(',', ''))
                self.pricevol = False
                self.brokerpric = 0
                
#上市櫃股票代碼與名稱
class Parser_strMode(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)

    def reset(self):
        SGMLParser.reset(self)
        self.stockinfo = False
        self.no = 0
        self.stock = []
        self.col1 = ""
        self.col2 = ""
        self.col3 = ""
        
    def parse(self,data):
        self.feed(data)
        self.close()
        
    def start_td(self, attrs):
        for name, value in attrs:        
            if len(attrs) == 1:                
                if name == 'bgcolor' and value == '#FAFAD2':
                    self.stockinfo = True
                    self.no+=1
                    self.no%=7
               
    def handle_data(self, text):
            if self.stockinfo:                
                if self.no == 1:                   
                    data = text.split("    ")
                    if data[0].isalnum():                    
                        self.col1 = data[0]                        
                        self.col2 = data[1]                    
                elif self.no == 6:
                    self.col3 = text                    
                elif self.no == 7:
                    self.no = 0
                    
                if self.no == 6 and self.col3 == 'ESVUFR' or self.col3 == 'EUOMSR' or self.col3 == 'EMXXXA' or self.col3 == 'ESVUFA':
                    self.stock.append(self.col1)
 
                self.stockinfo = False


def Getstockid(webindex):
    """Download the security listing for *webindex* and parse its stock codes.

    webindex -- value substituted into the ``strMode`` query parameter.

    Returns the ``Parser_strMode`` instance (its ``stock`` list sorted) when
    at least one code was found. Returns 0 when the page could not be
    fetched (non-200 status) or contained no matching code.
    """
    url = "http://brk.twse.com.tw:8000/isin/C_public.jsp?strMode=%s" % webindex
    webcode = urllib.urlopen(url)
    try:
        # Without this guard a failed request left ``stock`` unbound and the
        # code below raised NameError.
        if webcode.code != 200:
            return 0
        stock = Parser_strMode()
        stock.parse(webcode.read())
    finally:
        # Release the connection on every path, not only on success.
        webcode.close()

    stock.stock.sort()
    if stock.stock:
        return stock
    return 0

#將Stock id寫入CSV file
def GetStockidtoCSVfile(filepath, stock):
    """Write the stock codes to *filepath* as a one-column UTF-8 CSV file.

    filepath -- destination file; it is created or overwritten.
    stock    -- sequence of stock code strings, written one per row.

    The file starts with a UTF-8 byte-order mark followed by a header row.
    """
    # ``with`` closes the file even if one of the writes raises.
    with open(filepath, 'wb') as writefile:
        # Mark the file as UTF-8 with a byte-order mark.
        writefile.write(codecs.BOM_UTF8)

        writer = csv.writer(writefile)

        # Header row.
        writer.writerow([u'股票代號'.encode('utf8')])

        # One row per stock code.
        for code in stock:
            writer.writerow(['%s' % code.encode('utf8')])

def GetTodayDate(mode):
    """Return today's date formatted according to *mode*.

    mode == 1 -- Gregorian year, e.g. '20121005' (YYYYMMDD).
    mode == 2 -- Republic of China year (Gregorian year minus 1911)
                 followed by MMDD.
    Any other mode returns the ``datetime`` object itself, unformatted.
    """
    now = datetime.datetime.today()
    month_and_day = now.strftime('%m') + now.strftime('%d')

    if mode == 1:
        # Gregorian calendar.
        return now.strftime('%Y') + month_and_day
    if mode == 2:
        # Republic of China calendar.
        roc_year = int(now.strftime('%Y')) - 1911
        return str(roc_year) + month_and_day
    return now

def _fetch_otc_page(stock_id, page):
    """Download and parse one result page; return a fresh brokerBS_otc parser.

    A non-200 response yields an empty parser (``webexist`` False, no rows)
    instead of leaving the result undefined or reusing a previous page.
    """
    url = "http://www.gretai.org.tw/ch/stock/aftertrading/broker_trading/brokerBS.php?stk_code=%s&topage=%s" % (stock_id, str(page))
    parsed = brokerBS_otc()
    webcode = urllib.urlopen(url)
    try:
        # 200 means the page was served successfully.
        if webcode.code == 200:
            parsed.parse(webcode.read())
    finally:
        webcode.close()
    return parsed


def main(startpos, tradedate=None, workroot='D:\\stock_database\\test\\otc\\'):
    """Fetch the trade details of every OTC stock and save them as CSV.

    startpos  -- index in the sorted stock-code list to start from, so an
                 interrupted run can be resumed.
    tradedate -- date label (e.g. '20121001') used for the output folder,
                 the file names and the date column. Defaults to today's
                 date; the previous hard-coded '20121005' override, which
                 discarded the computed date, has been removed.
    workroot  -- output root, including its trailing path separator; a
                 sub-folder named after the date is created inside it.

    Returns 0 when no stock codes could be obtained, otherwise None.
    """
    todaydate = GetTodayDate(1) if tradedate is None else tradedate

    # Getstockid returns 0 instead of a parser when nothing was found, so
    # read the code list defensively rather than touching ``.stock`` on 0.
    stockid = Getstockid(4)
    codes = getattr(stockid, 'stock', None)
    if not codes:
        return 0

    # Folder that receives today's detail files.
    workdir = workroot + todaydate
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    # Save the list of stock codes next to the detail files.
    StockidFile = workdir + '\\' + todaydate + '.csv'
    GetStockidtoCSVfile(StockidFile, codes)

    total = len(codes)
    for i in range(startpos, total):
        stock_id = codes[i]
        CSVFile = workdir + '\\' + stock_id + '_' + todaydate + '.csv'

        # ``with`` guarantees the file is closed even if a fetch or a write
        # raises part-way through.
        with open(CSVFile, 'wb') as fileobj:
            # Mark the file as UTF-8 with a byte-order mark.
            fileobj.write(codecs.BOM_UTF8)

            writer = csv.writer(fileobj, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

            # Header row.
            writer.writerow([u'序號'.encode('utf8'), u'交易日期'.encode('utf8'), u'股票種類'.encode('utf8'), u'股票代號'.encode('utf8'),
                             u'券商'.encode('utf8'), u'價格'.encode('utf8'), u'買進股數'.encode('utf8'), u'賣出股數'.encode('utf8')])

            # Row counter and retry counter; both span all pages of a stock.
            serialnumber = 1
            Retrytime = 0

            for j in range(1, 5001):
                # Re-fetch until the detail table shows up or the retry
                # budget (more than 30 retries) is used up.
                while True:
                    stock = _fetch_otc_page(stock_id, j)
                    if stock.webexist or Retrytime > 30:
                        break
                    time.sleep(1)

                    print("Retry time : " + str(Retrytime) + " " + stock_id)
                    Retrytime += 1

                print('No : ' + str(j) + ' stock kind : otc stockid : ' + stock_id + ' date : ' + todaydate + u' 券商筆數 : ' + str(len(stock.brokerid)))

                # Indexing (not zip) keeps a misaligned page loud: a missing
                # price/volume cell raises instead of silently dropping rows.
                for k in range(len(stock.brokerid)):
                    writer.writerow(['%d' % serialnumber, '%s' % todaydate, 'otc', '%s' % stock_id,
                                     '%s' % stock.brokerid[k], '%s' % stock.price[k], '%s' % stock.buyvol[k], '%s' % stock.sellvol[k]])
                    serialnumber += 1

                # A page with fewer than 100 rows is the last page.
                if len(stock.brokerid) < 100:
                    stocklog = 'stockid : ' + stock_id + ' Date : ' + todaydate + ' total web page : %s' % (str(j)) + ' progress rate : ' + str(i) + '/' + str(total) + '\n'
                    print(stocklog)
                    break

                print('No : ' + str(j) + ' stockid : ' + stock_id + ' date : ' + todaydate + u" write to csv OK......")
    
# Script entry point: process every stock, starting with the first code.
if __name__ == '__main__':
    main(startpos=0)

程式執行畫面

執行結果