2012年10月8日 星期一

Python 抓取上櫃交易明細(2)

使用 Python 以檔案下載的方式,抓取上櫃每日交易明細,並將資料存入 CSV 檔。
#!/usr/bin/python
# -*- coding: utf-8 -*-
#---------------------------------------------
#   用下載方式,抓上櫃交易明細
#   Version : 1.1
#   Author : Amin white
#   Release Date : 2012-06-27
#   Python version : 2.7.2
#---------------------------------------------

import csv, urllib, codecs, os, shutil, datetime
from urllib import urlretrieve, urlencode
from sgmllib import SGMLParser

def WriteSaledayetoCSV(saledate):
    """Record a trading date in the OTC trading-day CSV if it is not there yet.

    Args:
        saledate: Date string, either in ROC-calendar form (year, then
            ``MMDD``, e.g. ``'1011008'``) or already in Gregorian
            ``YYYYMMDD`` form (e.g. ``'20121008'``).  ROC dates are
            converted to Gregorian before being compared and stored.
    """
    # Location of the trading-day file; adjust to the local layout.
    saledatefile = 'D:\\stock_database\\importMySQL_otc\\OTC.csv'

    # An 8-character value is already Gregorian and is stored unchanged.
    # Otherwise everything before the trailing MMDD is the ROC year.  Only
    # that prefix is rewritten: str.replace() would also rewrite the same
    # digits wherever they recur in the month/day part.
    if len(saledate) == 8:
        tempdate = saledate
    else:
        tempdate = str(int(saledate[:-4]) + 1911) + saledate[-4:]

    # Collect the dates already stored.  Reading uses its own handle because
    # reading and then writing through one append-mode handle without an
    # intervening seek is not portable.
    # NOTE(review): the first two rows are never compared -- presumably they
    # are header lines; confirm against the real file layout.
    saletemplist = []
    if os.path.isfile(saledatefile):
        with open(saledatefile, 'rb') as existing:
            for row, col in enumerate(csv.reader(existing)):
                # Blank lines come back as empty lists; skip them.
                if row > 1 and col:
                    saletemplist.append(col[0].strip())

    # Append the date only when it has not been recorded before.  No BOM is
    # written: in append mode it would end up in the middle of the file.
    if tempdate not in saletemplist:
        with open(saledatefile, 'ab') as fileoption:
            csv.writer(fileoption).writerow([tempdate.encode('utf8')])

#上市櫃股票代碼與名稱
class Parser_strMode(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)

    def reset(self):
        SGMLParser.reset(self)
        self.stockinfo = False
        self.no = 0
        self.stock = []
        self.col1 = ""
        self.col2 = ""
        self.col3 = ""
        
    def parse(self,data):
        self.feed(data)
        self.close()
        
    def start_td(self, attrs):
        for name, value in attrs:        
            if len(attrs) == 1:                
                if name == 'bgcolor' and value == '#FAFAD2':
                    self.stockinfo = True
                    self.no+=1
                    self.no%=7
               
    def handle_data(self, text):
            if self.stockinfo:                
                if self.no == 1:                   
                    data = text.split("    ")
                    if data[0].isalnum():                    
                        self.col1 = data[0]                        
                        self.col2 = data[1]                    
                elif self.no == 6:
                    self.col3 = text                    
                elif self.no == 7:
                    self.no = 0
                    
                if self.no == 6 and self.col3 == 'ESVUFR' or self.col3 == 'EUOMSR' or self.col3 == 'EMXXXA' or self.col3 == 'ESVUFA':
                    self.stock.append(self.col1)
                    self.col3 = "" #[2014/8/3]修正
                self.stockinfo = False


def Getstockid(webindex):
    """Download the TWSE ISIN listing page and return the parsed codes.

    Args:
        webindex: Value substituted into the ``strMode`` query parameter.

    Returns:
        The ``Parser_strMode`` instance, with its ``stock`` list sorted, when
        at least one code was collected.  ``0`` when the HTTP status is not
        200 or when no code was found.
    """
    url = "http://isin.twse.com.tw/isin/C_public.jsp?strMode=%s" % webindex

    webcode = urllib.urlopen(url)
    try:
        # Without this guard a non-200 reply left the parser variable unbound
        # and the code below failed with UnboundLocalError.
        if webcode.code != 200:
            return 0

        stock = Parser_strMode()
        stock.parse(webcode.read())
    finally:
        # Release the connection on every path, including parse errors.
        webcode.close()

    stock.stock.sort()

    # Callers treat 0 as "nothing could be read".
    if stock.stock:
        return stock
    return 0

#將Stock id寫入CSV file
def GetStockidtoCSVfile(filepath, stock):
    """Write the stock codes to *filepath* as a one-column CSV file.

    The file starts with a UTF-8 byte-order mark, followed by a header row
    and one row per code.  An existing file is overwritten.

    Args:
        filepath: Destination path of the CSV file.
        stock: Iterable of stock-code strings.
    """
    # The with-block closes the file even if a row fails to be written.
    with open(filepath, 'wb') as writefile:
        # Mark the file as UTF-8.
        writefile.write(codecs.BOM_UTF8)

        writer = csv.writer(writefile)

        # Header row.
        writer.writerow([u'股票代號'.encode('utf8')])

        for stockid in stock:
            writer.writerow([stockid.encode('utf8')])

def GetTodayDate(mode):
    """Return today's date as a compact string.

    Args:
        mode: ``1`` for the Gregorian form ``YYYYMMDD``; ``2`` for the ROC
            form, where the year is the Gregorian year minus 1911.

    Returns:
        The formatted date string.  For any other *mode* the
        ``datetime.datetime`` object for today is returned unformatted.
    """
    now = datetime.datetime.today()

    gregorian_year = now.strftime('%Y')
    month_and_day = now.strftime('%m') + now.strftime('%d')

    if mode == 1:
        # Gregorian calendar
        return gregorian_year + month_and_day
    if mode == 2:
        # ROC calendar
        return str(int(gregorian_year) - 1911) + month_and_day

    return now

def GetCSVfilefromweb(todaydate, stock):
    """Download one CSV per stock code into a folder named after the date.

    The folder is created when missing and also receives a CSV listing all
    codes.  One progress line is printed per downloaded file.

    Args:
        todaydate: Date string used in the folder name, the file names and
            the ``stk_date`` request field.
        stock: List of stock codes; each is sent as the ``curstk`` field.
    """
    download_folder = 'D:\\stock_database\\test\\otc\\' + todaydate + '\\'

    if not os.path.isdir(download_folder):
        os.makedirs(download_folder)

    # Keep the list of requested codes next to the downloads.
    GetStockidtoCSVfile(download_folder + todaydate + '.csv', stock)

    total = str(len(stock))
    for position, stockid in enumerate(stock):
        form_fields = urllib.urlencode({'curstk': stockid, 'stk_date': todaydate})
        destination = download_folder + str(stockid) + '_' + todaydate + '.csv'

        # The encoded fields are passed as the request body via data=.
        urlretrieve('http://www.gretai.org.tw/ch/stock/aftertrading/broker_trading/download_ALLCSV.php?', destination, data=form_fields)

        counter = '%03d' % position
        print(counter + ' : ' + destination + ' ...... ' + counter + '/' + total)

def _otc_trade_rows(col):
    """Yield (broker id, price, buy volume, sell volume) for one source row.

    An 11-column row carries two records side by side, starting at columns 0
    and 6; a record is skipped when its first column is empty.  A 5-column
    row carries one record.  Rows of any other width yield nothing.
    Thousands separators are stripped from both volume fields.
    """
    if len(col) == 11:
        offsets = [start for start in (0, 6) if col[start]]
    elif len(col) == 5:
        offsets = [0]
    else:
        offsets = []

    for start in offsets:
        yield (col[start + 1].split("  ")[0],
               col[start + 2],
               col[start + 3].replace(',', ''),
               col[start + 4].replace(',', ''))


def Transformcsv(stock):
    """Convert today's downloaded CSVs into the normalised import layout.

    For every code in *stock* the file ``<code>_<ROC date>.csv`` is read and
    ``<code>_<Gregorian date>.csv`` is written in the target folder, with a
    UTF-8 BOM, a header row and every field quoted.  The first three lines of
    each source file are skipped.  The target folder also receives a CSV
    listing all codes, and one progress line is printed per stock.

    Args:
        stock: List of stock-code strings.
    """
    saledate = GetTodayDate(1)   # Gregorian
    todaydate = GetTodayDate(2)  # ROC

    sourcecsvdir = 'D:\\stock_database\\test\\otc\\%s\\' % todaydate
    targetdir = 'D:\\stock_database\\importMySQL_otc\\test\\%s\\' % saledate
    if not os.path.isdir(targetdir):
        os.makedirs(targetdir)

    GetStockidtoCSVfile(targetdir + saledate + '.csv', stock)

    header = [u'交易日期'.encode('utf8'), u'股票種類'.encode('utf8'), u'股票代號'.encode('utf8'),
              u'券商'.encode('utf8'), u'價格'.encode('utf8'), u'買進股數'.encode('utf8'), u'賣出股數'.encode('utf8')]

    for i, stockid in enumerate(stock):
        sourcecsv = sourcecsvdir + stockid + "_" + todaydate + '.csv'
        targetfile = targetdir + stockid + "_" + saledate + '.csv'
        salecount = 0

        # Both files are closed by the with-blocks even when a row fails.
        with open(targetfile, 'wb') as writefile:
            writefile.write(codecs.BOM_UTF8)
            writer = csv.writer(writefile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow(header)

            with open(sourcecsv, 'r') as readfile:
                for row, col in enumerate(csv.reader(readfile)):
                    if row < 3:
                        continue
                    for brokerid, price, buyvol, sellvol in _otc_trade_rows(col):
                        salecount += 1
                        # NOTE(review): the fields read from the source are
                        # byte strings, so .encode() only succeeds while they
                        # are pure ASCII -- confirm for the broker column.
                        writer.writerow([saledate.encode('utf8'), 'otc', stockid.encode('utf8'), brokerid.encode('utf8'),
                                         price.encode('utf8'), buyvol.encode('utf8'), sellvol.encode('utf8')])

        print('%03d' % i + ' Stockid : ' + stockid + ' transform ' + todaydate + ' count : ' + '%05d' % salecount + ' ...... ' + '%03d' % i + '/' + str(len(stock)))


def main():
    """Run the daily job: fetch codes, download, transform, record the date."""
    # Fetch the stock codes.  Getstockid returns 0 when nothing could be
    # read, in which case there is nothing to download.
    stock = Getstockid(4)
    if not stock:
        print('Unable to retrieve the stock code list; nothing was downloaded.')
        return

    # Today's date in ROC form, e.g. 1011001.
    todaydate = GetTodayDate(2)

    # Download the per-stock trading CSV files.
    GetCSVfilefromweb(todaydate, stock.stock)

    # Convert the downloaded files to the import layout.
    Transformcsv(stock.stock)

    # Record the trading day.  WriteSaledayetoCSV converts from the ROC
    # calendar itself, so it must be given the ROC-format date.
    WriteSaledayetoCSV(todaydate)


if __name__ == "__main__":
    main()

[2014/8/3] 修正:抓取上櫃股票代碼時,變數中殘存上一筆的 CFICode,導致下一筆判別錯誤,以致股票代碼重複出現。