2012年8月11日 星期六

Python抓每月合併營收

使用 Python 解析網頁元素,取得公開資訊觀測站所公佈的每月合併營收。
#!/usr/bin/python
# -*- coding: utf-8 -*-

#---------------------------------------------
#   抓上市與上櫃各股每月營收
#   Version : 1.1
#   Author : Amin white
#   Release Date : 2012-06-27
#   Python version : 2.7.2
#---------------------------------------------

#引用函式庫
import csv, codecs, urllib, datetime, os, time, pdb
from sgmllib import SGMLParser

def main():
    #上市櫃公司自90年6月才有登入月營收資料

    #包含上市與上櫃
    stockkind = ['sii', 'otc']

    #指定儲存的路徑,可自行變更儲存路徑
    Savefiledir = 'D:\\Consolidated_Revenue\\'    

    #建立儲存營收CSV資料夾
    if not os.path.isdir(Savefiledir):
        os.makedirs(Savefiledir)

    #取得使用當天的日期
    today = datetime.datetime.today()
    todaysec = time.mktime(datetime.datetime(int(today.strftime('%Y')), int(today.strftime('%m')), int(today.strftime('%d'))).timetuple())
    
    for i in xrange(len(stockkind)):
        stocktype = stockkind[i]
        for j in range(2012, 2013):
            pyADYear = str(j)
            pyROCYear = str(j - 1911)
            for k in range(1, 13): 

                #取得從1月至今日每月的營收                
                Revenuedaysec = time.mktime(datetime.datetime(j, k, 10).timetuple())
                if Revenuedaysec <= todaysec:
                    
                    print '取得 ' + pyADYear + ' 年 ' + str('%02d' %k) + ' 月 ' + stocktype + ' 全部公司營收資料' 
                    #營收網址
                    url = 'http://mops.twse.com.tw/mops/web/ajax_t21sb06?TYPEK=' + stocktype + '&year= ' + pyROCYear + '&month=' + str('%02d' %k) + '&step=1&firstin=1&off=1'

                     #解析網頁開始
                    webcode = urllib.urlopen(url)
                    if webcode.code == 200:
                        stock = Parser_htm()
                        stock.parse(webcode.read())
                        webcode.close()
                        
                    #儲存CSV檔名
                    SaveCSVname = Savefiledir + stocktype + '_' + pyADYear + str('%02d' %k) + '.csv'
                    print '設定寫入檔案名稱與格式內容......'

                    #開始寫入檔案準備
                    fileoption = codecs.open(SaveCSVname, 'wb')

                    #指定檔案以UTF8儲存
                    fileoption.write(codecs.BOM_UTF8)

                    #指定CSV檔分隔的方式
                    writer = csv.writer(fileoption, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

                    print '寫入營收至 ' + SaveCSVname +  ' 開始......'
                    #寫入欄位說明
                    writer.writerow([u'公司代號'.encode('utf8'), u'公司名稱'.encode('utf8'), u'當月營收'.encode('utf8'), u'上月營收'.encode('utf8'),\
                                     u'去年當月營收'.encode('utf8'), u'上月比較增減(%)'.encode('utf8'), u'去年同月增減(%)'.encode('utf8'), u'當月累計營收'.encode('utf8'), \
                                     u'去年累計營收'.encode('utf8'), u'前期比較增減(%)'.encode('utf8')])

                    for i in xrange(len(stock.totaldata)):
                        totaldata = stock.totaldata[i]

                        #寫入每間公司各營收資料
                        writer.writerow([totaldata[0].encode('utf8'), totaldata[1].encode('utf8'), totaldata[2].encode('utf8'), \
                                         totaldata[3].encode('utf8'), totaldata[4].encode('utf8'), totaldata[5].encode('utf8'), \
                                         totaldata[6].encode('utf8'), totaldata[7].encode('utf8'), totaldata[8].encode('utf8'), \
                                         totaldata[9].encode('utf8')])
                    #關閉檔案
                    fileoption.close()                    
                    print '寫入營收至 ' + SaveCSVname + ' 完成......\n'                    
        

#解析營收網頁class
class Parser_htm(SGMLParser):

    #初始化class
    def __init__(self):
        SGMLParser.__init__(self)

    #初始化變數數值
    def reset(self):
        SGMLParser.reset(self)
        self.bPrintDetail = False
        self.bEven_Odd_Row = False
        self.Id_Company_count = 0
        self.bId_Company = False
        self.Rowcount = 0
        self.bRow = False
        self.btblHead = False
        self.tblHeadcount =0
        self.bHeadname = False
        self.Headnamecount = 0
        self.Headname = []
        self.stockdata = []
        self.totaldata = []

    #解析網頁    
    def parse(self,data):
        self.feed(data)
        self.close()

    #解析網頁標籤為tr的內容
    def start_tr(self, attrs):
        if len(attrs) == 1:
            if attrs[0][0] == 'class':
                if attrs[0][1] == 'tblHead':
                    self.tblHeadcount += 1
                    
                if attrs[0][1] == 'even' or attrs[0][1] == 'odd':
                    self.bEven_Odd_Row = True

    def start_td(self, attrs):
        if len(attrs) == 0 and self.bEven_Odd_Row:
            self.Id_Company_count += 1
            self.bId_Company = True
           
        if len(attrs) == 1:
            if attrs[0][0] == 'align' and attrs[0][1] == 'right':
                self.Rowcount += 1
                self.bRow = True
                
    def start_th(self, attrs):
        if len(attrs) == 0 and self.tblHeadcount == 2:
            self.Headnamecount += 1
            self.bHeadname = True        

    def handle_data(self, text):        
        if self.bHeadname and self.Headnamecount <= 9:
            #print text.strip().decode('utf8').encode('utf8')
            self.Headname.append(text.strip().decode('utf8').encode('utf8'))
            self.bHeadname = False
        else:
            self.Headnamecount = 0
            
        
        if self.bId_Company:
            if self.Id_Company_count == 1:
                if self.bPrintDetail:
                    print self.Headname[0] + ' : ' + text.strip()
                self.stockdata.append(text.strip().decode('utf8'))
            elif self.Id_Company_count == 2:
                #data = text.strip().decode('BIG5')
                if self.bPrintDetail:
                    print self.Headname[1] + ' : ' + text.strip()
                self.stockdata.append(text.strip().decode('utf8'))
                self.Id_Company_count = 0
            self.bId_Company = False
                
        if self.bRow:
            self.bRow = False
            if self.Rowcount < 8:
                if self.bPrintDetail:
                    print self.Headname[self.Rowcount + 2] + ' : ' + text.strip().replace(",", "")
                self.stockdata.append(text.strip())
            elif self.Rowcount == 8:
                if self.bPrintDetail:
                    print self.Headname[self.Rowcount + 1] + ' : ' + text.strip().replace(",", "") + '\n'
                self.stockdata.append(text.strip())
                self.Rowcount = 0
                self.totaldata.append(self.stockdata)
                self.stockdata = []
                
            """if self.Rowcount == 1:
                print '當月合併營收 : ' + text.strip()
            elif self.Rowcount == 2:
                print '上月合併營收 : ' + text.strip()
            elif self.Rowcount == 3:
                print '去年當月合併營收 : ' + text.strip()
            elif self.Rowcount == 4:
                print '上月比較增減(%) : ' + text.strip()
            elif self.Rowcount == 5:
                print '去年同月增減(%) : ' + text.strip()
            elif self.Rowcount == 6:
                print '當年累計營收 : ' + text.strip()
            elif self.Rowcount == 7:
                print '去年累計營收 : ' + text.strip()
            elif self.Rowcount == 8:
                print '前期比較增減(%) : ' + text.strip() + '\n'
                self.Rowcount = 0""" 
            
# Script entry point: call main() only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Python執行畫面如下圖所示


CSV檔案儲存內容如下