2012年8月11日 星期六

Python每月營收

使用 Python 解析網頁元素,取得公開資訊觀測站上所公佈的上市與上櫃公司每月營收,並將結果儲存為 CSV 檔。

#!/usr/bin/python
# -*- coding: utf-8 -*-

#---------------------------------------------
#   抓上市與上櫃各股每月營收
#   Version : 1.1
#   Author : Amin white
#   Release Date : 2012-06-27
#   Python version : 2.7
#---------------------------------------------

#引用函式庫
import csv, codecs, urllib, datetime, os, time
from sgmllib import SGMLParser
    
def main():
    """Download monthly revenue pages and save one CSV per market per month.

    For each market type ('sii' = listed, 'otc' = over-the-counter) and each
    month of the configured year(s) whose 10th day is not in the future, the
    matching page is fetched, parsed with ``Parser_htm`` and written to
    ``<Savefiledir><market>_<YYYY><MM>.csv`` as UTF-8 with a BOM.

    A month whose page does not answer with HTTP 200 is skipped; no CSV is
    written for it.

    Written for Python 2.7 (``urllib.urlopen``, byte-string CSV writing).
    """
    # The original author notes that monthly revenue data only exists from
    # June of ROC year 90 onward.

    # Both listed (sii) and OTC (otc) companies are fetched.
    stockkind = ['sii', 'otc']

    # Output directory; edit this to store the CSV files elsewhere.
    Savefiledir = 'D:\\Revenue\\'

    # Create the output directory on first use.
    if not os.path.isdir(Savefiledir):
        os.makedirs(Savefiledir)

    # Comparing plain dates is equivalent to comparing the midnight
    # timestamps of the same days.
    today = datetime.date.today()

    # CSV column titles, encoded once because csv.writer in Python 2 expects
    # byte strings.
    header = [title.encode('utf8') for title in (
        u'產業別', u'公司代號', u'公司名稱', u'當月營收', u'上月營收',
        u'去年當月營收', u'上月比較增減(%)', u'去年同月增減(%)', u'當月累計營收',
        u'去年累計營收', u'前期比較增減(%)',
    )]

    for stocktype in stockkind:
        for year in range(2012, 2013):
            pyADYear = str(year)
            # The site's URLs use the ROC calendar year (AD year - 1911).
            pyROCYear = str(year - 1911)
            for month in range(1, 13):

                # Only months whose 10th day has already been reached are fetched.
                if datetime.date(year, month, 10) > today:
                    continue

                month_label = '%02d' % month
                print('取得 ' + pyADYear + ' 年 ' + month_label + ' 月 ' + stocktype + ' 全部公司營收資料')

                # Revenue page URL.
                url = "http://mops.twse.com.tw/t21/" + stocktype + "/t21sc03_" + pyROCYear + "_" + str(month) + ".html"

                # Fetch and parse the page. ``stock`` stays None unless the
                # server answered 200, so a failed month can never reuse the
                # previous month's parsed rows.
                stock = None
                webcode = urllib.urlopen(url)
                try:
                    status = webcode.code
                    if status == 200:
                        stock = Parser_htm()
                        stock.parse(webcode.read())
                finally:
                    # Always release the connection, also on non-200 answers.
                    webcode.close()

                if stock is None:
                    print('無法取得 ' + url + ' (HTTP ' + str(status) + '),略過此月份\n')
                    continue

                # Target CSV file name.
                SaveCSVname = Savefiledir + stocktype + '_' + pyADYear + month_label + '.csv'
                print('設定寫入檔案名稱與格式內容......')

                # ``with`` guarantees the file is closed even if a row fails
                # to encode or write.
                with codecs.open(SaveCSVname, 'wb') as fileoption:

                    # Mark the file as UTF-8 with a byte-order mark.
                    fileoption.write(codecs.BOM_UTF8)

                    # Comma-separated, every field quoted.
                    writer = csv.writer(fileoption, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

                    print('寫入營收至 ' + SaveCSVname + ' 開始......')
                    writer.writerow(header)

                    # One row per company; only the columns that have a
                    # header (the first 11 fields) are written.
                    for row in stock.totaldata:
                        writer.writerow([field.encode('utf8') for field in row[:len(header)]])

                print('寫入營收至 ' + SaveCSVname + ' 完成......\n')
            
#解析營收網頁class
class Parser_htm(SGMLParser):

    #初始化class等同constructor
    def __init__(self):
        SGMLParser.__init__(self)

    #初始化變數數值
    def reset(self):
        SGMLParser.reset(self)
        self.bPrintDetail = False
        self.bStartParserHeml = False
        self.bStartParserdata = False
        self.bDataempty = False
        
        self.bItemclass = False
        self.szitemclass = ""
        self.bStockid = False
        self.szstockid = ""
        self.bCompanyname = False
        self.nowrapflag = False
        self.nowrapcount = 0
        self.stockdata = []
        self.totaldata = []

    #解析網頁
    def parse(self,data):
        self.feed(data)
        self.close()

    #解析網頁標籤為tr的內容
    def start_tr(self, attrs):
        if len(attrs) == 1:
            if attrs[0][0] == 'align':
                if attrs[0][1] == 'right':
                    self.bStartParserdata = True

    #解析網頁標籤為th的內容
    def start_th(self, attrs):
        #抓取準備解析資料內容起點
        if len(attrs) == 2:
            if attrs[0][0] == 'class' and attrs[0][1] == 'tt' and attrs[1][0] == 'align' and attrs[1][1] == 'left':              
                self.bItemclass = True
                self.szitemclass = ""
                self.bStartParserHeml = True
                
        #抓取準備解析資料內容終點       
        if len(attrs) == 4:
            if attrs[0][0] == 'class' and attrs[0][1] == 'tt' and \
               attrs[1][0] == 'nowrap' and attrs[1][1] == 'nowrap' and \
               attrs[2][0] == 'colspan' and attrs[2][1] == '2' and \
               attrs[3][0] == 'align' and attrs[3][1] == 'center':
                self.bStartParserHeml = False
                self.bStartParserdata = False                

    #解析網頁標籤為td的內容
    def start_td(self, attrs):
        #解析td標籤屬性名稱與屬性
        if len(attrs) == 1:
            if attrs[0][0] == 'align':
                
                #抓股票代碼與資料解析起點
                if attrs[0][1] == 'center':
                    self.bStartParserdata = True
                    self.bStockid = True
                    
                #抓公司名稱起點   
                if attrs[0][1] == 'left':
                    self.bCompanyname = True
                    
            #收尋資料為空字串的終點與解析各營收內容起點
            if attrs[0][0] == 'nowrap' and attrs[0][1] == 'nowrap':
                self.nowrapflag = True
                self.bDataempty = False
                self.nowrapcount += 1                

        #收尋資料為空字串的起點
        if len(attrs) == 0:
            self.bDataempty = True

    #取得網頁表格內容是文字,數字以外的資料
    def handle_entityref(self,ref):
        #解析資料內容為空字串
        if ref == 'nbsp':
            if self.bDataempty:
                self.stockdata.append("")
                self.nowrapcount += 1
                #print ref + " " + str(self.nowrapcount)
            
    #開始讀取各公司營收資料到暫存list中
    def handle_data(self, text):
        #產業別
        if self.bItemclass:
            data = unicode(text.strip(), "BIG5").encode('utf8').split(':')
            self.szitemclass = unicode(data[1], "utf8")
            if self.bPrintDetail:
                print "產業別 :          " + self.szitemclass.encode('utf8') 
            self.bItemclass = False
               
        if self.bStartParserdata:
            
            #公司代碼或稱股票代碼
            if self.bStockid :
                if self.bPrintDetail:
                    print "公司代號 :        " + text.strip()
                self.szstockid = text.strip()
                self.stockdata.append(self.szitemclass)
                self.stockdata.append(self.szstockid)
                self.bStockid = False
                self.nowrapcount = 0

            #公司名稱
            if self.bCompanyname :
                #使用BIG5解碼,因為支援字型不夠,就需要加入以下的內容,若使用cp950解碼,只需加入網頁無法呈現的字型即可                
                """if self.szstockid == '1325':
                    data = u'恒大'
                elif self.szstockid == '2353':
                    data = u'宏碁'
                elif self.szstockid == '3046':
                    data = u'建碁'
                elif self.szstockid == '6285':
                    data = u'啟碁科技'
                elif self.szstockid == '4527':
                    data = u'方土霖'
                elif self.szstockid == '8111':
                    data = u'立碁電子'
                elif self.szstockid == '6174':
                    data = u'安碁科技'
                else:
                    data = text.strip().decode('BIG5')"""
                
                if self.szstockid == '4527':
                    data = u'方土霖'
                else:
                    data = text.strip().decode('cp950')
                    
                if self.bPrintDetail:
                    print "公司名稱 :        " + data.encode('utf8')
                self.stockdata.append(data)
                self.bCompanyname = False            

            if self.nowrapflag :
                self.nowrapflag = False
                #各營收資料
                if self.nowrapcount == 1:
                    if self.bPrintDetail:
                        print "當月營收 :        " + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 2:
                    if self.bPrintDetail:
                        print "上月營收 :        " + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 3:
                    if self.bPrintDetail:
                        print "去年當月營收 :     " + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 4:
                    if self.bPrintDetail:
                        print "上月比較增減(%) :  " + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 5:
                    if self.bPrintDetail:
                        print "去年同月增減(%) :  " + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 6:
                    if self.bPrintDetail:
                        print "當月累計營收 :     "  + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 7:
                    if self.bPrintDetail:
                        print "去年累計營收 :     " + text.strip()
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                elif self.nowrapcount == 8:
                    if self.bPrintDetail:
                        print "前期比較增減(%) :  " + text.strip() + "\n"
                    data = text.strip().replace(",", "")
                    self.stockdata.append(data)
                    self.totaldata.append(self.stockdata)
                    self.stockdata = []

# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()

Python執行畫面如下圖所示

CSV檔案儲存內容如下