2013年11月30日 星期六

Python抓取個股財務比率合併財報季表

本文示範如何抓取「個股財務比率合併財報季表」,以台泥(1101)為例。
資料來源為 http://jsjustweb.jihsun.com.tw/z/zc/zcr/zcr_1101.djhtm ,有興趣的讀者可參考以下 Python 2 程式(使用 sgmllib 與 chardet)。
# -*- coding: utf-8 -*-

import urllib, chardet
from sgmllib import SGMLParser
    
class Parser(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)

    def reset(self):
        SGMLParser.reset(self)
        self.start = False
        self.header = False
        self.item = False        
        self.data = False
        self.rowcount = 0
        self.columncount = 0
        self.tablecolumncount = 0
        self.info = []        
        
    def parse(self,data):
        self.feed(data)
        self.close()

    def start_tr(self, attrs):
        for name, value in attrs:
            if len(attrs) == 1 and name == 'id':
                if value == 'oScrollMenu':
                    self.start = True
                    
    def start_td(self, attrs):
        if len(attrs) == 2:
            if attrs[1][0] == 'colspan' and self.tablecolumncount == 0:
                self.tablecolumncount = int(attrs[1][1])

        if self.start:
            for name, value in attrs:                     
                if len(attrs) == 1 and name == 'class':
                    if value == 't2':
                        self.header = True
                        if self.columncount == 0:
                            self.info.append([])
                    elif value == 't4t1':                        
                        self.item = True
                        self.info.append([])
                    elif value == 't3n1' or value == 't3r1':
                        self.data = True                    
        
    def handle_data(self, text):
        if self.header:
            #print "header = "+ text
            self.header = False
            self.info[self.rowcount].append(text.strip())
            self.columncount += 1
            
        elif self.item:
            #print "Item = "+ text
            self.item = False            
            self.info[self.rowcount].append(text.strip())            
            self.columncount += 1
            
        elif self.data:
            #print "Data = " + text
            self.data = False            
            self.info[self.rowcount].append(text)
            self.columncount += 1
            
        if self.columncount == self.tablecolumncount and self.tablecolumncount != 0:
            self.columncount = 0
            self.rowcount += 1
                        
def main():
    url = "http://jsjustweb.jihsun.com.tw/z/zc/zcr/zcr_1101.djhtm"
    webcode = urllib.urlopen(url)
    if webcode.code == 200:
        stock = Parser()
        webdata = webcode.read()
        mychar = chardet.detect(webdata)
        print "Web Encoding : " + mychar['encoding']
        stock.parse(webdata)        
        webcode.close()

    for i in range(0, len(stock.info)):        
        for j in range(0, len(stock.info[i])):
            print  stock.info[i][j],
        print "\n"
        
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()   

執行結果如下: