若想抓取 http://jsjustweb.jihsun.com.tw/z/zc/zcr/zcr_1101.djhtm 的資料,有興趣者可參考以下 Python 程式。
# -*- coding: utf-8 -*-
"""Download one HTML page and print the table rows found in it.

Runs on Python 2 (``sgmllib`` / ``urllib``) and on Python 3, where
``sgmllib`` no longer exists and a small ``html.parser`` based stand-in is
used instead.
"""
import sys

try:  # Python 2
    from sgmllib import SGMLParser
    from urllib import urlopen
except ImportError:  # Python 3: sgmllib was removed and urlopen moved
    from html.parser import HTMLParser
    from urllib.request import urlopen

    class SGMLParser(HTMLParser):
        """Stand-in that routes start tags to ``start_<tag>`` methods.

        That dispatch is the only SGMLParser-specific feature ``Parser``
        relies on; ``feed``, ``close``, ``reset`` and ``handle_data`` are
        provided by ``HTMLParser`` itself.
        """

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, 'start_' + tag, None)
            if handler is not None:
                handler(attrs)


class Parser(SGMLParser):
    """Collect the text of selected table cells into ``self.info``.

    Collection starts once a ``<tr>`` whose only attribute is
    ``id="oScrollMenu"`` has been seen.  From then on ``<td>`` cells whose
    only attribute is ``class`` are captured: ``t2`` (header) and ``t4t1``
    (row label) cells are stored stripped, ``t3n1`` / ``t3r1`` (value) cells
    are stored verbatim.  ``self.info`` is a list of rows, each row being a
    list of cell strings.
    """

    def __init__(self):
        SGMLParser.__init__(self)

    def reset(self):
        """Reset the base parser and all collection state."""
        SGMLParser.reset(self)
        self.start = False          # True once the oScrollMenu row was seen
        self.header = False         # next text chunk belongs to a 't2' cell
        self.item = False           # next text chunk belongs to a 't4t1' cell
        self.data = False           # next text chunk is a 't3n1'/'t3r1' cell
        self.rowcount = 0           # index in self.info of the row being filled
        self.columncount = 0        # cells stored so far in the current row
        self.tablecolumncount = 0   # row width, taken from the first colspan
        self.info = []

    def parse(self, data):
        """Feed the complete document *data* and finish parsing."""
        self.feed(data)
        self.close()

    def start_tr(self, attrs):
        """Switch collection on at the row whose only attribute is the id."""
        if len(attrs) == 1 and attrs[0] == ('id', 'oScrollMenu'):
            self.start = True

    def start_td(self, attrs):
        """Record the table width and flag cells whose text must be kept."""
        # The first two-attribute cell whose second attribute is `colspan`
        # fixes the number of columns per row.
        if (len(attrs) == 2 and attrs[1][0] == 'colspan'
                and self.tablecolumncount == 0):
            self.tablecolumncount = int(attrs[1][1])

        # Only cells after the start row whose sole attribute is `class`.
        if not self.start or len(attrs) != 1 or attrs[0][0] != 'class':
            return

        css_class = attrs[0][1]
        if css_class == 't2':
            self.header = True
            if self.columncount == 0:
                # First header cell of a row opens that row.
                self.info.append([])
        elif css_class == 't4t1':
            self.item = True
            self.info.append([])
        elif css_class in ('t3n1', 't3r1'):
            self.data = True

    def handle_data(self, text):
        """Store *text* if a preceding ``<td>`` flagged it for capture.

        Each flag is consumed by the first text chunk that follows it.
        NOTE(review): an empty cell therefore leaves its flag set for the
        next text chunk -- confirm the page never has empty captured cells.
        """
        if self.header:
            self.header = False
            self._store(text.strip())
        elif self.item:
            self.item = False
            self._store(text.strip())
        elif self.data:
            self.data = False
            # NOTE(review): value cells were never stripped; kept as is.
            self._store(text)

        # A full row has been collected: move on to the next one.
        if (self.tablecolumncount != 0
                and self.columncount == self.tablecolumncount):
            self.columncount = 0
            self.rowcount += 1

    def _store(self, cell):
        """Append *cell* to the current row, creating the row if missing."""
        # A value cell can arrive before any header/label cell opened the
        # row; create it instead of failing with IndexError.
        while len(self.info) <= self.rowcount:
            self.info.append([])
        self.info[self.rowcount].append(cell)
        self.columncount += 1


def main():
    """Fetch the page, report its detected encoding and print the table."""
    # Imported lazily so that Parser stays usable without chardet installed.
    import chardet

    url = "http://jsjustweb.jihsun.com.tw/z/zc/zcr/zcr_1101.djhtm"
    webcode = urlopen(url)
    try:
        if webcode.code != 200:
            sys.stderr.write("Unexpected HTTP status: %s\n" % webcode.code)
            return
        webdata = webcode.read()
    finally:
        # Release the connection on every path, not only on success.
        webcode.close()

    encoding = chardet.detect(webdata)['encoding']
    # %s formatting: chardet reports None when it cannot guess the encoding.
    print("Web Encoding : %s" % encoding)

    if not isinstance(webdata, str):
        # Python 3: read() returns bytes, which HTMLParser.feed() rejects.
        # On Python 2 the raw byte string is parsed unchanged, as before.
        webdata = webdata.decode(encoding or 'utf-8', 'replace')

    stock = Parser()
    stock.parse(webdata)

    # Same layout as the original `print cell,` loop followed by
    # `print "\n"`: cells separated by a space, then a blank line.
    for row in stock.info:
        sys.stdout.write("".join(cell + " " for cell in row) + "\n\n")


if __name__ == "__main__":
    main()
執行結果如下: