抓取 http://jsjustweb.jihsun.com.tw/z/zc/zcr/zcr_1101.djhtm 的表格資料，有興趣者可參考如下 Python 程式。
# -*- coding: utf-8 -*-
import urllib, chardet
from sgmllib import SGMLParser
class Parser(SGMLParser):
    """Collect the cells of the page's data table into ``self.info``.

    Capturing is switched on by a ``<tr>`` whose only attribute is
    ``id="oScrollMenu"``.  After that, ``<td>`` cells whose only attribute
    is ``class`` are classified by the class value:

    * ``t2``             -- header cell (text is stripped)
    * ``t4t1``           -- row-label cell; always opens a new row (stripped)
    * ``t3n1`` / ``t3r1`` -- data cell (text is stored verbatim)

    ``self.info`` ends up as a list of rows, each row a list of cell
    strings.  The table width is taken from the first two-attribute
    ``<td>`` whose second attribute is ``colspan``.
    """

    def __init__(self):
        SGMLParser.__init__(self)

    def reset(self):
        """Reset the base parser and all table-extraction state."""
        SGMLParser.reset(self)
        self.start = False          # True once the oScrollMenu row was seen
        self.header = False         # next text chunk belongs to a header cell
        self.item = False           # next text chunk belongs to a row label
        self.data = False           # next text chunk belongs to a data cell
        self.rowcount = 0           # number of completed (full-width) rows
        self.columncount = 0        # cells captured in the current row
        self.tablecolumncount = 0   # table width from colspan; 0 = unknown
        self.info = []              # captured rows: list of lists of strings

    def parse(self, data):
        """Feed the complete document ``data`` to the parser and finish."""
        self.feed(data)
        self.close()

    def start_tr(self, attrs):
        """Enable capturing at ``<tr id="oScrollMenu">`` (sole attribute)."""
        if len(attrs) == 1 and attrs[0] == ('id', 'oScrollMenu'):
            self.start = True

    def start_td(self, attrs):
        """Record the table width and flag the kind of cell being entered.

        ``attrs`` is the list of ``(name, value)`` pairs of the tag.
        """
        # The width is latched once, even before capturing has started.
        if (len(attrs) == 2 and attrs[1][0] == 'colspan'
                and self.tablecolumncount == 0):
            self.tablecolumncount = int(attrs[1][1])

        # Only cells carrying exactly one attribute, ``class``, are captured.
        if not self.start or len(attrs) != 1:
            return
        name, value = attrs[0]
        if name != 'class':
            return

        if value == 't2':
            self.header = True
            # The first header cell of a row opens that row.
            if self.columncount == 0:
                self.info.append([])
        elif value == 't4t1':
            self.item = True
            self.info.append([])
        elif value in ('t3n1', 't3r1'):
            self.data = True

    def handle_data(self, text):
        """Store ``text`` as the content of the cell flagged by start_td.

        Only the first text chunk after a flagged ``<td>`` is captured;
        text arriving while no flag is set is ignored.
        """
        if self.header:
            self.header = False
            cell = text.strip()
        elif self.item:
            self.item = False
            cell = text.strip()
        elif self.data:
            self.data = False
            cell = text             # data cells are kept verbatim
        else:
            return

        # Append to the most recently opened row rather than indexing by
        # rowcount: rowcount only advances when a row reaches the colspan
        # width, so it falls out of step (or points past the end of the
        # list) when the width is unknown or a row is longer than expected.
        if not self.info:
            self.info.append([])
        self.info[-1].append(cell)

        self.columncount += 1
        if (self.tablecolumncount != 0
                and self.columncount == self.tablecolumncount):
            self.columncount = 0
            self.rowcount += 1
def main():
    """Download the page, report its detected encoding and print the table.

    Nothing is printed when the server does not answer with HTTP 200.
    Each captured row is printed as space-separated cells followed by a
    blank line.
    """
    url = "http://jsjustweb.jihsun.com.tw/z/zc/zcr/zcr_1101.djhtm"
    webcode = urllib.urlopen(url)
    try:
        if webcode.code != 200:
            return
        webdata = webcode.read()
    finally:
        # Release the connection even if the status check or read fails.
        webcode.close()

    mychar = chardet.detect(webdata)
    # %s formatting instead of "+": the detected encoding may be None,
    # which would make string concatenation raise TypeError.
    print("Web Encoding : %s" % mychar['encoding'])

    stock = Parser()
    stock.parse(webdata)
    for row in stock.info:
        # The explicit "\n" plus print's own newline leaves a blank line
        # between rows.
        print(" ".join(row) + "\n")


if __name__ == "__main__":
    main()
執行結果如下:
