#!/usr/bin/python # -*- coding: utf-8 -*- #--------------------------------------------- # 抓個股集保資料 # Version : 0.1 # Author : Amin white # Release Date : 2012-01-01 # Python version : 2.7.2 #--------------------------------------------- #引用函式庫 import urllib, csv, datetime, os, codecs, time from datetime import date from sgmllib import SGMLParser def GetTdccinfo(stockid, stockname): startdate = "" enddate = "" #取得今天的日期 today = datetime.datetime.today() year = today.strftime('%Y') month = today.strftime('%m') day = today.strftime('%d') #計算從1970到今天的總秒數 todaysec = time.mktime(datetime.datetime(int(year), int(month), int(day)).timetuple()) #指定儲存的路徑,可自行變更儲存路徑 workdir = 'D:\\Stock_Concentration\\' #建立儲存資料夾 if not os.path.isdir(workdir): os.makedirs(workdir) savefile = workdir + stockid + '.csv' print "寫入" + stockid + " " + stockname.encode('utf8') + " 集保資料到 " + savefile + " 開始..." #開始寫入檔案準備 writefile = file(savefile, 'wb') #指定檔案以UTF8儲存 writefile.write(codecs.BOM_UTF8) #指定CSV檔分隔的方式 writer = csv.writer(writefile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) #寫入欄位說明,可自行變更儲存說明欄位 writer.writerow([u'日期'.encode('utf8'), u'股票代號'.encode('utf8'), u'股票名稱'.encode('utf8'), \ u'1-999'.encode('utf8'), u'1-999'.encode('utf8'), u'1-999'.encode('utf8'), \ u'1,000-5,000'.encode('utf8'), u'1,000-5,000'.encode('utf8'), u'1,000-5,000'.encode('utf8'), \ u'5,001-10,000'.encode('utf8'), u'5,001-10,000'.encode('utf8'), u'5,001-10,000'.encode('utf8'), \ u'10,001-15,000'.encode('utf8'), u'10,001-15,000'.encode('utf8'), u'10,001-15,000'.encode('utf8'),\ u'15,001-20,000'.encode('utf8'), u'15,001-20,000'.encode('utf8'), u'15,001-20,000'.encode('utf8'), \ u'20,001-30,000'.encode('utf8'), u'20,001-30,000'.encode('utf8'), u'20,001-30,000'.encode('utf8'), \ u'30,001-40,000'.encode('utf8'), u'30,001-40,000'.encode('utf8'), u'30,001-40,000'.encode('utf8'),\ u'40,001-50,000'.encode('utf8'), u'40,001-50,000'.encode('utf8'), u'40,001-50,000'.encode('utf8'), \ u'50,001-100,000'.encode('utf8'), u'50,001-100,000'.encode('utf8'), u'50,001-100,000'.encode('utf8'), \ u'100,001-200,000'.encode('utf8'), u'100,001-200,000'.encode('utf8'), u'100,001-200,000'.encode('utf8'), \ u'200,001-400,000'.encode('utf8'), u'200,001-400,000'.encode('utf8'), u'200,001-400,000'.encode('utf8'), \ u'400,001-600,000'.encode('utf8'), u'400,001-600,000'.encode('utf8'), u'400,001-600,000'.encode('utf8'), \ u'600,001-800,000'.encode('utf8'), u'600,001-800,000'.encode('utf8'), u'600,001-800,000'.encode('utf8'), \ u'800,001-1,000,000'.encode('utf8'), u'800,001-1,000,000'.encode('utf8'), u'800,001-1,000,000'.encode('utf8'),\ u'1,000,001以上'.encode('utf8'), u'1,000,001以上'.encode('utf8'), u'1,000,001以上'.encode('utf8')]) DataCount = 0 for y in range(int(year) - 1, int(year) + 1): for m in range(1, 13): cell = [] for d in range(1, 15): #排除2011年特殊的日子 if y == 2011 and m == 5 and d == 2 and date(y, 5, 2).isoweekday() == 1: continue; #計算要取得指定日期的秒數 Revenuedaysec = time.mktime(datetime.datetime(y, m, d).timetuple()) #判別指定日期是否有超過今天的日期 if date(y, m, d).isoweekday() != 6 and date(y, m, d).isoweekday() != 7 and Revenuedaysec <= todaysec: datestring = str(y) + str("%02d" % m) + str("%02d" % d) url = "http://www.tdcc.com.tw/smWeb/QryStock.jsp?SCA_DATE=" + datestring + "&SqlMethod=StockNo&StockNo=" + stockid + "&StockName=&sub=%ACd%B8%DF" cell = TdccData(url) if len(cell) == 0 : break #抓取第一筆集保日期 if startdate == "": startdate = datestring #寫入集保資料,可自行變更儲存資料欄位 writer.writerow([datestring.encode('utf8'), stockid.encode('utf8'), stockname.encode('utf8'), cell[0].encode('utf8'), \ cell[1].encode('utf8'), cell[2].encode('utf8'), cell[3].encode('utf8'), cell[4].encode('utf8'), \ cell[5].encode('utf8'), cell[6].encode('utf8'), cell[7].encode('utf8'), cell[8].encode('utf8'), \ cell[9].encode('utf8'), cell[10].encode('utf8'), cell[11].encode('utf8'), cell[12].encode('utf8'), \ cell[13].encode('utf8'), cell[14].encode('utf8'), cell[15].encode('utf8'), cell[16].encode('utf8'), \ cell[17].encode('utf8'), cell[18].encode('utf8'), cell[19].encode('utf8'), cell[20].encode('utf8'), \ cell[21].encode('utf8'), cell[22].encode('utf8'), cell[23].encode('utf8'), cell[24].encode('utf8'), \ cell[25].encode('utf8'), cell[26].encode('utf8'), cell[27].encode('utf8'), cell[28].encode('utf8'), \ cell[29].encode('utf8'), cell[30].encode('utf8'), cell[31].encode('utf8'), cell[32].encode('utf8'), \ cell[33].encode('utf8'), cell[34].encode('utf8'), cell[35].encode('utf8'), cell[36].encode('utf8'), \ cell[37].encode('utf8'), cell[38].encode('utf8'), cell[39].encode('utf8'), cell[40].encode('utf8'), \ cell[41].encode('utf8'), cell[42].encode('utf8'), cell[43].encode('utf8'), cell[44].encode('utf8') ]) DataCount += 1 #抓取第最後一筆集保日期 enddate = datestring #關閉檔案 writefile.close() print stockname.encode('utf8') + " 集保資料 " + startdate + '~' + enddate + "寫入 " + savefile + " 中,共完成 " + str(DataCount) + " 筆寫入\n" def TdccData(URL): #解析網頁開始 webcode = urllib.urlopen(URL) if webcode.code == 200: Tdcc = ParseWebData() Tdcc.parse(webcode.read()) Tdcc.close() if len(Tdcc.cell) > 0 : return Tdcc.cell else: return [] class ParseWebData(SGMLParser): #初始化class等同constructor def __init__(self): SGMLParser.__init__(self) #初始化變數數值 def reset(self): SGMLParser.reset(self) self.headname = False self.center = False self.right = False self.cell = [] self.centercount = 0 self.rightcount = 0 #解析網頁 def parse(self,data): self.feed(data) self.close() #解析網頁標籤為td的內容 def start_td(self, attrs): for name,value in attrs: if name =='class' and value == 'wuc9': self.headname = True if name =='align' and value == 'center': self.center = True self.centercount+=1 if name =='align' and value == 'right': self.right = True self.rightcount+=1 #開始讀取集保資料到暫存list中 def handle_data(self, text): #print text if self.headname: #print text self.headname = False if self.center: if self.centercount == 1: #print "center : " + text self.center = False if self.centercount == 2: #print "center : " + text self.center = False self.centercount = 0 if self.right : if self.rightcount == 1: #print "right : " + text self.cell.append(text.strip().replace(",", "")) self.right = False if self.rightcount == 2: #print "right : " + text self.cell.append(text.strip().replace(",", "")) self.right = False if self.rightcount == 3: #print "right : " + text self.cell.append(text.strip()) self.rightcount = 0 self.right = False class ParsestrModeWeb(SGMLParser): #初始化變數數值 def reset(self): SGMLParser.reset(self) self.stockinfo = False self.cellno = 0 self.stockid = [] self.stockname = [] self.cell1 = "" #解析網頁標籤為td的內容 def start_td(self, attrs): if len(attrs) == 1: if attrs[0][0] == 'bgcolor' and attrs[0][1] == '#FAFAD2': self.stockinfo = True self.cellno+=1 self.cellno%=7 #開始讀取股票代碼 def handle_data(self, text): if self.stockinfo: if self.cellno == 1: self.cell1 = text.strip() elif self.cellno == 6 and text.strip() == 'ESVUFR': data = self.cell1.strip().split(' \xa1@') self.stockid.append(data[0].strip()) self.stockname.append(data[1].strip().decode('cp950')) elif self.cellno == 7: self.cellno = 0 self.stockinfo = False def main(): #上市, 上櫃股票代碼網址 stocktype = [2,4] for i in range(0, len(stocktype)): url = "http://brk.twse.com.tw:8000/isin/C_public.jsp?strMode=" + str(stocktype[i]) #解析網頁開始 webcode = urllib.urlopen(url) if webcode.code == 200: stock = ParsestrModeWeb() stock.feed(webcode.read()) stock.close() #取得個股集保資料 for j in range(0, len(stock.stockid)): GetTdccinfo(stock.stockid[j], stock.stockname[j]) if __name__ == "__main__": main()
Python執行結果畫面
CSV檔案儲存內容如下