[Web] re:exchange rate crawler
呃...就是網頁的 CSS 改版了,class name 也跟著改,所以原本用 grep 抓 decimal 的做法不管用了...要重寫
於是用前幾篇提到的 Python URL 函式庫重練,這次的抓法不像之前那麼有針對性,比較通用,應該可以撐得更久一點吧?!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
import urllib2 , io | |
from HTMLParser import HTMLParser | |
####################### SETTING ######################## | |
# URL of the bank's exchange-rate page that the crawler downloads
# (the Lang=zh-TW query parameter requests the Traditional Chinese page).
bank_url = "http://rate.bot.com.tw/xrt?Lang=zh-TW"
####################### PARSER IMPLEMENT ###############
class rateParser(HTMLParser): | |
def __init__(self): | |
HTMLParser.__init__(self) | |
self.getData = False | |
self.getCurrencyName = True | |
self.countRate = 0 | |
self.lastTag = "" | |
def handle_starttag(self , tag , attrs): | |
self.lastTag = tag | |
if tag == "td": | |
self.getData = True; | |
def handle_data(self , data): | |
if self.getData == True: | |
if self.lastTag == 'td' and self.countRate < 4: | |
if data.strip() != "": | |
print '%-7s\t' %data.strip() , " " , | |
self.countRate += 1 | |
if self.lastTag == 'div' and self.getCurrencyName == True: | |
if data.strip() != "": | |
print data.strip() , "\t" , | |
self.getCurrencyName = False | |
def handle_endtag(self , tag): | |
if tag == 'td' and self.getData == True: | |
self.getData = False | |
if tag == 'tr': | |
if self.getCurrencyName == False: | |
print "" | |
self.getCurrencyName = True | |
self.countRate = 0 | |
####################### MAIN ###################### | |
if __name__ == '__main__':
    # Build the HTTP request for the rate page.
    req = urllib2.Request(bank_url)
    # Send an old Internet Explorer User-Agent string instead of urllib2's
    # default one, so the request looks like it comes from a browser.
    req.add_header("User-Agent" , 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    # Download the HTML and always release the connection afterwards.
    response = urllib2.urlopen(req)
    try:
        web_page = response.read()
    finally:
        response.close()
    # Parse the page; the parser prints the rate table as it goes.
    rateParser().feed(web_page)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import io , urllib | |
from urllib.request import urlopen | |
from html.parser import HTMLParser | |
####################### SETTING ######################## | |
# URL of the bank's exchange-rate page that the crawler downloads
# (the Lang=zh-TW query parameter requests the Traditional Chinese page).
bank_url = "http://rate.bot.com.tw/xrt?Lang=zh-TW"
####################### PARSER IMPLEMENT ###############
class rateParser(HTMLParser):
    """Print the exchange-rate table of an HTML page, one table row per line.

    Pass the decoded page text to ``feed()``. Within each ``<tr>`` the
    parser prints, in document order, the first non-blank text that follows
    a ``<div>`` inside a ``<td>`` (the currency name) and at most four
    non-blank texts that directly follow a ``<td>`` (the rates), each
    followed by a tab. A newline is printed when a row that produced a
    currency name closes.
    """

    def __init__(self):
        super().__init__()
        self.getData = False         # True while inside a <td> element
        self.getCurrencyName = True  # True until this row's name is printed
        self.countRate = 0           # rate cells printed so far in this row
        self.lastTag = ""            # name of the most recently opened tag

    def handle_starttag(self, tag, attrs):
        """Remember the tag just opened; start capturing text at ``<td>``."""
        self.lastTag = tag
        if tag == "td":
            self.getData = True

    def handle_data(self, data):
        """Print *data* if it is a rate cell or the row's currency name."""
        text = data.strip()
        # Ignore everything outside a <td> and whitespace-only text nodes.
        if not self.getData or not text:
            return
        if self.lastTag == 'td':
            # Text directly after <td>: a rate; only four are shown per row.
            if self.countRate < 4:
                print('{:7}\t '.format(text), end='')
                self.countRate += 1
        elif self.lastTag == 'div' and self.getCurrencyName:
            # Text after a <div> nested in the cell: the currency name,
            # printed once per row.
            print('{} \t '.format(text), end='')
            self.getCurrencyName = False

    def handle_endtag(self, tag):
        """Stop capturing at ``</td>``; finish the line and reset at ``</tr>``."""
        if tag == 'td':
            self.getData = False
        elif tag == 'tr':
            # Only rows that printed a currency name get a line break.
            if not self.getCurrencyName:
                print("")
            self.getCurrencyName = True
            self.countRate = 0
####################### MAIN ###################### | |
if __name__ == '__main__':
    # Download the page; the context manager closes the HTTP connection
    # even if reading or decoding raises.
    with urlopen(bank_url) as response:
        # Decode with the charset the server declares, falling back to the
        # UTF-8 this script previously assumed when none is declared.
        charset = response.headers.get_content_charset() or 'utf-8'
        web_page = response.read().decode(charset)
    # Parse the page; the parser prints the rate table as it goes.
    rateParser().feed(web_page)
留言
張貼留言