# Main program (主程序)
from baike_spider import url_manager, html_downloader, html_parser, html_outputer
class SpiderMain(object):
    """Crawl scheduler that ties the spider's four collaborators together.

    The URL manager hands out pages to visit, the downloader fetches them,
    the parser extracts follow-up links plus page data, and the outputer
    collects that data and writes the final report.
    """

    def __init__(self):
        """Create one instance of each collaborator from ``baike_spider``."""
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url, max_count=100):
        """Crawl pages starting at ``root_url`` and write the collected data.

        Args:
            root_url: Seed URL registered with the URL manager before the
                loop starts.
            max_count: Stop once this many pages have been processed
                successfully (default 100, the previously hard-coded
                limit). At least one page is always attempted.

        A page that raises during fetch/parse/collect is reported on stdout
        and skipped; it does not count towards ``max_count``. The outputer's
        ``output_html()`` is called once the loop ends.
        """
        count = 1
        self.urls.add_new_url(root_url)

        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print('craw %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count >= max_count:
                    break
                count = count + 1
            except Exception as exc:
                # Catch Exception rather than using a bare ``except:`` so that
                # KeyboardInterrupt / SystemExit can still stop the crawl, and
                # report the reason instead of hiding it.
                print('craw failed: %s' % exc)
        self.outputer.output_html()
if __name__ == "__main__":
    # Script entry point: crawl outward from a single seed page.
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
# Error output (錯誤提示)
Traceback (most recent call last):
  File "F:\eclipse\spider\baike_spider\spider_main.py", line 34, in <module>
    obj_spider.craw(root_url)
  File "F:\eclipse\spider\baike_spider\spider_main.py", line 12, in craw
    self.urls.add_new_url(root_url)
  File "F:\eclipse\spider\baike_spider\url_manager.py", line 11, in add_new_url
    if url not in self.new_urls and url not in self.old.urls:
AttributeError: 'UrlManager' object has no attribute 'old'
2016-09-27
self.old_urls 寫成了 self.old.urls -_-  (typo in url_manager.py: self.old_urls was written as self.old.urls)