# coding=gbk
'''
Created on 2017-11-23
@author: stu
'''
from baike_spider import url_manager, html_downloader, html_parser,\
    html_outputer
# Initialize the URL manager, downloader, parser, and outputer
class SpiderMain(object):
    """Crawl scheduler that ties together the four collaborating components.

    Attributes are created in ``__init__``:
      urls       -- ``url_manager.UrlManager`` instance (pending-URL bookkeeping)
      downloader -- ``html_downloader.HtmlDownloader`` instance
      parser     -- ``html_parser.HtmlParser`` instance
      outputer   -- ``html_outputer.HtmlOutputer`` instance
    """

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url, max_count=100):
        """Crawl starting from *root_url* and then emit the collected data.

        Each iteration takes one URL from the URL manager, downloads it,
        parses the content into ``(new_urls, new_data)``, feeds the new URLs
        back to the manager and hands the data to the outputer. A page that
        raises an ordinary exception is reported and skipped; it does not
        count towards *max_count*. ``output_html()`` is called once the loop
        ends.

        Args:
            root_url: The first URL handed to the URL manager.
            max_count: Stop after this many pages were processed
                successfully (default 100, the previously hard-coded limit).
        """
        count = 1
        self.urls.add_new_url(root_url)
        # `count` is the 1-based number of the page about to be crawled, so
        # the loop stops right after the max_count-th successful page.
        while count <= max_count and self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                # NOTE(review): `callect_data` looks like a misspelling of
                # `collect_data`; it must match whatever name HtmlOutputer
                # actually defines -- confirm against html_outputer.
                self.outputer.callect_data(new_data)
            except Exception as exc:
                # Only ordinary errors are swallowed; KeyboardInterrupt and
                # SystemExit still propagate so the crawl can be aborted.
                # The reason is printed so failures can be diagnosed.
                print("craw failed: %s" % (exc,))
            else:
                count += 1

        self.outputer.output_html()
# Script entry point: crawl starting from the Baidu Baike "Python" page.
if __name__ == '__main__':
    # The "?fr=aladdin" query string is part of the URL and must be kept.
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
????
????
????
????
Traceback (most recent call last):
  File "D:\eclipse\workspase\imooc\src\baike_spider\spider_main.py", line 45, in <module>
    obj_spider.craw(root_url)
  File "D:\eclipse\workspase\imooc\src\baike_spider\spider_main.py", line 23, in craw
    self.urls.add_new_url(root_url)
TypeError: add_new_url() takes 1 positional argument but 2 were given
添加回答
舉報
0/150
提交
取消