# spider_main
from baike_spider import url_manager, html_downloader
from baike_spider import html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        # Wire up the four collaborators: URL queue, downloader, parser, outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                # parse() returns follow-up links plus the data scraped from this page.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 1000:  # stop after 1000 pages
                    break
                count += 1
            except Exception:
                print('craw failed')

        self.outputer.output_html()

# url_manager
class UrlManager(object):
    def __init__(self):
        # new_urls: pages waiting to be crawled; old_urls: pages already crawled.
        self.new_urls = set()
        self.old_urls = set()
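    # The fragment above only captures UrlManager's state. The four methods
    # craw() calls are sketched below; this set-based de-duplication is an
    # assumption, not part of the original listing.
    def add_new_url(self, url):
        # Queue a URL only if it has never been seen before.
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Pop an arbitrary pending URL and remember it as crawled.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
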
if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
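
# html_downloader and html_parser are imported above but not listed here. The
# two sketches below are assumptions that satisfy the interface craw() uses:
# download(url) returns the page HTML, and parse(url, html_cont) returns the
# follow-up links plus a data dict with the url/title/summary keys that
# HtmlOutputer expects. The Baidu Baike selectors (/item/ links, the lemma
# title and summary classes) are guesses and may need updating as the site
# markup changes.

# html_downloader
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read().decode('utf-8')

# html_parser
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return set(), None
        soup = BeautifulSoup(html_cont, 'html.parser')
        # Follow-up links: other Baike entries have hrefs of the form /item/<name>.
        new_urls = set()
        for link in soup.find_all('a', href=re.compile(r'/item/')):
            new_urls.add(urljoin(page_url, link['href']))
        # Scraped data: keys must match what HtmlOutputer writes out.
        data = {'url': page_url}
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        data['title'] = title_node.get_text(strip=True) if title_node else ''
        summary_node = soup.find('div', class_='lemma-summary')
        data['summary'] = summary_node.get_text(strip=True) if summary_node else ''
        return new_urls, data
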
# html_outputer
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # utf-8 so Chinese titles and summaries are written out intact.
        fout = open('output.html', 'w', encoding='utf-8')

        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')

        fout.close()
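
# Note: the scraped fields go into the table verbatim. If a title or summary
# ever contains markup-significant characters, escaping each value with the
# standard-library html.escape() would keep the page well-formed, e.g.:
#     fout.write('<td>%s</td>' % html.escape(data['title']))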