第一行:craw 1 : None。第二行:craw field
主代碼:
#?coding:utf8 #encoding=utf-8** import?url_manager,?html_downloader,?html_parser,?html_outputer class?SpiderMain(object): ????def?__init__(self): ????????self.urls?=?url_manager.UrlManager() ????????self.downloader?=?html_downloader.HtmlDownloader() ????????self.parser?=?html_parser.HtmlParser() ????????self.outputer?=?html_outputer.Htmloutputer() ????def?craw(self,?root_url): ????????count?=?1 ????????self.urls.add_new_url(root_url) ????????while?self.urls.has_new_url(): ????????????try: ????????????????new_url?=?self.urls.get_new_url() ????????????????print?'craw?%d?:?%s'?%?(count,?new_url) ????????????????html_cont?=?self.downloader.download() ????????????????new_urls,?new_data?=?self.parser.parse(new_url,?html_cont) ????????????????self.urls.add_new_urls(new_urls) ????????????????self.outputer.collect_data(new_data) ????????????????if?count?==?1000: ????????????????????print?'done' ????????????????????break ????????????????count?=?count?+?1 ????????????except: ????????????????print?'craw?field' ????????self.outputer.output_html() if?__name__?==?"__main__": ????root_url?=?"https://baike.baidu.com/item/Python/407313?fr=aladdin" ????obj_spider?=?SpiderMain() ????obj_spider.craw(root_url)
管理器:
class?UrlManager(object): ????def?__init__(self): ????????self.new_urls?=?set() ????????self.old_urls?=?set() ????def?add_new_url(self,?url): ????????if?url?is?None: ????????????return ????????if?url?not?in?self.new_urls?and?url?not?in?self.old_urls: ????????????self.new_urls.add(url) ????def?add_new_urls(self,?new_urls): ????????if?urls?is?None?or?len(urls)?==?0: ????????????return ????????for?url?in?urls: ????????????self.add_new_url(url) ????def?has_new_url(self): ????????return?len(self.new_urls)?!=?0 ????def?get_new_url(self): ????????new_url?=?self.new_urls.pop() ????????self.old_urls.add(new_url) ????????return
解析器:
class?HtmlParser(object): ????def?_get_new_urls(self,?page_url,?soup): ????????new_urls?=?set() ????????#?/view/123.htm ????????links?=?soup.find_all('a',?herf=re.compile(r"/item/\d+\.htm")) ????????for?link?in?links: ????????????new_url?=?link['href'] ????????????new_full_url?=?urlparse.urljoin(page_url,?new_url) ????????????new_urls.add(new_full_url) ????????return ????def?_get_new_data(self,?page_url,?soup): ????????res_data?=?{} ????????#?url ????????res_data['url']?=?page_url ????????#<dd?class="lemmaWgt-lemmaTitle-title">?<h1>Python</h1> ????????title_node?=?soup.find('dd',class_=?"lemmaWgt-lemmaTitle-title").find("h1") ????????res_data['title']?=?title_node.get_text() ????????#<div?class="lemma-summary" ????????summary_node?=?soup.find('div',?class_?="lemma-summary") ????????res_data['summary']?=?sumary_node.get_text() ????????return?res_data ????def?parse(self,?pew_url,?html_cont): ????????if?page_url?is?None?or?html_cont?is?None: ????????????return ????????soup?=?BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8') ????????new_urls?=?self._get_new_urls(page_url,?soup) ????????new_data?=?self._get_new_data(page_url,?soup) ????????return?new_urls,?new_data
輸出器:
class?Htmloutputer(object): ????def?__init__(self): ????????self.datas?=?[] ????def?collect_data(self,?new_data): ????????if?data?is?None: ????????????return ????????self.datas.append(data) ????def?output_html(self): ????????fout?=?open('output.html',?'w') ????????fout.write("<html>") ????????fout.write("<body>") ????????fout.write("<table>") ????????for?data?in?self.datas: ????????????fout.write("<tr>") ????????????fout.write("<rd>%s</td>"?%?data['url']) ????????????fout.write("<rd>%s</td>"?%?data['title'].encode('uft-8')) ????????????fout.write("<rd>%s</td>"?%?data['summary'].encode('uft-8')) ????????????fout.write("</tr>") ????????fout.write("</table>") ????????fout.write("<body>") ????????fout.write("<html>")
下載器:
import?urllib2 class?HtmlDownloader(object): ????def?download(self,?url): ????????if?url?is?None: ????????????return??None ????????response?=?urllib2.urlopen(url) ????????if?response.getcode()?!=?200: ????????????return?None ????????return?response.read()
輸出結(jié)果:
craw 1 : None
craw field
Process finished with exit code 0