The crawler stops responding after fetching a few URLs. What is going on?
# coding: utf-8
import urllib2


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)

        if response.getcode() != 200:
            return None

        return response.read()


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write('<html>')
        fout.write('<body>')
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write('<table>')

        # ascii in python
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')

        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')

        fout.close()


from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            print 'no new url'
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 20:
                    break

            except:
                print 'craw failed'
            count = count + 1

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
I set the limit to 20 and ran the script several times. The number of pages crawled differs on every run; only twice did it successfully crawl all 20.
Here are the test results:
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
craw 4 : http://baike.baidu.com/view/10812277.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
craw 4 : http://baike.baidu.com/view/10812277.htm
craw 5 : http://baike.baidu.com/view/32571.htm
craw 6 : http://baike.baidu.com/view/757238.htm
craw 7 : http://baike.baidu.com/view/2888099.htm
craw 8 : http://baike.baidu.com/view/1369367.htm
craw 9 : http://baike.baidu.com/view/737.htm
craw 10 : http://baike.baidu.com/view/38648.htm
craw 11 : http://baike.baidu.com/view/230750.htm
craw 12 : http://baike.baidu.com/view/560583.htm
craw 13 : http://baike.baidu.com/view/902.htm
craw 14 : http://baike.baidu.com/view/18979.htm
craw 15 : http://baike.baidu.com/view/1158766.htm
craw 16 : http://baike.baidu.com/view/7977.htm
craw 17 : http://baike.baidu.com/view/1298929.htm
craw 18 : http://baike.baidu.com/view/642401.htm
craw 19 : http://baike.baidu.com/view/52940.htm
craw 20 : http://baike.baidu.com/view/449909.htm
>>> 
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
craw 4 : http://baike.baidu.com/view/10812277.htm
craw 5 : http://baike.baidu.com/view/32571.htm
craw 6 : http://baike.baidu.com/view/757238.htm
craw 7 : http://baike.baidu.com/view/2888099.htm
craw 8 : http://baike.baidu.com/view/1369367.htm
craw 9 : http://baike.baidu.com/view/737.htm
craw 10 : http://baike.baidu.com/view/38648.htm
craw 11 : http://baike.baidu.com/view/230750.htm
craw 12 : http://baike.baidu.com/view/560583.htm
craw 13 : http://baike.baidu.com/view/902.htm
craw 14 : http://baike.baidu.com/view/18979.htm
craw 15 : http://baike.baidu.com/view/1158766.htm
craw 16 : http://baike.baidu.com/view/7977.htm
craw 17 : http://baike.baidu.com/view/1298929.htm
craw 18 : http://baike.baidu.com/view/642401.htm
craw 19 : http://baike.baidu.com/view/52940.htm
craw 20 : http://baike.baidu.com/view/449909.htm
>>> 
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
craw 4 : http://baike.baidu.com/view/10812277.htm
craw 5 : http://baike.baidu.com/view/32571.htm
craw 6 : http://baike.baidu.com/view/757238.htm
craw 7 : http://baike.baidu.com/view/2888099.htm
craw 8 : http://baike.baidu.com/view/1369367.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
craw 4 : http://baike.baidu.com/view/10812277.htm
craw 5 : http://baike.baidu.com/view/32571.htm
craw 6 : http://baike.baidu.com/view/757238.htm
craw 7 : http://baike.baidu.com/view/2888099.htm
craw 8 : http://baike.baidu.com/view/1369367.htm
craw 9 : http://baike.baidu.com/view/737.htm
craw 10 : http://baike.baidu.com/view/38648.htm
craw 11 : http://baike.baidu.com/view/230750.htm
craw 12 : http://baike.baidu.com/view/560583.htm
craw 13 : http://baike.baidu.com/view/902.htm
craw 14 : http://baike.baidu.com/view/18979.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
craw 3 : http://baike.baidu.com/view/53557.htm
craw 4 : http://baike.baidu.com/view/10812277.htm
craw 5 : http://baike.baidu.com/view/32571.htm
craw 6 : http://baike.baidu.com/view/757238.htm
craw 7 : http://baike.baidu.com/view/2888099.htm
craw 8 : http://baike.baidu.com/view/1369367.htm
craw 9 : http://baike.baidu.com/view/737.htm
==================== RESTART: G:/Python/crawler_baidu.py ====================
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/16068.htm
2016-06-29
The order of the statements at the end of your loop is reversed: the count increment sits outside the try block, so count advances even when a page fails, while the "if count == 20: break" check only runs on success.
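If that is the issue, one possible rearrangement of craw (a sketch only, assuming the same UrlManager, HtmlDownloader, HtmlParser, and HtmlOutputer classes as in the question) keeps the increment inside the try block so a failed page does not consume one of the 20 slots, and prints the actual exception instead of swallowing it:

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 20:
                    break
                count = count + 1  # increment only after a successful crawl
            except Exception as e:
                print 'craw failed:', e  # show why this page failed
        self.outputer.output_html()

Printing the exception should also tell you whether the failures are timeouts, HTTP errors, or parse errors, which narrows down why the run stalls.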
2016-05-22
The site may have anti-crawler measures in place.
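If so, sending a browser-like User-Agent usually helps, and passing a timeout to urlopen matters even more here: without one, a stalled connection blocks forever, which would look exactly like the crawler "not responding". A minimal sketch of a hardened downloader (the User-Agent string and the 10-second timeout are example values, not requirements):

    # coding: utf-8
    import socket
    import urllib2


    class HtmlDownloader(object):

        def download(self, url):
            if url is None:
                return None
            # Send a browser-like User-Agent; the exact string is an example.
            request = urllib2.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36'})
            try:
                # Without a timeout, urlopen can block indefinitely on a
                # stalled connection, which looks like the crawler stopping.
                response = urllib2.urlopen(request, timeout=10)
            except (urllib2.URLError, socket.timeout):
                return None
            if response.getcode() != 200:
                return None
            return response.read()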
Or the structure of the pages may have changed.
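When the markup changes, soup.find(...) returns None and the chained .find('h1') / .get_text() calls raise AttributeError, which the bare except in craw then hides. A defensive sketch of _get_new_data (same HtmlParser class as in the question) that skips pages whose structure no longer matches:

    def _get_new_data(self, page_url, soup):
        res_data = {'url': page_url}

        # Either node can be missing if Baidu Baike changes its markup,
        # so check for None before dereferencing.
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        summary_node = soup.find('div', class_="lemma-summary")
        if title_node is None or title_node.find('h1') is None \
                or summary_node is None:
            return None  # collect_data() already ignores None

        res_data['title'] = title_node.find('h1').get_text()
        res_data['summary'] = summary_node.get_text()
        return res_data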