

Crawler fails to run: craw 1 : https://baike.baidu.com/item/Python/407313?fr=aladdin craw failed


qq_str_6 2018-04-01 15:40:20
I'm using Python 2.7.

# spider_main.py (main module)
# coding: utf8
from baike_spider import url_manager, html_downloader, html_outputer, \
    html_parser


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s ' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)


# url_manager.py
# coding: utf8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


# html_parser.py
# coding: utf8
import re
import urlparse
from bs4 import BeautifulSoup


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # /item/Python/407313?fr=aladdin
        links = soup.find_all('a', href=re.compile(r"/item/Python/\d+\?fr=aladdin"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_date(self, page_url, soup):
        res_date = {}
        # url
        res_date['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_date['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_date['summary'] = summary_node.get_text()
        return res_date

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data


# html_downloader.py
# coding: utf8
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
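
Reading the posted code, one likely cause of the "craw failed" output stands out: in html_parser.py the extraction method is defined as _get_new_date, but parse() calls self._get_new_data(page_url, soup), so parsing raises AttributeError on every page, and the bare except: in SpiderMain.craw swallows it and only prints "craw failed". Below is a minimal sketch (Python 2.7) of html_parser.py with the method renamed to match the call; it assumes the other modules stay as posted and keeps the original selectors and URL pattern.

# coding: utf8
# html_parser.py -- minimal sketch: _get_new_date renamed to _get_new_data so it
# matches the self._get_new_data(...) call in parse(); everything else as posted.
import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        # Collect /item/Python/<id>?fr=aladdin links and turn them into absolute URLs.
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/item/Python/\d+\?fr=aladdin"))
        for link in links:
            new_urls.add(urlparse.urljoin(page_url, link['href']))
        return new_urls

    def _get_new_data(self, page_url, soup):  # was _get_new_date in the post
        res_data = {'url': page_url}
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None  # return a pair so the caller's unpacking never fails
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        return self._get_new_urls(page_url, soup), self._get_new_data(page_url, soup)

To confirm what is actually failing, temporarily replace the bare except: in SpiderMain.craw with except Exception as e: print 'craw failed', e (or call traceback.print_exc()), so the real exception is printed instead of being hidden.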

No answers yet

  • 0 answers
  • 0 following
  • 1520 views