This uses Python 2.7.
# main: spider_main.py
# coding: utf8
from baike_spider import url_manager, html_downloader, html_outputer, \
    html_parser


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # stop after 1000 pages
                    break
                count = count + 1
            except:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
# url_manager.py
# coding: utf8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
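A quick standalone check of the manager's dedup behaviour (run at the bottom of url_manager.py; the URL is just the tutorial's root entry):

url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
manager = UrlManager()
manager.add_new_url(url)
manager.add_new_url(url)         # duplicate, silently ignored
print manager.has_new_url()      # True: one URL is waiting
crawled = manager.get_new_url()  # pops it and records it in old_urls
manager.add_new_url(crawled)     # already crawled, ignored
print manager.has_new_url()      # False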
# html_parser.py
# coding: utf8
import re
import urlparse
from bs4 import BeautifulSoup


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry pages link to each other with relative hrefs like /item/xxx;
        # a pattern pinned to /item/Python/\d+\?fr=aladdin would only ever
        # re-match the root page, so match any entry link instead
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
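To see what parse() returns without hitting the network, it can be fed a hand-written fragment that mimics the page structure (the fragment and the Guido link are made up for illustration):

# standalone demo of HtmlParser on a made-up fragment, not a real baike page
html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/item/Guido">Guido</a>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse("https://baike.baidu.com/item/Python/407313", html)
print new_urls               # set(['https://baike.baidu.com/item/Guido'])
print new_data['title']      # Python
print new_data['summary']    # Python is a programming language.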
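SpiderMain also needs html_outputer.py, which only has to provide collect_data() and output_html(); a minimal sketch that buffers each data dict and dumps a table to output.html could look like this (the interface comes from spider_main.py, everything else is an assumption):

# html_outputer.py
# coding: utf8
# minimal sketch: only the interface (collect_data, output_html) is taken
# from spider_main.py; the file format below is an assumption
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html><body><table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table></body></html>")
        fout.close()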
# html_downloader.py
# coding: utf8
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
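If baike.baidu.com rejects the default urllib2 User-Agent (a common cause of 'craw failed' on every page), download() can send a browser-like header via urllib2.Request instead; the header value below is only an example:

# html_downloader.py, variant that sends a browser-like User-Agent (example value)
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib2.urlopen(request)
        if response.getcode() != 200:
            return None
        return response.read()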