Part 1: the scheduler, SpiderMain

from baike_spider import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        # Wire up the four components: URL manager, downloader, parser, outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # stop after 1000 pages
                    break
                count = count + 1
            except Exception as e:
                # Report the actual error instead of swallowing it silently.
                print("craw failed:", e)
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/python/407313"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
Part 2: the URL manager, UrlManager

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        # Only queue a URL that has never been seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
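A quick sanity check of the dedup behavior (a standalone sketch; the URL is just the root page from Part 1):

manager = UrlManager()
manager.add_new_url("https://baike.baidu.com/item/python/407313")
manager.add_new_url("https://baike.baidu.com/item/python/407313")  # duplicate, ignored
print(manager.has_new_url())  # True
url = manager.get_new_url()   # moves the URL into old_urls
manager.add_new_url(url)      # already crawled, so it is not re-queued
print(manager.has_new_url())  # False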
Part 3: the HTML parser, HtmlParser
from bs4 import BeautifulSoup
import urllib.parse  # 'import urllib' alone does not reliably expose urllib.parse
import re

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Entry links look like /item/<name>; a pattern pinned to
        # /item/Python/\d+ matches almost nothing, so the crawl dies
        # after the first page (see the comments below).
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None  # keep the caller's tuple unpacking from raising
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
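To see what the link extraction does in isolation, here is a standalone sketch; the HTML snippet is fabricated for illustration:

from bs4 import BeautifulSoup
import urllib.parse
import re

sample = '<a href="/item/%E8%AE%A1%E7%AE%97%E6%9C%BA">computer</a>'
soup = BeautifulSoup(sample, 'html.parser')
link = soup.find('a', href=re.compile(r"/item/"))
full_url = urllib.parse.urljoin("https://baike.baidu.com/item/python/407313", link['href'])
print(full_url)  # https://baike.baidu.com/item/%E8%AE%A1%E7%AE%97%E6%9C%BA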
Part 4: the HTML downloader, HtmlDownloader

import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()  # bytes; BeautifulSoup decodes them via from_encoding
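Several comments below report html_cont coming back empty. One common cause is the site serving different (or no) content to urllib's default User-Agent; here is a hedged variant of the downloader that sends a browser-like header (the header string is only an example):

import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Pretend to be a browser; the exact User-Agent string is arbitrary.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
        response = urllib.request.urlopen(req)
        if response.getcode() != 200:
            return None
        return response.read()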
Part 5: the HTML outputer, HtmlOutputer

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding="utf-8")
        fout.write("<html>")
        fout.write('<meta charset="utf-8">')
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()  # flush and close so output.html is completely written
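An equivalent sketch of output_html using a context manager, so the file is closed even if a write raises midway:

    def output_html(self):
        # Same table layout as above; 'with' guarantees the file is closed.
        with open('output.html', 'w', encoding="utf-8") as fout:
            fout.write('<html><meta charset="utf-8"><body><table>')
            for data in self.datas:
                fout.write("<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
                           % (data['url'], data['title'], data['summary']))
            fout.write("</table></body></html>")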
2019-05-26
The regular expression needs to be fixed.
2019-04-27
html_cont comes back with no content.
2019-03-14
craw 1 : https://baike.baidu.com/item/Python/407313
craw failed
That is all I get when it runs. I wrote the code the same way; it stops without iterating and just prints this. Have you found the cause?
2019-02-21
Note that this code has some flaws.
2019-02-20
The IDE used is PyCharm.