非常感謝老師,第一次接觸 Python 已經會下載圖片啦,分享出來大家共同學習。
老師的代碼稍加修改已經可以爬圖片啦。
main
from baike_spider import html_downloader
from baike_spider import html_outputer
from baike_spider import html_parser
from baike_spider import img_downloader
from baike_spider import url_manager


class SpiderMain(object):
    """Crawl pages starting from a root URL, collecting page data and images.

    Wires together the URL manager, HTML downloader, HTML parser, HTML
    outputer and image downloader, and drives them from ``craw``.
    """

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.imgdownloader = img_downloader.ImgDownloader()

    def craw(self, root_url, max_pages=100):
        """Crawl from ``root_url`` until the queue is empty or the limit is hit.

        Args:
            root_url: First URL to visit.
            max_pages: Stop after this many pages were processed without
                error. Defaults to 100, the limit the crawler always used.

        Pages that fail are reported on stdout and skipped; they do not
        count towards ``max_pages``. The HTML report is written once the
        loop ends.
        """
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print(count, new_url)
                html_count = self.downloader.download(new_url)
                if html_count is None:
                    # Nothing was downloaded, so there is nothing to parse.
                    # Report it the same way as any other failed page.
                    print(count, "出現異常")
                    continue
                new_urls, new_data, img_urls = self.parser.parser(new_url, html_count)
                self.urls.add_new_urls(new_urls)  # queue newly discovered URLs
                self.outputer.collect_data(new_data)  # keep data for the HTML report
                self.imgdownloader.downImg(img_urls)  # download the page's images
                if count >= max_pages:
                    break
                count += 1
            except Exception as exc:
                # Best-effort crawl: one broken page must not stop the run.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                print(count, "出現異常", exc)
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://www.lunvshen.org/t-600"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
HtmlOutputer
class HtmlOutputer(object):
    """Collect per-page records and write them out as one HTML table."""

    def __init__(self):
        # Records in the order they were collected; each is a mapping that
        # output_html reads the 'url' and 'title' keys from.
        self.datas = []

    def collect_data(self, new_data):
        """Remember ``new_data`` for the report; ``None`` is ignored."""
        if new_data is None:
            return
        self.datas.append(new_data)

    def output_html(self):
        """Write all collected records to ``yu.html`` in the working directory.

        Each record becomes a table row with a link to its URL and a
        ``<index>-<title>`` cell, numbered from 1. The file is written as
        UTF-8 and URL/title text is HTML-escaped, because both come from
        scraped pages.
        """
        from html import escape

        # The context manager closes the file even if a record is malformed.
        with open('yu.html', 'w', encoding='utf-8') as fout:
            fout.write("<html>")
            fout.write("<body>")
            fout.write("<table>")
            for count, data in enumerate(self.datas, start=1):
                url = escape(str(data['url']), quote=True)
                title = escape('%d-%s' % (count, data['title']))
                fout.write("<tr>")
                fout.write("<td>")
                fout.write('<a target="_blank" href ="')
                fout.write(url)
                fout.write('">')
                fout.write(url)
                fout.write("</a>")
                fout.write("</td>")
                fout.write("<td>%s</td>" % title)
                fout.write("</tr>")
            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")
HtmlParser
import re
import urllib
from urllib import parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extract topic links, the page title and image tags from a page."""

    # Compiled once; both patterns are applied to every parsed page.
    _TOPIC_LINK_RE = re.compile(r"/t-\d+")
    _SINA_IMG_RE = re.compile(r"http://ww\d\.sinaimg")

    def parser(self, new_url, html_count):
        """Parse one downloaded page.

        Args:
            new_url: URL the page was fetched from.
            html_count: Raw page content as returned by the downloader.

        Returns:
            A ``(new_urls, new_data, new_imgs)`` tuple. When either argument
            is ``None`` the tuple is ``(set(), None, set())``, so callers
            that unpack three values keep working.
        """
        if new_url is None or html_count is None:
            return set(), None, set()
        soup = BeautifulSoup(html_count, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(soup)
        new_data = self._get_new_data(new_url, soup)
        new_imgs = self._get_img(soup)
        return new_urls, new_data, new_imgs

    @staticmethod
    def _get_new_urls(soup, base_url='http://www.lunvshen.org/'):
        """Return the set of absolute URLs for every ``/t-<digits>`` link.

        ``base_url`` is what relative hrefs are resolved against.
        """
        links = soup.find_all('a', href=HtmlParser._TOPIC_LINK_RE)
        return {urllib.parse.urljoin(base_url, link['href']) for link in links}

    @staticmethod
    def _get_new_data(new_url, soup):
        """Return ``{'url', 'title'}`` for the page, or ``None`` without a title.

        The title is the text of the ``<h1>`` inside
        ``<div class="topic-title-main">``.
        """
        title_box = soup.find('div', class_='topic-title-main')
        title_node = title_box.find('h1') if title_box is not None else None
        if title_node is None:
            # No title block on this page: return no record instead of
            # raising AttributeError and losing the page's links and images.
            return None
        return {'url': new_url, 'title': title_node.get_text()}

    @staticmethod
    def _get_img(soup):
        """Return the set of ``<img>`` tags whose ``src`` matches the sinaimg pattern.

        The tags themselves are returned, not URL strings; the consumer
        reads each tag's ``src`` attribute.
        """
        return set(soup.find_all('img', src=HtmlParser._SINA_IMG_RE))
ImgDownloader
import os
import posixpath
import urllib.parse
import urllib.request


class ImgDownloader(object):
    """Download images referenced by parsed ``<img>`` tags to a local folder."""

    # Directory used when the caller does not pass ``save_dir`` (a macOS path).
    DEFAULT_SAVE_DIR = '/Users/lucky/Downloads/img'

    def downImg(self, img_urls, save_dir=None):
        """Download every image in ``img_urls``.

        Args:
            img_urls: Iterable of objects exposing ``.get('src')``, or
                ``None``.
            save_dir: Target directory; defaults to ``DEFAULT_SAVE_DIR``.
                It is created if it does not exist.

        Returns:
            List of file paths that were written. Entries without a ``src``
            and downloads that fail are reported on stdout and skipped, so
            one bad image does not abort the rest.
        """
        if img_urls is None:
            return []
        if save_dir is None:
            save_dir = self.DEFAULT_SAVE_DIR

        saved = []
        for count, imgLink in enumerate(img_urls, start=1):
            trueImgUrl = imgLink.get('src')
            if not trueImgUrl:
                continue
            imgName = self._image_name(trueImgUrl, count)
            filesavepath = os.path.join(save_dir, '%s.jpg' % imgName)
            try:
                os.makedirs(save_dir, exist_ok=True)
                # Download the image.
                urllib.request.urlretrieve(trueImgUrl, filesavepath)
            except (OSError, ValueError) as exc:
                # URLError is an OSError; ValueError covers malformed URLs.
                print("下載失敗::", trueImgUrl, exc)
                continue
            saved.append(filesavepath)
            print("下載成功::", filesavepath)
        return saved

    @staticmethod
    def _image_name(url, index):
        """Build ``<last 14 chars of the file stem>-<index>`` from an image URL.

        Only the final path component is used, so the name never contains
        ``/`` or query text. For URLs ending in a long ``xxxx.jpg`` name the
        result equals the old ``url[len-18:len-4]`` slice.
        """
        path = urllib.parse.urlsplit(url).path
        stem = posixpath.splitext(posixpath.basename(path))[0]
        return '%s-%d' % (stem[-14:], index)
UrlManager
class UrlManager(object):
    """Track which URLs are still waiting to be crawled and which are done."""

    def __init__(self):
        self.new_urls = set()  # waiting to be crawled
        self.old_urls = set()  # already handed out by get_new_url

    def add_new_url(self, url):
        """Queue ``url`` unless it is ``None``, already queued or already crawled."""
        if url is None:
            return
        already_known = url in self.new_urls or url in self.old_urls
        if not already_known:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; ``None`` or an empty collection is a no-op."""
        if urls is None or not len(urls):
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """Return ``True`` while at least one URL is waiting."""
        return len(self.new_urls) > 0

    def get_new_url(self):
        """Remove one waiting URL, mark it as crawled and return it."""
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked
HtmlDownloader
from urllib import request


class HtmlDownloader(object):
    """Fetch the raw content of a page."""

    @staticmethod
    def download(url):
        """Return the response body for ``url``, or ``None``.

        ``None`` is returned when ``url`` is ``None`` or when the response
        code is not 200. Errors raised by ``urlopen`` propagate to the
        caller.
        """
        if url is None:
            return None
        # The context manager closes the response on every path, including
        # the early non-200 return.
        with request.urlopen(url) as response:
            if response.getcode() != 200:
                return None
            return response.read()
2017-03-27
看了網頁內容,更加
堅定了我學爬蟲的信念
2017-05-14
請問我在正則的時候匹配不出結果是為什麼?那個 print links 的結果是 []