課程
                    
                        /后端開發
                        
                            /Python
                        
                        /Python開發簡單爬蟲

只輸出了兩條內容，但去掉try并沒有報錯。。。要哭了，請各位大神幫忙看看！

（1）spider_main

#coding:utf8

import sys

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back before switching the implicit
# str/unicode codec from ASCII to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

class UrlManager(object):
    """Track which URLs are still waiting to be crawled and which were handed out."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already returned by get_new_url()

    def add_new_url(self, url):
        """Queue *url* unless it is None or is already pending/visited."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; None or an empty collection is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one pending URL, record it as visited and return it.

        Raises:
            KeyError: if no URL is pending (set.pop() on an empty set).
        """
        # set.pop() removes and returns an arbitrary element.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

（2）html_downloader

#coding:utf8

import urllib2

import sys

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back before switching the implicit
# str/unicode codec from ASCII to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

class HtmlDownloader(object):
    """Fetch the raw content of a page over HTTP."""

    def download(self, url):
        """Return the body of *url*.

        Returns None when *url* is None or the response status is not 200.
        Errors raised by urllib2.urlopen() propagate to the caller.
        """
        if url is None:
            return None

        response = urllib2.urlopen(url)
        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # Release the connection on every path, including the early
            # return above and a failed read().
            response.close()

（3）html_parser

#coding:utf8

from bs4 import BeautifulSoup

import re

import urlparse

import sys

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back before switching the implicit
# str/unicode codec from ASCII to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

class HtmlParser(object):
    """Extract follow-up links and a title/summary record from a downloaded page."""

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs for every '/item/' link in *soup*.

        Relative hrefs are resolved against *page_url*. An empty set is
        returned when the page contains no matching link.
        """
        # e.g. /item/%E6%95%99%E5%AD%A6 -> https://baike.baidu.com/item/%E6%95%99%E5%AD%A6
        links = soup.find_all('a', href=re.compile(r"/item/."))
        new_urls = set()
        for link in links:
            new_urls.add(urlparse.urljoin(page_url, link['href']))
        # The return must come after the loop: returning inside it yields
        # only the first link, so the crawl runs out of URLs almost at once.
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Return a dict with the page's 'url', 'title' and 'summary'.

        Raises:
            AttributeError: if the title or summary node is not found
                (soup.find() returns None in that case).
        """
        # Collect the extracted fields in res_data.
        res_data = {}

        # Keep the page URL as part of the final record.
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        """Parse *html_cont* and return a (new_urls, new_data) tuple.

        Returns None (not a tuple) when *page_url* or *html_cont* is None.
        """
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

（4）html_outputer

#coding:utf8

import sys

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back before switching the implicit
# str/unicode codec from ASCII to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

class HtmlOutputer(object):
    """Collect parsed page records and dump them as an HTML table."""

    def __init__(self):
        self.datas = []  # records in the order they were collected

    def collect_data(self, data):
        """Remember *data* for later output; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write every collected record as one table row to 'output.html'.

        Each record is read through its 'url', 'title' and 'summary' keys.
        """
        # The with-block closes the file even if a record raises mid-write.
        with open('output.html', 'w') as fout:
            fout.write("<html>")
            fout.write("<body>")
            fout.write("<table>")

            # Python's default encoding is ASCII, so text that should be
            # written as UTF-8 is encoded explicitly with .encode('utf-8').
            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
                fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
                fout.write("</tr>")

            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

（5）url_manager

#coding:utf8

import sys

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back before switching the implicit
# str/unicode codec from ASCII to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

class UrlManager(object):
    """Track which URLs are still waiting to be crawled and which were handed out."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already returned by get_new_url()

    def add_new_url(self, url):
        """Queue *url* unless it is None or is already pending/visited."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; None or an empty collection is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one pending URL, record it as visited and return it.

        Raises:
            KeyError: if no URL is pending (set.pop() on an empty set).
        """
        # set.pop() removes and returns an arbitrary element.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

? ??

?

慕尼黑2527285

2018-12-12

源自：Python開發簡單爬蟲

關注問題我要回答

488

操作

收起

舉報

0/150

提交

取消

Python開發簡單爬蟲

參與學習 227603 人
解答問題 1282 個

本教程帶您解開python爬蟲這門神奇技術的面紗

進入課程

只輸出了兩條內容，但去掉try并沒有報錯。。。要哭了，請各位大神幫忙看看！

我要回答關注問題

使用 Ctrl+D 可將網站添加到書簽

微信客服

購課補貼
聯系客服咨詢優惠詳情

幫助反饋 APP下載

慕課網APP
您的移動學習伙伴

公眾號

掃描二維碼
關注慕課網微信公眾號

第七色在线视频,2021少妇久久久久久久久久,亚洲欧洲精品成人久久av18,亚洲国产精品特色大片观看完整版,孙宇晨将参加特朗普的晚宴

熱搜

最近搜索清空

只輸出了兩條內容，但去掉try并沒有報錯。。。要哭了，請各位大神幫忙看看！

只輸出了兩條內容，但去掉try并沒有報錯。。。要哭了，請各位大神幫忙看看！

只輸出了兩條內容，但去掉try并沒有報錯。。。要哭了，請各位大神幫忙看看！

只輸出了兩條內容，但去掉try并沒有報錯。。。要哭了，請各位大神幫忙看看！