課程
                    
                        /后端開發(fā)
                        
                            /Python
                        
                        /Python開發(fā)簡(jiǎn)單爬蟲

我一點(diǎn)點(diǎn)查代碼沒問題，結(jié)果還是NameError: name 'new_urls' is not defined

class UrlManager(object):? ? ? ? ? ?##url管理器，用于添加但不重復(fù)url，下面有四種

? ? def __init__(self):

? ? ? ? self.new_urls=set()

? ? ? ? self.old_urls=set()

? ??

? ? def add_new_url(self,url):? ? ? ? ? ?##向管理器中添加一個(gè)新的URL，單個(gè)添加

? ? ? ? if url is None:

? ? ? ? ? ? return

? ? ? ? if url not in new_urls and url not in old_urls:? ##判斷一個(gè)url既不在待爬去url群里也不在已經(jīng)爬過的群，那么就把它加進(jìn)待爬去

? ? ? ? ? ? self.new_urls.add(url)

? ? ? ??

? ? def add_new_urls(self,urls):? ? ? ? ? ? ? ##向管理器中添加一批的URL ，批量添加

? ? ? ? if urls is None or len(urls)==0:

? ? ? ? ? ? return

? ? ? ? for url in urls:

? ? ? ? ? ? self.add_new_url(url)

? ? ? ??

? ? def has_new_url(self):? ? ? ? ? ##判斷是否有新的待爬取得url

? ? ? ? return len(self.new_urls)!=0? ? ? ? ?##new_urls長(zhǎng)度不為0，說明有待爬去的url

? ??

? ? def get_new_url(self):? ? ? ? ? ##從url管理器中獲取一個(gè)url

? ? ? ? new_url=self.new_urls.pop()? ? ? ? ##pop獲取一個(gè)的同時(shí)移除這個(gè)url，

? ? ? ? self.old_urls.add(new_url)

? ? ? ? return new_url

import urllib.request

class HtmlDownloader(object):

? ? def download(self,url):

? ? ? ? if url is None:

? ? ? ? ? ? return None

? ? ? ? response=urllib.request.urlopen(url)? ? ? ? ?##這里使用了最簡(jiǎn)單的urllib下載方法，如果要用cookie則需要修改

? ? ? ? if response.getcode()!=200:

? ? ? ? ? ? return None

? ? ? ? return response.read()

from bs4 import BeautifulSoup

import re

##urllib.parse模塊主要是把url拆分為6部分，并返回元組。并且可以把拆分后的部分再組成一個(gè)url。主要有函數(shù)有urljoin、urlsplit、urlunsplit、urlparse等。

import urllib.parse? ? ? ? ? ??

class HtmlParser(object):

? ? def _get_new_urls(self,page_url,soup):

? ? ? ? new_urls=set()? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?##結(jié)果存在urls列表里

? ? ? ? links=soup.find_all('a',href=re.compile(r"/doc/\d+\.html"))? ? ? ##"\d+\"用來指代數(shù)字串,re.compile實(shí)現(xiàn)正則模糊匹配

? ? ? ? for link in links:

? ? ? ? ? ? new_url=link['href']

? ? ? ? ? ? new_full_url=urllib.parse.urljoin(page_url,new_url)? ? ? ? ?##new_url按照page_url的格式拼接成全新的url

? ? ? ? ? ? new_urls.add(new_full_url)

? ? ? ? return new_urls

? ? def _get_new_data(self,page_url,soup):

? ? ? ? res_data={}? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?##存放數(shù)據(jù)

? ? ? ? res_data['url']=page_url? ? ? ? ? ? ? ? ? ?##url也放入最終數(shù)據(jù)中方便使用

? ? ? ? ##標(biāo)題右鍵查看元素，然后右鍵edit as html得到<h1><span class="title">Python，這個(gè)用于確認(rèn)find內(nèi)的內(nèi)容

? ? ? ? title_node=soup.find('span',class_="title").find("h1")

? ? ? ? res_data['title']=title_node.get_text()? ? ? ? ? ? ? ##獲取標(biāo)題數(shù)據(jù)

? ? ? ? ##獲取方法同上得到<div class="card_content" id="js-card-content"><p>

? ? ? ? summary_node=soup.find('div',class_="card_content" )

? ? ? ? res_data['summary']=summary_node.get_text()? ? ? ? ? ? ##獲取內(nèi)容數(shù)據(jù)

? ? ? ? return res_data

? ? ? ??

? ? def parser(self,page_url,html_cont):

? ? ? ? if page_url is None or html_cont is None:

? ? ? ? ? ? return

? ? ? ? soup=BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8')

? ? ? ? new_urls=self._get_new_urls(page_url,soup)

? ? ? ? new_data=self._get_new_data(page_url,soup)

? ? ? ? return new_urls,new_data

class HtmlOutputer(object):? ? ? ? ? ? ? ? ? ? ?##創(chuàng)建輸出器的模塊

? ? def __init__(self):

? ? ? ? self.datas=[]

? ? def collect_data(self,data):? ? ? ? ? ? ? ?##收集數(shù)據(jù)

? ? ? ? if data is None:

? ? ? ? ? ? return

? ? ? ? self.datas.append(data)

? ? def output_html(self):? ? ? ? ? ? ? ? ? ##輸出數(shù)據(jù)寫出到html文件中

? ? ? ? fout=open('output.html','w')? ? ? ? ##建立文件輸出對(duì)象，output.html是輸出文件的命名，w是模式

? ? ? ??

? ? ? ? fout.write("<html>")? ? ? ? ? ? ? ? ?##開始標(biāo)簽

? ? ? ? fout.write("<body>")

? ? ? ? fout.write("<table>")

? ? ? ??

? ? ? ? for data in self.datas:

? ? ? ? ? ? fout.write("<tr>")? ? ? ? ? ? ? ##寫一個(gè)行的開始標(biāo)簽

? ? ? ? ? ? fout.write("<td>%s</td>"% data ['url'])? ? ? ? ? ? ?##單元格的內(nèi)容

? ? ? ? ? ? fout.write("<td>%s</td>"% data ['title'].encode('utf-8'))? ##python默認(rèn)模式是ascii，如果輸出utf-8，就末尾加上.encode('utf-8')

? ? ? ? ? ? fout.write("<td>%s</td>"% data ['summary'].encode('utf-8'))? ?##如果沒有.encode('utf-8')，有些中文可能識(shí)別成亂碼

? ? ? ? ? ? fout.write("</tr>")

? ? ? ? ? ??

? ? ? ? fout.write("</table>")? ? ? ? ? ? ##跟上面相對(duì)應(yīng)的閉合標(biāo)簽

? ? ? ? fout.write("</body>")

? ? ? ? fout.write("</html>")

? ? ? ? fout.close()

class SpiderMain(object):? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ##創(chuàng)建SpiderMain總調(diào)度程序和入口程序

? ? def __init__(self):? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ##初始化UrlManager、HtmlDownloader等各個(gè)模塊

? ? ? ? self.urls=UrlManager()? ? ? ? ? ? ? ? ?##url管理器? ?

? ? ? ? self.downloader=HtmlDownloader()? ? ##下載器? ? ?

? ? ? ? self.parser=HtmlParser()? ? ? ? ? ? ?##解析器

? ? ? ? self.outpouter=HtmlOutputer()? ? ? ##輸出器

? ? ? ??

? ? def craw(self,root_url):? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?##爬蟲調(diào)度程序

? ? ? ? count=1? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ##記錄當(dāng)前次數(shù)

? ? ? ? self.urls.add_new_url(root_url)? ? ? ? ? ? ? ? ? ? ? ? ##入口url添加進(jìn)url管理器

? ? ? ? while self.urls.has_new_url():? ? ? ? ? ? ? ? ? ? ? ? ?##有帶爬取得url時(shí)，啟動(dòng)爬蟲循環(huán)

? ? ? ? ? ? try:

? ? ? ? ? ? ? ? new_url=self.urls.get_new_url()? ? ? ? ? ? ? ? ? ? ?##獲取一個(gè)url

? ? ? ? ? ? ? ? print('craw %d : %s' % (count,new_url))? ? ? ? ? ? ? ? ? ? ?##

? ? ? ? ? ? ? ? html_cont=self.downloader.download(new_url)? ? ? ? ? ? ? ? ? ? ##啟動(dòng)下載器下載頁面

? ? ? ? ? ? ? ? new_urls,new_data=self.parser.parse(new_url,html_cont)? ? ? ? ? ##解析頁面數(shù)據(jù)，得到新的url列表和新的數(shù)據(jù)

? ? ? ? ? ? ? ? self.urls.add_new_urls(new_urls)? ? ? ? ? ? ? ? ? ? ? ? ? ? ##url添加進(jìn)url管理器，這里是urls添加了批量url，上面是單個(gè)url

? ? ? ? ? ? ? ? self.outputer.collect_data(new_data)? ? ? ? ? ? ? ?## 收集數(shù)據(jù)

? ? ? ? ? ? ? ? if count==1000:? ? ? ? ? ? ?##我們?cè)O(shè)置爬取1000個(gè)頁面，達(dá)到1000個(gè)時(shí)就停止

? ? ? ? ? ? ? ? ? ? break

? ? ? ? ? ? ? ? count+=1

? ? ? ? ? ? except:? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ##有些url已經(jīng)無法訪問或無摘要數(shù)據(jù)，所以進(jìn)行異常處理

? ? ? ? ? ? ? ? print("craw failed")

? ? ? ? ?

? ? ? ? self.outputer.output_html()? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ##輸出收集好的數(shù)據(jù)

if __name__ =="__main__":? ? ?##編寫main函數(shù)，雙下滑線

? ? root_url="https://baike.so.com/doc/1790119-1892991.html"? ? ?##設(shè)置入口url,就是python360百科網(wǎng)站字符

? ? obj_spider=SpiderMain()? ? ? ? ? ? ?##SpiderMain是總調(diào)度程序，前面創(chuàng)建了

? ? obj_spider.craw(root_url)? ? ? ? ? ? ##用spider啟動(dòng)爬蟲

慕雪919678

2018-11-01

源自：Python開發(fā)簡(jiǎn)單爬蟲 7-7

關(guān)注問題我要回答

4071

操作

收起

1 回答

Grace清風(fēng)
2019-07-27

把craw下面的代碼對(duì)齊試試，其他該對(duì)齊的也對(duì)齊，我的就經(jīng)常因?yàn)閷?duì)不齊報(bào)錯(cuò)，個(gè)人建議，希望有用。

0 回復(fù) 有任何疑惑可以回復(fù)我~

收起回答

舉報(bào)

0/150

提交

取消

Python開發(fā)簡(jiǎn)單爬蟲

參與學(xué)習(xí) 227603 人
解答問題 1282 個(gè)

本教程帶您解開python爬蟲這門神奇技術(shù)的面紗

進(jìn)入課程

我一點(diǎn)點(diǎn)查代碼沒問題，結(jié)果還是NameError: name 'new_urls' is not defined

我要回答關(guān)注問題

使用 Ctrl+D 可將網(wǎng)站添加到書簽

微信客服

購(gòu)課補(bǔ)貼
聯(lián)系客服咨詢優(yōu)惠詳情

幫助反饋 APP下載

慕課網(wǎng)APP
您的移動(dòng)學(xué)習(xí)伙伴

公眾號(hào)

掃描二維碼
關(guān)注慕課網(wǎng)微信公眾號(hào)

第七色在线视频,2021少妇久久久久久久久久,亚洲欧洲精品成人久久av18,亚洲国产精品特色大片观看完整版,孙宇晨将参加特朗普的晚宴

熱搜

最近搜索清空

我一點(diǎn)點(diǎn)查代碼沒問題，結(jié)果還是NameError: name 'new_urls' is not defined

1 回答

我一點(diǎn)點(diǎn)查代碼沒問題，結(jié)果還是NameError: name 'new_urls' is not defined

我一點(diǎn)點(diǎn)查代碼沒問題，結(jié)果還是NameError: name 'new_urls' is not defined

我一點(diǎn)點(diǎn)查代碼沒問題，結(jié)果還是NameError: name 'new_urls' is not defined