課程
                    
                        /後端開發
                        
                            /Python
                        
                        /Python開發簡單爬蟲

爬取news.baidu.com所有圖片

我為什么只爬取到了四張圖片？看著頁面上是有好多圖片的。

import re

import urllib

from bs4 import BeautifulSoup

def getHtmlContent(url):
    """Open *url* and return the response object for the page.

    The return value is the open, file-like HTTP response (not the decoded
    text); the caller reads from it and is responsible for closing it.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The file-like response object produced by ``urlopen``.
    """
    # ``urllib.urlopen`` only exists on Python 2; on Python 3 the function
    # lives in ``urllib.request``.  Resolve it locally so this works on both.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    print('獲取網頁內容')
    page = urlopen(url)
    return page

def getJPGs(html):
    """Collect the downloadable image URLs referenced by ``<img>`` tags.

    Only ``<img>`` elements present in the parsed markup whose ``src``
    contains a dot are considered; images referenced through other
    attributes or added to the page by scripts are not seen here.

    Args:
        html: HTML markup (string or file-like object) to parse.

    Returns:
        A list of absolute ``http``/``https`` image URLs, in document order.
        Protocol-relative sources (``//host/path``) get an ``http:`` prefix.
        Sources that cannot be made absolute without knowing the page URL
        (relative paths, ``data:`` URIs) are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    print('獲取所有jpg圖片')
    jpgs = soup.find_all('img', src=re.compile(r'.+\..+'))
    newJpgs = []
    print('遍歷jpgs的url,如下：')
    for jpg in jpgs:
        src = jpg['src']
        print(src + ' \n')
        if src.lower().startswith(('http://', 'https://')):
            newJpgs.append(src)
        elif src.startswith('//'):
            # Protocol-relative URL: only the scheme is missing.
            print('匹配http://，若無追加http:')
            print('http:' + src)
            newJpgs.append('http:' + src)
        else:
            # Prefixing "http:" to a relative path or a data: URI would
            # produce an invalid URL, so leave it out instead.
            print('跳過無法解析的圖片地址：')
            print(src)
    print('newJpgs的集合,如下：')
    print(newJpgs)
    return newJpgs

def downloadJPGs(imgUrl, fileName):
    """Download one image and save it under the given file name.

    Args:
        imgUrl: Absolute URL of the image.
        fileName: Path of the local file to write.
    """
    # ``urllib.urlretrieve`` only exists on Python 2; on Python 3 it lives
    # in ``urllib.request``.  Resolve it locally so this works on both.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    print(imgUrl)
    print(fileName)
    urlretrieve(imgUrl, fileName)

def batchDownLoadJPGs(imgUrls, fileName='F:/python/baidu_news_jpg/'):
    """Download every image in *imgUrls* to numbered local ``.jpg`` files.

    Files are named ``<fileName>1.jpg``, ``<fileName>2.jpg``, ... so
    *fileName* acts as a path prefix (normally a directory ending in ``/``).

    Args:
        imgUrls: List of absolute image URLs.
        fileName: Path prefix for the saved files; its directory part is
            created when it does not exist yet.
    """
    import os

    if not imgUrls:
        return

    # Without the directory the first download would fail on opening the file.
    targetDir = os.path.dirname(fileName)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    for count, imgUrl in enumerate(imgUrls, 1):
        # Announce the image before fetching it so the progress text is accurate.
        print(' '.join(['正在下載第', str(count), '張JPG格式圖片   ']))
        downloadJPGs(imgUrl, ''.join([fileName, '{0}.jpg'.format(count)]))
    print('html圖片全部下載完成')

def download(url):
    """Fetch the page at *url* and download every image it references.

    Args:
        url: Absolute URL of the page to scan for images.
    """
    html = getHtmlContent(url)
    try:
        imgUrls = getJPGs(html)
    finally:
        # getHtmlContent returns the open response object; release the
        # connection as soon as the markup has been parsed.
        html.close()
    batchDownLoadJPGs(imgUrls)

def main(url='http://news.baidu.com'):
    """Download all images referenced by the page at *url*.

    Args:
        url: Page to scan; defaults to the Baidu news front page.
    """
    # Other pages that were tried with this script:
    #   http://ent.ifeng.com/a/20180911/43100024_0.shtml?_zbs_baidu_news#p=1
    #   http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%202.7%20%20format
    download(url)

# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

慕前端0369976

2018-09-12

源自：Python開發簡單爬蟲

關注問題我要回答

746

操作

收起

1 回答

幕布斯5041536
2018-09-14

from gevent import monkey
# NOTE(review): patch_all() is called before the remaining imports, which is
# presumably intentional so that they pick up gevent's patched modules —
# confirm against the gevent documentation before reordering.
monkey.patch_all()
import urllib.request
import gevent
import re
import os

# Decoded text of the most recently fetched page; written by func().
data = None

def func(url):
    """Fetch *url* and store the decoded page text in the global ``data``.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the fetch failed (in which
        case ``data`` is left untouched).
    """
    global data
    try:
        print(url)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(url) as response:
            print(response)
            raw = response.read()
            # Honour the charset declared by the server; fall back to UTF-8.
            charset = response.headers.get_content_charset() or "utf-8"
        print(len(raw))
        try:
            text = raw.decode(charset, errors="replace")
        except LookupError:
            # The server announced an encoding Python does not know.
            text = raw.decode("utf-8", errors="replace")
        data = text
        return text
    except Exception as e:
        # Best-effort fetch: report the reason instead of hiding it.
        print("查看異常", e)
        return None

def download_file(img_url, img_name, save_dir="目錄"):
    """Download one image to ``<save_dir>/<img_name>.jpg``.

    Failures are reported on stdout and do not raise, so a batch of
    downloads can continue past a broken URL.

    Args:
        img_url: Absolute URL of the image.
        img_name: File name to save under, without the ``.jpg`` extension.
        save_dir: Directory to save into; created when missing.
    """
    try:
        print(img_url)
        # Open the URL first so no directory or file is created for a bad URL.
        with urllib.request.urlopen(img_url) as response:
            os.makedirs(save_dir, exist_ok=True)
            with open(os.path.join(save_dir, img_name + ".jpg"), "wb") as file:
                # Copy in chunks so a large image is never held fully in memory.
                while True:
                    img_data = response.read(64 * 1024)
                    if not img_data:
                        break
                    file.write(img_data)
        print("下載完成")
    except Exception as e:
        # Best-effort download: report the reason instead of hiding it.
        print("下載異常", e)



if __name__ == '__main__':
    # urlopen() rejects URLs without a scheme, so it must be spelled out.
    url = "http://news.baidu.com"
    g1 = gevent.spawn(func, url)
    g1.join()
    print(data)
    # func() leaves data as None when the fetch fails; search an empty string
    # then.  The non-greedy, quote-/space-free pattern keeps each match to a
    # single URL instead of spanning from the first "http" to the last ".jpg".
    result = re.findall(r"https?://[^\s\"'<>]+?\.jpg", data or "")

    for index, img_url in enumerate(result):
        # Show the file name part of the URL being downloaded.
        print(img_url.rsplit("/", 1)[-1])
        download_file(img_url, str(index))