import json

import requests
from lxml import html

# Reach lxml's etree module through lxml.html; the spider below calls
# etree.HTML() on the downloaded page bytes.
etree = html.etree
class BaoxiaoSpider:
    """Scrape the post text from the listing pages of www.2bmv.com.

    The pipeline is: build the page URLs, download each page, extract the
    post items with XPath, and hand the items of that page to the save step.
    """

    def __init__(self):
        # Listing-page URL template; the placeholder receives the page index.
        self.url_temp = "http://www.2bmv.com/index_{}.html"
        # Browser-like User-Agent header sent with every request.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}

    def get_url_list(self):
        """Return the list of listing-page URLs to crawl."""
        # NOTE(review): the original comment speaks of 50 pages in total, but
        # only indexes 0, 1 and 2 are generated here -- confirm the intended
        # range (and whether index_0.html exists on the site).
        return [self.url_temp.format(i) for i in range(3)]

    def parse_url(self, url):
        """Send a GET request to ``url`` and return the raw response body (bytes)."""
        # A timeout keeps the crawl from hanging forever on an unresponsive server.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content

    def get_content_list(self, html_str):
        """Extract one item dict per post from a downloaded page.

        ``html_str`` is the page content as returned by ``parse_url``.
        Returns a list of dicts; each dict has a ``"text"`` key holding the
        list of text nodes found in the post's paragraphs.
        """
        document = etree.HTML(html_str)
        # Group by post container. The expression must start with "//"; the
        # previous "http://div[...]" form was not a valid XPath location path.
        div_list = document.xpath("//div[@class='post']")
        content_list = []
        for div in div_list:
            # Single-valued fields (e.g. head_portrait) take the first match or
            # None; multi-valued fields (e.g. text) keep the full match list.
            item = {}
            # Publisher avatar (disabled):
            # item["head_portrait"] = div.xpath("./p/img/@src")[0] if len(div.xpath("./p/img/@src"))>0 else None
            # Publish time and view count (disabled):
            # item["time"] = div.xpath("./div/div/text()")[0] if len(div.xpath("./div/text()"))>0 else None
            # Text nodes of the <p> tags inside the post body.
            item["text"] = div.xpath(".//div[contains(@class,'pic_text')]/p/text()")
            # Images (disabled):
            # item["img"] = div.xpath("./div/div/center/img/@src")
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        """Output the extracted items; currently each item is printed."""
        for i in content_list:
            print(i)

    def run(self):
        """Run the whole pipeline for every listing page."""
        # 1. Build the URL list.
        url_list = self.get_url_list()
        for url in url_list:
            # 2. Send the request and get the response body.
            html_str = self.parse_url(url)
            # 3. Extract the data. This has to happen inside the loop;
            #    otherwise only the last downloaded page is ever parsed.
            content_list = self.get_content_list(html_str)
            # 4. Save the data of this page.
            self.save_content_list(content_list)
if __name__ == '__main__':
    baoxiao = BaoxiaoSpider()
    baoxiao.run()

# Question from the original post (translated):
# The site being crawled is http://www.2bmv.com/index_1.html.
# When queried with the XPath Helper tool, the same path matches a lot of
# content, but the crawler returns very little. I hope the teacher and the
# experts here can help clear up this confusion.
- 2 回答
- 0 關注
- 1035 瀏覽
添加回答
舉報
0/150
提交
取消