爬網頁表格,想要把表格最后一欄url以超鏈接形式寫入EXCEL,要怎么做呢?提取的url寫入excel是文本格式。
獲取網頁urls,寫入link_data列表:
links = soup.find_all('a', href=re.compile(r"/permitExt/xkgkAction!xkgk"))
for link in links:
    link_data.append(urlparse.urljoin(root_url, link['href']))
網頁url寫入excel:
for i in range(2, len(data)+2):
    sheet.cell(row=i, column=9, value=(link_data[i-2]))
整體代碼如下。爬取的數據、鏈接沒有問題,然而excel中的鏈接是文本格式,不是超鏈接。
# coding=utf-8
import re
import urllib.error
import urllib.parse as urlparse
import urllib.request

from bs4 import BeautifulSoup
from openpyxl import *
def CementSpider(end_page):
    """Scrape the paginated permit-publicity listing and collect rows and links.

    Pages ``1..end_page`` of the listing are fetched one by one. For each
    page, every ``<tr>`` inside ``<div class="tb-con">`` except the first
    one is turned into a list holding ``.string`` of each direct child of
    the row, and every ``<a>`` whose ``href`` matches
    ``/permitExt/xkgkAction!xkgk`` is resolved against the site root.

    :param end_page: number of the last listing page to fetch (inclusive).
    :return: ``(data, link_data)``. ``data`` is a list of row lists;
        ``link_data`` is a flat list of absolute URLs in page order.
    """
    data = []  # text of the table rows, one list per <tr>
    link_data = []  # absolute detail-page URLs
    root_url = 'http://permit.mep.gov.cn/'
    # Both values are identical for every page, so build them once.
    search = ("&enterName=&publishtime=&province=&city=&treadcode=C301,C3011,C3021&treadname="
              + urllib.request.quote("水泥、石灰和石膏制造,水泥制造,水泥制品制造"))
    link_pattern = re.compile(r"/permitExt/xkgkAction!xkgk")

    for page_num in range(1, end_page + 1):
        url = 'http://permit.mep.gov.cn/permitExt/outside/Publicity?pageno=%s' % page_num
        try:
            # The context manager closes the HTTP response even if read() fails.
            with urllib.request.urlopen(url + search) as resp:
                status = resp.getcode()
                html_doc = resp.read()
        except urllib.error.URLError:
            # urlopen raises for HTTP error statuses and network failures;
            # report the page as failed and keep going with the next one.
            print("獲取第%s頁(yè)鏈接失敗/(ㄒoㄒ)/~~" % page_num)
            continue
        if status == 200:
            print("獲取第%s頁(yè)鏈接成功!" % page_num)
        else:
            print("獲取第%s頁(yè)鏈接失敗/(ㄒoㄒ)/~~" % page_num)

        soup = BeautifulSoup(html_doc, "html.parser")

        # <div class="tb-con">
        table_div = soup.find('div', class_="tb-con")
        if table_div is None:
            print("第%s頁(yè)沒有數(shù)據(jù)啊/(ㄒoㄒ)/~~" % page_num)
        else:
            # Iterating a <tr> yields ALL direct children, not only <td>
            # tags, so each row keeps whatever text nodes sit between cells.
            # NOTE(review): write_Excel reads the odd indices of each row,
            # presumably because whitespace nodes alternate with the cells.
            page_rows = [[child.string for child in tr]
                         for tr in table_div.find_all('tr')]
            data += page_rows[1:]  # the first row is skipped (header row)

        # <td class="bgcolor1">
        links = soup.find_all('a', href=link_pattern)
        if not links:
            print("第%s頁(yè)沒有詳細(xì)數(shù)據(jù)鏈接啊/(ㄒoㄒ)/~~" % page_num)
        # NOTE(review): links are collected page-wide, not per row, so
        # link_data lines up with data only if every row has exactly one link.
        link_data.extend(urlparse.urljoin(root_url, link['href']) for link in links)

    return data, link_data
def write_Excel(data, link_data):
    """Write the scraped rows to ``水泥制造.xlsx`` with clickable detail links.

    Row 1 holds the nine column headers. Each entry of ``data`` becomes one
    worksheet row: column 1 is a running number starting at 1, columns 2-8
    come from the odd indices 1, 3, ..., 13 of the entry, and column 9
    receives the matching URL as a real hyperlink (not plain text).

    :param data: list of row lists as produced by ``CementSpider``.
    :param link_data: list of URLs, indexed by the same position as ``data``.
    :return: None. The workbook is saved to the current working directory.
    """
    headers = ['編號(hào)', '省/直轄市', '地市', '許可證編號(hào)', '單位名稱',
               '行業(yè)類別', '有效期限', '發(fā)證日期', '查看鏈接']

    wb = Workbook()
    sheet = wb.active
    sheet.title = "sheet1"  # final output sheet
    for col, title in enumerate(headers, start=1):
        sheet.cell(row=1, column=col, value=title)

    for offset, row in enumerate(data):
        excel_row = offset + 2  # row 1 is the header
        sheet.cell(row=excel_row, column=1, value=offset + 1)

        # A link is written only when the 17th child of the row is '\n'.
        # The length checks keep short rows and a short link list from
        # raising IndexError.
        has_link_slot = len(row) > 16 and row[16] == '\n'
        if has_link_slot and offset < len(link_data):
            target = link_data[offset]
            link_cell = sheet.cell(row=excel_row, column=9, value=target)
            # Setting .hyperlink makes the cell clickable in Excel; the
            # built-in "Hyperlink" style gives it the usual blue underline.
            link_cell.hyperlink = target
            link_cell.style = "Hyperlink"

        for j in range(1, 8):
            try:
                value = row[2 * j - 1]
            except IndexError:
                print('該表格某處數(shù)據(jù)空白/(ㄒoㄒ)/~~')
                continue
            sheet.cell(row=excel_row, column=j + 1, value=value)

    wb.save("水泥制造.xlsx")  # save the workbook
def end_page():
    """Return the last page number of the listing.

    Fetches the first page of the filtered listing and returns the first
    integer found in the text of its ``<div class="page">`` element.

    :return: the page count as an ``int``.
    :raises ValueError: if the pager element is missing or has no digits.
    """
    url = 'http://permit.mep.gov.cn/permitExt/outside/Publicity?&enterName=&province=&city=&treadcode=C301,C3011,C3021&treadname=%E6%B0%B4%E6%B3%A5%E3%80%81%E7%9F%B3%E7%81%B0%E5%92%8C%E7%9F%B3%E8%86%8F%E5%88%B6%E9%80%A0,%E6%B0%B4%E6%B3%A5%E5%88%B6%E9%80%A0,%E6%B0%B4%E6%B3%A5%E5%88%B6%E5%93%81%E5%88%B6%E9%80%A0'
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        html_page = response.read()

    soup_page = BeautifulSoup(html_page, "html.parser")
    page_node = soup_page.find('div', class_='page')
    if page_node is None:
        raise ValueError('pager element <div class="page"> not found')

    numbers = re.findall(r'\d+', page_node.get_text())
    if not numbers:
        raise ValueError('no page number found in the pager text')
    return int(numbers[0])
if __name__ == "__main__":
    # Use a separate name for the page count: assigning to ``end_page``
    # would rebind the module-level function to an int.
    last_page = end_page()
    data, link_data = CementSpider(last_page)
    write_Excel(data, link_data)
爬蟲獲得的urls以超鏈接形式寫入EXCEL?
倉頡小米
2018-01-16 14:05:46