1 回答

TA貢獻1799條經驗 獲得超9個贊
您可能應該嘗試重用解析庫,而不是自己解析這些部分。考慮這種方法:
from bs4 import BeautifulSoup
import re
# Collect the image URLs of every story on a saved news page.
#
# Each tag specification is a list of
# [tag name, attribute filter, (optional) attribute whose value is extracted].
# NOTE(review): the {"": ""} filter appears to be meant as "no attribute
# constraint" -- confirm it behaves that way with the installed Beautiful Soup.
root_tag = ["article", {"class": "story"}]
image_tag = ["img", {"": ""}, "org-src"]
# The three specifications below are not used by the code that follows.
header = ["h3", {"class": "story-title"}]
news_tag = ["a", {"": ""}, "href"]
txt_data = ["p", {"": ""}]

# One-off download step that produces the local 'output' file read below.
# Left disabled so the page is not fetched again on every run.
# import requests
# ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
# ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
# headers = {'User-Agent': ua2,
# 'Accept': 'text/html,application/xhtml+xml,application/xml;' \
# 'q=0.9,image/webp,*/*;q=0.8'}
# session = requests.Session()
# response = session.get("https://www.reuters.com/energy-environment", headers=headers)
# webContent = response.content
# file = open('output', 'wb')
# file.write(webContent)
# file.close()

# Read the saved page as raw bytes (the download step writes response.content
# in binary mode) so Beautiful Soup can work out the document encoding itself
# instead of depending on the platform's default text encoding.  The context
# manager guarantees the handle is closed.
with open('output', 'rb') as file:
    webContent = file.read()

bs = BeautifulSoup(webContent, 'html.parser')
all_tab_data = bs.find_all(*root_tag)

# Split the image specification so that the attribute name is not passed
# positionally into find_all(), where the third parameter is `recursive`.
image_name, image_attrs, image_src_attr = image_tag

output = []
for div in all_tab_data:
    # Search the already-parsed article element directly; serialising it and
    # parsing it a second time is unnecessary.
    for img in div.find_all(image_name, image_attrs):
        src = img.get(image_src_attr)
        # Skip images that do not carry the source attribute.
        if src is not None:
            output.append(src)
添加回答
舉報