2 Answers

The following implementation achieves this in 16 seconds. To speed up execution, I did the following:

- Removed Selenium entirely (no clicking is needed)
- For the abstract, used BeautifulSoup's output directly and processed it afterwards
- Added multiprocessing to speed the process up significantly
from multiprocessing import Process, Manager
import requests
from bs4 import BeautifulSoup
import re
import time

start_time = time.time()

def get_no_of_pages(showing_text):
    # Parse the result count out of text like "Showing 1-200 of 1,890 results for all".
    no_of_results = int(re.findall(r"(\d+,*\d+) results for all", showing_text)[0].replace(',', ''))
    pages = no_of_results // 200 + 1
    print("total pages:", pages)
    return pages

def clean(text):
    # Strip newlines and the double spaces left over from the HTML layout.
    return text.replace("\n", '').replace("  ", '')

def get_data_from_page(url, page_number, data):
    print("getting page", page_number)
    response = requests.get(url + "start=" + str(page_number * 200))
    soup = BeautifulSoup(response.content, "lxml")

    arxiv_results = soup.find_all("li", {"class": "arxiv-result"})
    for arxiv_result in arxiv_results:
        paper = {}
        paper["titles"] = clean(arxiv_result.find("p", {"class": "title is-5 mathjax"}).text)
        links = arxiv_result.find_all("a")
        paper["arxiv_ids"] = links[0].text.replace('arXiv:', '')
        paper["arxiv_links"] = links[0].get('href')
        paper["pdf_link"] = links[1].get('href')
        paper["authors"] = clean(arxiv_result.find("p", {"class": "authors"}).text.replace('Authors:', ''))
        # The full abstract is already in the HTML; the "▽ More" marker just
        # separates the teaser from the rest, so no clicking is required.
        split_abstract = arxiv_result.find("p", {"class": "abstract mathjax"}).text.split("▽ More\n\n\n", 1)
        if len(split_abstract) == 2:
            paper["abstract"] = clean(split_abstract[1].replace("△ Less", ''))
        else:
            paper["abstract"] = clean(split_abstract[0].replace("△ Less", ''))
        paper["date"] = re.split(r"Submitted|;", arxiv_result.find("p", {"class": "is-size-7"}).text)[1]
        paper["tag"] = clean(arxiv_result.find("div", {"class": "tags is-inline-block"}).text)
        doi = arxiv_result.find("div", {"class": "tags has-addons"})
        if doi is None:
            paper["doi"] = "None"
        else:
            paper["doi"] = re.split(r'\s', doi.text)[1]
        data.append(paper)

    print(f"page {page_number} done")

if __name__ == "__main__":
    url = 'https://arxiv.org/search/?searchtype=all&query=healthcare&abstracts=show&size=200&order=-announced_date_first&'
    response = requests.get(url + "start=0")
    soup = BeautifulSoup(response.content, "lxml")
    with Manager() as manager:
        data = manager.list()  # shared list so the worker processes can append to it
        processes = []
        get_data_from_page(url, 0, data)
        showing_text = soup.find("h1", {"class": "title is-clearfix"}).text
        for i in range(1, get_no_of_pages(showing_text)):
            p = Process(target=get_data_from_page, args=(url, i, data))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        print("Number of entries scraped:", len(data))
        stop_time = time.time()
        print("Time taken:", stop_time - start_time, "seconds")
Output:
>>> python test.py
getting page 0
page 0 done
total pages: 10
getting page 1
getting page 4
getting page 2
getting page 6
getting page 5
getting page 3
getting page 7
getting page 9
getting page 8
page 9 done
page 4 done
page 1 done
page 6 done
page 2 done
page 7 done
page 3 done
page 5 done
page 8 done
Number of entries scraped: 1890
Time taken: 15.911492586135864 seconds
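If you want to keep the results rather than just time the run, the shared data list can be copied out and written to CSV. A minimal sketch, assuming a papers.csv output file and that it is placed inside the "with Manager() as manager:" block after the joins:

import csv

rows = list(data)  # copy the entries out of the manager proxy
if rows:
    with open("papers.csv", "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)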

You can try the Beautiful Soup approach below, depending on your requirements. There is no need to click the "More" link.
from requests import get
from bs4 import BeautifulSoup

# You can change the size parameter to retrieve more results in one shot.
url = 'https://arxiv.org/search/?query=healthcare&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start=0'
response = get(url, verify=False)  # verify=False skips TLS certificate checks
soup = BeautifulSoup(response.content, "lxml")

queryresults = soup.find_all("li", attrs={"class": "arxiv-result"})
for result in queryresults:
    title = result.find("p", attrs={"class": "title is-5 mathjax"})
    print(title.text)

# If you need the full abstract content, try this (you do not need to click
# the "More" button; the full text is already present in the page).
for result in queryresults:
    abstractFullContent = result.find("span", attrs={"class": "abstract-full has-text-grey-dark mathjax"})
    print(abstractFullContent.text)
Output:
Interpretable Deep Learning for Automatic Diagnosis of 12-lead Electrocardiogram
Leveraging Technology for Healthcare and Retaining Access to Personal Health Data to Enhance Personal Health and Well-being
Towards new forms of particle sensing and manipulation and 3D imaging on a smartphone for healthcare applications
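Note that the arXiv search page only offers page sizes up to 200, so retrieving "all the results at one shot" is capped; beyond that you still have to step through the start parameter. A minimal sequential sketch under that assumption (the multiprocessing answer above does the same thing in parallel):

from requests import get
from bs4 import BeautifulSoup

# Walk every result page by stepping "start" in increments of the page size.
base = ('https://arxiv.org/search/?query=healthcare&searchtype=all'
        '&abstracts=show&order=-announced_date_first&size=200&start=')
all_results = []
start = 0
while True:
    page_soup = BeautifulSoup(get(base + str(start)).content, "lxml")
    results = page_soup.find_all("li", attrs={"class": "arxiv-result"})
    if not results:  # an empty page means we are past the last one
        break
    all_results.extend(results)
    start += 200
print(len(all_results), "results collected")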