2 回答

TA貢獻1752條經驗 獲得超4個贊
這可能有效。我會評論我在做什么和你對我在做什么的理解的一點代碼。
import scrapy
import pandas as pd
from ..items import latlonglocItem
# Load the wine dataset and keep only the rows that have a place name.
df = pd.read_csv('wine_df_final.csv')
has_place = df['real_place'].notna()
df = df.loc[has_place]
# De-duplicate the place names; the resulting list order is arbitrary
# because it comes from iterating a set.
unique_places = set(df['real_place'])
real_place = list(unique_places)
class latlonglocSpider(scrapy.Spider):
    """Spider that searches Google for each place and follows distancesto links.

    For every entry in the module-level ``real_place`` list a Google search
    URL is generated. ``parse`` handles the search result page and
    ``parse_distancesto`` handles the followed distancesto page.
    """

    name = 'latlonglocs'

    # One search URL per place: commas are dropped and spaces become '+'.
    start_urls = [
        'http://www.google.com/search?q='
        f"{place.replace(',', '').replace(' ', '+')}"
        '+coordinates+latitude+longitude+distancesto'
        for place in real_place
    ]

    # Item field -> XPath evaluated against the distancesto page.
    # NOTE(review): these absolute paths (including ``tbody``) are tied to
    # the page markup at the time of writing -- confirm against a raw response.
    _FIELD_XPATHS = {
        'appellation': 'string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)',
        'latitude': 'string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)',
        'longitude': 'string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)',
        'elevation': 'string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)',
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider, forwarding any arguments to ``scrapy.Spider``."""
        # Forward everything to the parent so the spider can still be
        # constructed when it is given positional or keyword arguments.
        super().__init__(*args, **kwargs)
        # Last base name seen by parse(); kept only as a fallback, since the
        # per-request value travels in Request.meta.
        self.base_name = None

    def parse(self, response):
        """Handle a search result page.

        Yields a request for every result link that points at distancesto,
        followed by an item carrying only ``base_name`` (the page title up to
        ' coordinates').
        """
        items = latlonglocItem()
        base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        items['base_name'] = base_name
        self.base_name = base_name

        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                # Attach base_name to the request itself so the callback gets
                # the value belonging to *this* search page even when several
                # requests are in flight at once.
                yield response.follow(
                    href,
                    self.parse_distancesto,
                    meta={'base_name': base_name},
                )
        yield items

    def parse_distancesto(self, response):
        """Handle a distancesto page and yield one populated item.

        ``base_name`` is taken from the request meta, falling back to
        ``self.base_name`` and finally to an empty string. If extraction
        raises, the error is logged and nothing is yielded.
        """
        items = latlonglocItem()
        try:
            # The original computed this value but never stored it on the item.
            items['base_name'] = response.meta.get('base_name') or self.base_name or ""
            for field, xpath in self._FIELD_XPATHS.items():
                items[field] = response.xpath(xpath).get()
        except Exception:
            # Best effort: skip this page, but leave a trace in the log
            # instead of silently discarding the failure.
            self.logger.exception('Failed to extract fields from %s', response.url)
            return
        yield items

TA貢獻1864條經驗 獲得超6個贊
import scrapy
import pandas as pd
from ..items import latlonglocItem
# Read the wine dataset, then drop every row whose place name is missing.
df = pd.read_csv('wine_df_final.csv')
place_present = df['real_place'].notna()
df = df.loc[place_present]
# Collapse duplicates through a set; the list order is therefore arbitrary.
distinct_places = set(df['real_place'])
real_place = list(distinct_places)
class latlonglocSpider(scrapy.Spider):
    """Spider that searches Google for each place and follows distancesto links.

    For every entry in the module-level ``real_place`` list a Google search
    URL is generated. ``parse`` handles the search result page and
    ``parse_distancesto`` handles the followed distancesto page, yielding one
    item per page.
    """

    name = 'latlonglocs'

    # One search URL per place: commas are dropped and spaces become '+'.
    start_urls = [
        'http://www.google.com/search?q='
        f"{place.replace(',', '').replace(' ', '+')}"
        '+coordinates+latitude+longitude+distancesto'
        for place in real_place
    ]

    # Item field -> XPath evaluated against the distancesto page.
    # NOTE(review): these absolute paths (including ``tbody``) are tied to
    # the page markup at the time of writing -- confirm against a raw response.
    _FIELD_XPATHS = {
        'appellation': 'string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)',
        'latitude': 'string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)',
        'longitude': 'string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)',
        'elevation': 'string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)',
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider, forwarding any arguments to ``scrapy.Spider``."""
        # Forward everything to the parent so the spider can still be
        # constructed when it is given positional or keyword arguments.
        super().__init__(*args, **kwargs)
        # Last base name seen by parse(); kept only as a fallback, since the
        # per-request value travels in Request.meta.
        self.base_name = None

    def parse(self, response):
        """Handle a search result page.

        Yields a request for every result link that points at distancesto.
        Each request carries the page's base name (the title up to
        ' coordinates') in its meta.
        """
        # The title does not change between links, so extract it once.
        base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]

        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if not href.startswith('/url?q=https://www.distancesto'):
                continue
            # As before, the attribute is only updated when a matching link
            # exists.
            self.base_name = base_name
            # Attach base_name to the request itself so the callback gets the
            # value belonging to *this* search page even when several requests
            # are in flight; the spider no longer needs one request at a time.
            yield response.follow(
                href,
                self.parse_distancesto,
                meta={'base_name': base_name},
            )

    def parse_distancesto(self, response):
        """Handle a distancesto page and yield one populated item.

        ``base_name`` is taken from the request meta, falling back to
        ``self.base_name`` and finally to an empty string. If extraction
        raises, the error is logged and nothing is yielded.
        """
        items = latlonglocItem()
        try:
            items['base_name'] = response.meta.get('base_name') or self.base_name or ""
            for field, xpath in self._FIELD_XPATHS.items():
                items[field] = response.xpath(xpath).get()
        except Exception:
            # Best effort: skip this page, but leave a trace in the log
            # instead of silently discarding the failure.
            self.logger.exception('Failed to extract fields from %s', response.url)
            return
        yield items
并發請求必須設置為 1 才能工作并將 base_name 放置在循環中。
添加回答
舉報