我的代碼如下:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..utils.common import login_lagou
from scrapy.http import Request
class LagouSpider(CrawlSpider):
    """Crawl www.lagou.com and pass job-detail pages to ``parse_job``.

    The first request carries the cookies returned by ``login_lagou()``;
    the rules below then follow listing, company and job-detail links.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    # Raw strings keep ``\d`` a regex digit class instead of an invalid
    # string escape, and ``\.`` matches only the literal dot before "html".
    rules = (
        # Job-listing pages: follow links only.
        Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
        # Company pages: follow links only.
        Rule(LinkExtractor(allow=(r"gongsi/\d+\.html",)), follow=True),
        # Individual job postings: parse them and keep following links.
        Rule(LinkExtractor(allow=(r'jobs/\d+\.html',)), callback='parse_job', follow=True),
    )

    # Header template. The values describe the passport (login) site, so
    # start_requests() drops "Host" before requesting www.lagou.com.
    headers = {
        "Host": 'passport.lagou.com',
        "Origin": 'https://passport.lagou.com',
        "Referer": 'https://passport.lagou.com/login/login.html',
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.87 Safari/537.36',
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": 'application/x-www-form-urlencoded;charset=UTF-8'
    }

    def start_requests(self):
        """Log in and yield the single authenticated start request.

        Side effects: stores the login cookies on ``self.cookies`` and an
        instance-level ``self.headers`` (template plus ``Cookie``), and
        prints both for debugging.
        """
        self.cookies = login_lagou()
        print(self.cookies)
        # Shadow the class-level template with a per-instance copy instead of
        # mutating the dict shared by every LagouSpider instance.
        # NOTE(review): login_lagou() is defined elsewhere; a Cookie header
        # needs a "k=v; k2=v2" string -- confirm what it returns.
        self.headers = dict(type(self).headers, Cookie=self.cookies)
        print(self.headers)
        # The Host header must name the server being requested; sending
        # "passport.lagou.com" to www.lagou.com misroutes the request, so the
        # downloader is left to derive Host from the URL.
        request_headers = {
            key: value for key, value in self.headers.items() if key != "Host"
        }
        # No explicit callback: the default one is CrawlSpider's own
        # rule-processing entry point on both old and new Scrapy releases,
        # whereas ``self.parse`` is not guaranteed to be that entry point.
        yield Request(url=self.start_urls[0],
                      cookies=self.cookies,
                      headers=request_headers,
                      dont_filter=True)

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        """Parse a Lagou job-detail page.

        Currently a stub: prints the response and returns an empty dict.
        """
        item = {}
        print(response)
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # item['name'] = response.xpath('//div[@id="name"]').extract()
        # item['description'] = response.xpath('//div[@id="description"]').extract()
        return item


# Configuration from settings.py (a separate file from the spider above):
# Let 302 responses reach spider callbacks instead of being filtered out.
HTTPERROR_ALLOWED_CODES = [302]
# HTTPERROR_ALLOWED_CODES = [400]
# NOTE(review): this turns off Scrapy's cookies middleware, so the
# ``cookies=`` argument the spider passes to Request is presumably ignored;
# confirm that is intended for a crawl that relies on login cookies.
COOKIES_ENABLED = False
REDIRECT_ENABLED = False   # do not follow redirects
DOWNLOAD_DELAY = 6      # wait 6 s between requests to avoid being banned
DOWNLOAD_TIMEOUT = 10   # download timeout, in seconds
RETRY_ENABLED = True    # retry failed requests
RETRY_TIMES = 3         # maximum number of retries
添加回答
舉報(bào)
0/150
提交
取消