scrapy 一些设置和问题
作者:互联网
scrapy设置ua池
定义好下面的中间件后,需要在 settings.py 中启用:
# Downloader middlewares enabled for the project, mapped to their order value.
# The proxy entry must name the class as it is actually defined
# (`randomHttpProxyMiddleware`); the previous `randomProxyMiddleware` path did
# not match any class in this file.
DOWNLOADER_MIDDLEWARES = {
    'laogou.middlewares.LaogouDownloaderMiddleware': 543,
    'laogou.middlewares.randomUserAgentMiddleware': 400,
    'laogou.middlewares.randomHttpProxyMiddleware': 400,
}
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class randomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that assigns a random User-Agent to each request.

    A value is drawn from ``user_agent_list`` for every request and written
    with ``setdefault``, so a ``User-Agent`` header already present on the
    request is left untouched.
    """

    # Pool of candidate User-Agent strings. Every entry must be followed by a
    # comma: adjacent string literals are silently concatenated by Python,
    # which previously merged the first two entries into one invalid value.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        """Store the configured default User-Agent string."""
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent header on ``request`` if it has none."""
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
scrapy设置ip池
import random

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware


class randomHttpProxyMiddleware(HttpProxyMiddleware):
    """Downloader middleware that routes each request through a random proxy.

    A proxy URL is drawn from ``ip_list`` for every request and stored in
    ``request.meta['proxy']``, overwriting any value already there.
    """

    # Pool of candidate proxy URLs; one is chosen at random per request.
    ip_list = [
        'https://182.122.176.49:9999',
        'https://125.123.141.20:9999',
    ]

    def __init__(self, ip=''):
        """Store the configured default proxy address."""
        self.ip = ip

    def process_request(self, request, spider):
        """Assign a randomly chosen proxy from ``ip_list`` to ``request``."""
        proxy = random.choice(self.ip_list)
        if proxy:
            request.meta['proxy'] = proxy
# Scrapy: using a custom cookie jar in a spider.
# NOTE(review): this snippet relies on `scrapy` and a `settings` object
# (providing `keys` and `cidy`) being imported by the surrounding module;
# neither import is shown here.
class LaogouwangSpider(scrapy.Spider):
    """Spider that carries one cookie session across a chain of requests.

    The flow is: fetch the home page with ``meta={'cookiejar': 1}``, request
    the job-list page reusing that jar, then POST the search form to the
    ``positionAjax.json`` endpoint with the same jar and redirects disabled.
    """

    name = 'laogouwang'
    # allowed_domains = ['www.laogou.com']
    # start_urls = ['http://www.laogou.com/']

    def start_requests(self):
        """Yield the initial home-page request, tagged with cookie jar 1."""
        url = 'https://www.lagou.com/'
        yield scrapy.Request(url=url, callback=self.parse, meta={'cookiejar': 1})

    def parse(self, response):
        """Print the exchanged cookies and request the job-list page.

        The cookie jar key received in ``response.meta`` is forwarded so the
        next request shares the same cookie session.
        """
        print(response.request.headers.getlist('Cookie'))
        print(response.headers.getlist('Set-Cookie'))
        # NOTE(review): `settings.cidy` looks like a typo for `city` - confirm
        # against the settings module before renaming.
        url = (
            'https://www.lagou.com/jobs/list_' + str(settings.keys)
            + '?city=' + str(settings.cidy)
            + '&cl=false&fromSearch=true&labelWords=&suginput='
        )
        print(response.meta['cookiejar'])
        yield scrapy.Request(
            url=url,
            callback=self.download,
            meta={'cookiejar': response.meta['cookiejar'], 'id': 1},
            dont_filter=True,
        )

    def download(self, response):
        """Print the exchanged cookies and POST the search form.

        ``response.meta['id']`` is used as the page number; the ``first``
        form field is ``'true'`` only for page 1.
        """
        # Imported locally because the snippet shows no module-level imports.
        import urllib.parse

        # print(response.text)
        print(response.request.headers.getlist('Cookie'))
        print(response.headers.getlist('Set-Cookie'))
        i = response.meta.get('id')
        file = 'true' if i == 1 else 'false'
        data = {
            "first": file,
            "pn": str(i),
            "kd": str(settings.keys),
        }
        headers_post = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Content-Length': str(len(urllib.parse.urlencode(data))),
            'Connection': 'keep-alive',
            'Referer': str(response.url),
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        }
        print(headers_post)
        print(str(response.url))
        print(data)
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        # NOTE(review): `self.files` is not defined in this snippet; the
        # callback must exist on the spider for this request to be handled.
        yield scrapy.FormRequest(
            url=url,
            formdata=data,
            headers=headers_post,
            callback=self.files,
            dont_filter=True,
            meta={
                # Forward the incoming jar key instead of a literal True, so
                # the session is reused explicitly rather than only because
                # True == 1 as a dictionary key.
                'cookiejar': response.meta['cookiejar'],
                'dont_redirect': True,
                'handle_httpstatus_list': [301, 302],
            },
        )
meta={'cookiejar':1} 用于开启 cookie 记录;在后面的请求中传入 'cookiejar':response.meta['cookiejar'],即可沿用并更新同一份 cookie。
注意,需要在setting中设置COOKIES_ENABLED = True
获取请求cookies是response.request.headers.getlist('Cookie'),响应cookies是response.headers.getlist('Set-Cookie')。
禁止去重过滤使用 dont_filter=True(该参数只是让请求跳过重复请求过滤器,并不会禁止重定向)。
在meta里使用'dont_redirect': True,'handle_httpstatus_list': [301,302]可以在当前scrapy请求里禁用重定向。
标签:5.0,like,KHTML,Mozilla,scrapy,AppleWebKit,设置,一些,Gecko 来源: https://www.cnblogs.com/dayouzi/p/10390873.html