Scrapy Downloader Middleware之RandomUserAgent

Scrapy集成随机UA

这里使用 fake-useragent 包提供的 UserAgent 类来生成随机 User-Agent;使用前请先执行 pip install fake-useragent 进行安装。

# middlewares.py

# -*- coding: utf-8 -*-
# author: Conyyon

from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    """Downloader middleware that overwrites the request's User-Agent header.

    The header value is read from a ``fake_useragent.UserAgent`` instance by
    attribute name (``self.ua_type``). The middleware is controlled by two
    settings:

    * ``RANDOM_USER_AGENT`` -- boolean switch, off by default.
    * ``RANDOM_USER_AGENT_TYPE`` -- attribute looked up on the ``UserAgent``
      object, ``'random'`` by default.
    """

    def __init__(self, random_ua_on, random_ua_type):
        """Store the configuration and create the ``UserAgent`` provider.

        Args:
            random_ua_on: Truthy to enable header replacement.
            random_ua_type: Name of the ``UserAgent`` attribute to read.
        """
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_on = random_ua_on
        self.ua_type = random_ua_type

    @classmethod
    def from_crawler(cls, clawler):
        """Build the middleware from the crawler's settings.

        Args:
            clawler: Object exposing a ``settings`` attribute (the crawler).

        Returns:
            A configured ``RandomUserAgentMiddleware`` instance.
        """
        settings = clawler.settings
        return cls(
            # getbool() rather than get(): a value supplied as text (for
            # example ``-s RANDOM_USER_AGENT=False``) is a non-empty string
            # and would otherwise always count as "on".
            random_ua_on=settings.getbool('RANDOM_USER_AGENT', False),
            random_ua_type=settings.get('RANDOM_USER_AGENT_TYPE', 'random'),
        )

    def process_request(self, request, spider):
        """Set ``request.headers['User-Agent']`` when the switch is on.

        If reading ``self.ua_type`` from the provider fails, the type is
        switched to ``'random'`` (and stays switched for later requests).

        Args:
            request: Request whose ``headers`` mapping is updated in place.
            spider: Object whose ``logger`` receives progress messages.

        Returns:
            None, in every case.
        """
        log = spider.logger
        log.info('Checking RANDOM_USER_AGENT ON/OFF...')
        log.info('RANDOM_USER_AGENT: %s', 'ON' if self.ua_on else 'OFF')
        if not self.ua_on:
            return None

        log.info('Checking RANDOM_USER_AGENT_TYPE...')
        try:
            random_ua = getattr(self.ua, self.ua_type)
        except Exception as e:
            # Deliberately broad: any failure to resolve the configured type
            # falls back to 'random' instead of aborting the request.
            log.warning(str(e))
            self.ua_type = 'random'
            random_ua = getattr(self.ua, self.ua_type)
            log.info('Switching RANDOM_USER_AGENT_TYPE to %s', self.ua_type)
        else:
            log.info('RANDOM_USER_AGENT_TYPE: %s', self.ua_type)

        request.headers['User-Agent'] = random_ua
        log.info('Set RANDOM_USER_AGENT Success')
        return None

# settings.py

# -*- coding: utf-8 -*-
# author: Conyyon

# Switch for RandomUserAgentMiddleware, which reads it under the name
# RANDOM_USER_AGENT and treats it as off (False) when it is not set.
RANDOM_USER_AGENT = True
# Optional: uncomment to pick the User-Agent family, read by the middleware as
# RANDOM_USER_AGENT_TYPE. Choose one of
# ['random', 'ie', 'opera', 'chrome', 'firefox', 'safari'] ('random' by default).
# RANDOM_USER_AGENT_TYPE = 'chrome'

DOWNLOADER_MIDDLEWARES = {
# NOTE(review): 'xxx' looks like a placeholder for the project's package name -- confirm and replace.
'xxx.middlewares.RandomUserAgentMiddleware': 543,
}