Scrapy Downloader Middleware之Selenium

Scrapy集成Selenium

  • [初版] Spider finished 后 selenium 不会退出
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# middlewares.py

# -*- coding: utf-8 -*-
# author: Conyyon

import time

from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumPageMiddleware(object):
    """Downloader middleware that fetches every request with a shared
    Selenium Chrome browser instead of Scrapy's default downloader.

    NOTE(review): the browser is created once per middleware instance and
    is never closed here, so Chrome keeps running after the spider
    finishes — that is the known limitation of this first version.
    """

    def __init__(self):
        super(SeleniumPageMiddleware, self).__init__()
        # One Chrome instance shared by all requests handled here.
        self.browser = webdriver.Chrome()

    def process_request(self, request, spider):
        """Render ``request.url`` in Chrome and short-circuit the download.

        Returning an HtmlResponse makes Scrapy skip its own downloader and
        hand the rendered page source straight to the spider.
        """
        self.browser.get(request.url)
        # Crude fixed wait for JavaScript-rendered content; an explicit
        # WebDriverWait on a concrete element would be more reliable.
        # Fix: `time` was used here without being imported (NameError).
        time.sleep(2)
        print('selenium访问{}'.format(request.url))
        return HtmlResponse(url=self.browser.current_url,
                            body=self.browser.page_source,
                            encoding='utf-8',
                            request=request)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# spiders\sel_spider.py

# -*- coding: utf-8 -*-
# author: Conyyon

import scrapy


class SelSpiderSpider(scrapy.Spider):
    """Demo spider: prints the URL of each rendered page, then follows
    up with a request to the Taobao world portal."""

    name = 'sel_spider'
    allowed_domains = ['taobao.com']
    start_urls = ['https://www.taobao.com']

    def parse(self, response):
        # Show which URL was actually fetched (rendered by Selenium).
        print(response.url)
        follow_up = scrapy.Request(url='https://world.taobao.com')
        yield follow_up
  • [终版] 使用信号在 spider closed 的时候调用方法关闭 selenium
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# middlewares.py

# -*- coding: utf-8 -*-
# author: Conyyon

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumPageMiddleware(object):
    """Downloader middleware that delegates fetching to the spider's own
    Selenium browser (the spider creates it and tears it down)."""

    def process_request(self, request, spider):
        """Render ``request.url`` in the spider-owned browser and return
        the rendered page as an HtmlResponse, bypassing Scrapy's
        downloader entirely."""
        browser = spider.browser
        browser.get(request.url)
        print('selenium访问{}'.format(request.url))
        rendered = HtmlResponse(
            url=browser.current_url,
            body=browser.page_source,
            encoding='utf-8',
            request=request,
        )
        return rendered
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# spiders\sel_spider.py

# -*- coding: utf-8 -*-
# author: Conyyon

import scrapy
from selenium import webdriver
from pydispatch import dispatcher
from scrapy import signals


class SelSpiderSpider(scrapy.Spider):
    """Demo spider that owns the Selenium browser and quits it from a
    spider_closed signal handler, so Chrome exits with the crawl."""

    name = 'sel_spider'
    allowed_domains = ['taobao.com']
    start_urls = ['https://www.taobao.com']

    def __init__(self):
        super(SelSpiderSpider, self).__init__()
        # The spider owns the browser so its lifetime matches the crawl.
        self.browser = webdriver.Chrome()
        # Invoke self.spider_closed when Scrapy fires spider_closed.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self):
        # Signal handler: shut Chrome down once the crawl has ended.
        print('爬虫关闭,Selenium退出')
        self.browser.quit()

    def parse(self, response):
        # Show which URL was actually fetched (rendered by Selenium).
        print(response.url)
        follow_up = scrapy.Request(url='https://world.taobao.com')
        yield follow_up