
Important Scrapy Components (Part 4)

1 Subclasses of Spider

1.1 The CrawlSpider Class

1.1.1 Source Code
class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    def _build_request(self, rule, link):
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                request = self._build_request(n, link)
                yield rule._process_request(request, response)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider
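
For context, here is a minimal usage sketch (the domain, URL patterns, and field names are hypothetical). Each Rule's LinkExtractor produces requests through _requests_to_follow(), responses are routed back to the named callback via _response_downloaded(), and parse() should not be overridden, because CrawlSpider uses it to drive the rule machinery:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleCrawlSpider(CrawlSpider):
    name = 'example_crawl'
    allowed_domains = ['example.com']          # hypothetical domain
    start_urls = ['http://example.com/list']   # hypothetical listing page

    rules = (
        # Follow pagination links without parsing them (no callback).
        Rule(LinkExtractor(allow=r'/list\?page=\d+'), follow=True),
        # Parse detail pages; follow defaults to False when a callback is set.
        Rule(LinkExtractor(allow=r'/detail/\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
        }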

1.2 The XMLFeedSpider Class

1.2.1 Source Code
class XMLFeedSpider(Spider):
    """
    This class intends to be the base class for spiders that scrape
    from XML feeds.

    You can choose whether to parse the file using the 'iternodes' iterator, an
    'xml' selector, or an 'html' selector. In most cases, it's convenient to
    use iternodes, since it's a faster and cleaner.
    """

    iterator = 'iternodes'
    itertag = 'item'
    namespaces = ()

    def process_results(self, response, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last time
        processing required before returning the results to the framework core,
        for example setting the item GUIDs. It receives a list of results and
        the response which originated that results. It must return a list of
        results (Items or Requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you want
        to into the feed before parsing it. This function must return a
        response.
        """
        return response

    def parse_node(self, response, selector):
        """This method must be overriden with your custom spider functionality"""
        if hasattr(self, 'parse_item'):  # backward compatibility
            return self.parse_item(response, selector)
        raise NotImplementedError

    def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). Receives the response and an Selector for each node.
        Overriding this method is mandatory. Otherwise, you spider won't work.
        This method must return either a BaseItem, a Request, or a list
        containing any of them.
        """

        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item

    def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)

    def _iternodes(self, response):
        for node in xmliter(response, self.itertag):
            self._register_namespaces(node)
            yield node

    def _register_namespaces(self, selector):
        for (prefix, uri) in self.namespaces:
            selector.register_namespace(prefix, uri)
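
A minimal sketch of a concrete subclass follows (the feed URL and tag/field names are hypothetical). With the default 'iternodes' iterator, parse() hands each <item> node to parse_node() as a Selector, and process_results() is applied to whatever it yields:

from scrapy.spiders import XMLFeedSpider


class ExampleXMLSpider(XMLFeedSpider):
    name = 'example_xml'
    start_urls = ['http://example.com/feed.xml']  # hypothetical feed URL
    iterator = 'iternodes'  # the fast default; 'xml'/'html' build a full Selector
    itertag = 'item'        # the node name handed to parse_node()

    def parse_node(self, response, node):
        # node is a Selector positioned on a single <item> element.
        yield {
            'title': node.xpath('title/text()').extract_first(),
            'link': node.xpath('link/text()').extract_first(),
        }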

1.3 The CSVFeedSpider Class

1.3.1 Source Code
class CSVFeedSpider(Spider):
    """Spider for parsing CSV feeds.
    It receives a CSV file in a response; iterates through each of its rows,
    and calls parse_row with a dict containing each field's data.

    You can set some options regarding the CSV file, such as the delimiter, quotechar
    and the file's headers.
    """

    delimiter = None  # When this is None, python's csv module's default delimiter is used
    quotechar = None  # When this is None, python's csv module's default quotechar is used
    headers = None

    def process_results(self, response, results):
        """This method has the same purpose as the one in XMLFeedSpider"""
        return results

    def adapt_response(self, response):
        """This method has the same purpose as the one in XMLFeedSpider"""
        return response

    def parse_row(self, response, row):
        """This method must be overriden with your custom spider functionality"""
        raise NotImplementedError

    def parse_rows(self, response):
        """Receives a response and a dict (representing each row) with a key for
        each provided (or detected) header of the CSV file. This spider also
        gives the opportunity to override adapt_response and
        process_results methods for pre and post-processing purposes.
        """

        for row in csviter(response, self.delimiter, self.headers, self.quotechar):
            ret = iterate_spider_output(self.parse_row(response, row))
            for result_item in self.process_results(response, ret):
                yield result_item

    def parse(self, response):
        if not hasattr(self, 'parse_row'):
            raise NotConfigured('You must define parse_row method in order to scrape this CSV feed')
        response = self.adapt_response(response)
        return self.parse_rows(response)
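
A minimal sketch of a concrete subclass, assuming a hypothetical feed URL and column names. parse() calls parse_rows(), which feeds every CSV row to parse_row() as a dict keyed by the headers:

from scrapy.spiders import CSVFeedSpider


class ExampleCSVSpider(CSVFeedSpider):
    name = 'example_csv'
    start_urls = ['http://example.com/feed.csv']  # hypothetical feed URL
    delimiter = ','
    quotechar = '"'
    headers = ['id', 'name', 'price']  # hypothetical column names

    def parse_row(self, response, row):
        # row is a dict keyed by the headers declared above.
        yield {'id': row['id'], 'name': row['name'], 'price': row['price']}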

1.4 The SitemapSpider Class

1.4.1 Source Code
class SitemapSpider(Spider):

    sitemap_urls = ()
    sitemap_rules = [('', 'parse')]
    sitemap_follow = ['']
    sitemap_alternate_links = False

    def __init__(self, *a, **kw):
        super(SitemapSpider, self).__init__(*a, **kw)
        self._cbs = []
        for r, c in self.sitemap_rules:
            if isinstance(c, six.string_types):
                c = getattr(self, c)
            self._cbs.append((regex(r), c))
        self._follow = [regex(x) for x in self.sitemap_follow]

    def start_requests(self):
        for url in self.sitemap_urls:
            yield Request(url, self._parse_sitemap)

    def sitemap_filter(self, entries):
        """This method can be used to filter sitemap entries by their
        attributes, for example, you can filter locs with lastmod greater
        than a given date (see docs).
        """
        for entry in entries:
            yield entry

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response}, extra={'spider': self})
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break

    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.
        """
        if isinstance(response, XmlResponse):
            return response.body
        elif gzip_magic_number(response):
            return gunzip(response.body)
        # actual gzipped sitemap files are decompressed above ;
        # if we are here (response body is not gzipped)
        # and have a response for .xml.gz,
        # it usually means that it was already gunzipped
        # by HttpCompression middleware,
        # the HTTP response being sent with "Content-Encoding: gzip"
        # without actually being a .xml.gz file in the first place,
        # merely XML gzip-compressed on the fly,
        # in other word, here, we have plain XML
        elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
            return response.body
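
A minimal usage sketch (the sitemap URL and path patterns are hypothetical). sitemap_rules is compiled into self._cbs in __init__(), and for each <loc> in a urlset the first regex that matches decides the callback, as the break in _parse_sitemap() shows:

from scrapy.spiders import SitemapSpider


class ExampleSitemapSpider(SitemapSpider):
    name = 'example_sitemap'
    # May also point at a robots.txt; _parse_sitemap() then extracts the sitemap URLs.
    sitemap_urls = ['http://example.com/sitemap.xml']   # hypothetical
    sitemap_rules = [
        ('/product/', 'parse_product'),  # hypothetical URL pattern
        ('', 'parse'),                   # fallback for all other entries
    ]

    def parse_product(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
        }

    def parse(self, response):
        self.logger.info('Non-product page: %s', response.url)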