Refactoring the wallpaper crawler with Scrapy

Spider

# -*- coding: utf-8 -*-
import scrapy

from scrapy.http import Request
from spider_test.items import ImgItem


class CsdnSpider(scrapy.Spider):
    name = 'girl'
    allowed_domains = ['2717.com']
    start_urls = ['https://www.2717.com/ent/meinvtupian/list_11_1.html']

    def parse(self, response):
        print(response)
        # Each <li> on the list page is one photo album
        lis = response.xpath("//div[@class='MeinvTuPianBox']//li")
        for li in lis:
            item = ImgItem()
            a = li.xpath('.//a')[0]
            href = a.css('::attr(href)').extract()[0]
            href = 'https://www.2717.com' + href
            # Keep everything up to and including the last '/', i.e. the album's base URL
            url = href[0:href.rfind('/') + 1]
            item['url'] = url
            yield Request(url=href, callback=self.parser_img, meta={'item': item})

    def parser_img(self, response):
        imgitem = ImgItem()
        title = response.css('#picBody img::attr(alt)').extract()[0]
        img_href = response.css('#picBody img::attr(src)').extract()[0]
        imgitem['title'] = title
        imgitem['url'] = img_href
        # The "next page" link is '##' on the last page of an album
        next_url = response.css('#nl a::attr(href)').extract()[0]
        if next_url != '##':
            item = response.meta['item']
            url = item['url']
            yield Request(url=url + next_url, callback=self.parser_img, meta={'item': item})
        yield imgitem
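To start the crawl, running `scrapy crawl girl` from the project root is enough. As a minimal sketch, the crawl can also be started from a plain Python script; note that the `spider_test.spiders.girl` module path is an assumption about this project's layout:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# NOTE: the module path is an assumption about where the spider file lives
from spider_test.spiders.girl import CsdnSpider

process = CrawlerProcess(get_project_settings())
process.crawl(CsdnSpider)
process.start()  # blocks until the crawl finishes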

String slicing + finding a character's position

rfind() searches from right to left and returns the position of the first match it finds, i.e. the last occurrence of the given substring in the string.

href = 'https://www.jianshu.com/p/854393ec0fb3'
url = href[0:href.rfind('/') + 1]
# url == 'https://www.jianshu.com/p/'

Passing values between callbacks with the meta parameter

# send: must be an instantiated item
yield Request(url=href, callback=self.parser_img, meta={'item': item})
# receive
item = response.meta['item']
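Side note: since Scrapy 1.7, `cb_kwargs` is the recommended way to pass values between callbacks, because `meta` is also used by Scrapy internals. A sketch of the same hand-off, mirroring the excerpt above:

# send: every cb_kwargs entry becomes a keyword argument of the callback
yield Request(url=href, callback=self.parser_img, cb_kwargs={'item': item})

# receive: the item arrives as a normal parameter
def parser_img(self, response, item):
    ...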

Item

import scrapy


class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # image title
    title = scrapy.Field()
    # image URL
    url = scrapy.Field()
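Unlike a plain dict, an Item only accepts the fields declared on it, which catches typos early. A quick sketch:

item = ImgItem()
item['title'] = 'example title'
item['url'] = 'https://www.2717.com/example.jpg'
print(item['title'])  # 'example title'
item['author'] = 'x'  # KeyError: ImgItem does not support field: author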

Pipeline

import re

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request


class SaveImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Download the image; if a collection of URLs is passed in, loop over it.
        # The meta data comes from the spider's item and is passed on to file_path below.
        yield Request(url=item['url'], meta={'name': item['title']})

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples; the first element is a
        # boolean indicating whether the download succeeded
        if not results[0][0]:
            raise DropItem('Download failed')
        return item

    # Rename the file. Without overriding this method the image is named by its
    # hash, i.e. an unreadable string of characters.
    def file_path(self, request, response=None, info=None):
        # Receive the image name passed via meta above
        name = request.meta['name']
        # Use the last segment of the URL as the image file name
        image_name = request.url.split('/')[-1]
        # Strip characters that are illegal in Windows folder names, so the
        # directory can actually be created
        folder_strip = re.sub(r'[?\\*|"<>:/]', '', str(name))
        # The key to per-folder storage: {0} is the folder name, {1} is the image name
        filename = u'{0}/{1}'.format(folder_strip, image_name)
        return filename
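To see what file_path produces, here is the cleaning and formatting step in isolation; the title and URL are made up for illustration:

import re

name = 'girl: photo/set?'  # hypothetical title taken from the img alt attribute
image_name = 'https://www.2717.com/uploads/1.jpg'.split('/')[-1]
folder_strip = re.sub(r'[?\\*|"<>:/]', '', name)
print(u'{0}/{1}'.format(folder_strip, image_name))
# -> girl photoset/1.jpg, stored relative to IMAGES_STORE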

In settings.py

ITEM_PIPELINES = {
    'spider_test.pipelines.SaveImagePipeline': 300,
}
IMAGES_STORE = './image'  # root directory for downloaded files
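Depending on the target site's robots.txt, requests may also be filtered out by Scrapy's default robots.txt check; if that happens, it can be disabled in settings.py (use with care):

ROBOTSTXT_OBEY = False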

DownloaderMiddlewares

Setting a random User-Agent

from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    # Rotate the User-Agent randomly on every request
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        request.headers.setdefault("User-Agent", self.ua.random)
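The middleware only takes effect once it is registered in settings.py. A sketch, assuming the class lives in spider_test/middlewares.py (543 is just a typical priority slot; the built-in UserAgentMiddleware is disabled so it cannot interfere):

DOWNLOADER_MIDDLEWARES = {
    'spider_test.middlewares.RandomUserAgentMiddleware': 543,
    # disable the built-in User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}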