Rewriting the wallpaper spider with Scrapy

Spider

```python
import scrapy
from scrapy.http import Request

from spider_test.items import ImgItem


class CsdnSpider(scrapy.Spider):
    name = 'girl'
    allowed_domains = ['2717.com']
    start_urls = ['https://www.2717.com/ent/meinvtupian/list_11_1.html']

    def parse(self, response):
        print(response)
        # Each <li> inside the MeinvTuPianBox div is one photo album
        lis = response.xpath("//div[@class='MeinvTuPianBox']//li")
        for li in lis:
            item = ImgItem()
            a = li.xpath('.//a')[0]
            href = a.css('::attr(href)').extract()[0]
            href = 'https://www.2717.com' + href
            # Keep everything up to and including the last '/', so the
            # relative "next page" links can be appended to it later
            url = href[0:href.rfind('/') + 1]
            item['url'] = url
            yield Request(url=href, callback=self.parser_img, meta={'item': item})

    def parser_img(self, response):
        imgitem = ImgItem()
        title = response.css('#picBody img::attr(alt)').extract()[0]
        img_href = response.css('#picBody img::attr(src)').extract()[0]
        imgitem['title'] = title
        imgitem['url'] = img_href
        # '##' marks the last page of an album; otherwise follow the next page
        next_url = response.css('#nl a::attr(href)').extract()[0]
        if next_url != '##':
            item = response.meta['item']
            url = item['url']
            yield Request(url=url + next_url, callback=self.parser_img, meta={'item': item})
        yield imgitem
```
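The spider can then be launched with `scrapy crawl girl`. If you prefer starting it from a script (handy for debugging in an IDE), here is a minimal sketch, assuming the file sits at the project root next to scrapy.cfg; the file name run.py is my own choice, not from the original project:

```python
# run.py -- equivalent to running `scrapy crawl girl` from the shell
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'girl'])
```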
String slicing + finding a substring's position
The rfind() function scans the string from right to left and returns the index of the first match it finds, i.e. the last occurrence of the substring given in the parentheses (or -1 if there is no match).
```python
# https://www.jianshu.com/p/854393ec0fb3
url = href[0:href.rfind('/') + 1]
# -> 'https://www.jianshu.com/p/'
```
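A quick interactive check of what rfind() returns, with the indexes worked out by hand:

```python
s = 'https://www.jianshu.com/p/854393ec0fb3'
s.find('/')            # 6  -- leftmost '/'
s.rfind('/')           # 25 -- rightmost '/'
s[0:s.rfind('/') + 1]  # 'https://www.jianshu.com/p/'
                       # (slices exclude the end index, hence the +1)
```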
Use the meta parameter to pass values between callback functions
```python
# Sending: the value must be an instantiated Item
yield Request(url=href, callback=self.parser_img, meta={'item': item})
# Receiving, inside the callback
item = response.meta['item']
```
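On Scrapy 1.7+ the same hand-off can be done with cb_kwargs, which the Scrapy docs now recommend over meta for passing data to callbacks; a minimal sketch of the two sides:

```python
# Sending: each key in cb_kwargs becomes a keyword argument of the callback
yield Request(url=href, callback=self.parser_img, cb_kwargs={'item': item})

# Receiving: the item arrives as an ordinary parameter (inside the spider class)
def parser_img(self, response, item):
    ...
```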
Item

```python
import scrapy


class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # image title
    title = scrapy.Field()
    # image link
    url = scrapy.Field()
```
pipeline

```python
import re

from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline


class SaveImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Request the image URL collected by the spider; pass the title
        # along in meta so file_path can use it as the folder name
        yield Request(url=item['url'], meta={'name': item['title']})

    def item_completed(self, results, item, info):
        # Drop the item if the (single) download failed
        if not results[0][0]:
            raise DropItem('download failed')
        return item

    def file_path(self, request, response=None, info=None):
        # Group images by title: strip characters that are illegal in file
        # names, then store as <title>/<original file name>
        name = request.meta['name']
        image_name = request.url.split('/')[-1]
        folder_strip = re.sub(r'[?\\*|“<>:/]', '', str(name))
        filename = u'{0}/{1}'.format(folder_strip, image_name)
        return filename
```
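For context on the `results[0][0]` check above: item_completed receives results as a list of (success, info) two-tuples, one per request yielded from get_media_requests. Roughly, with illustrative field values:

```python
# One successful download:
#   [(True, {'url': 'https://...0.jpg', 'path': 'Title/0.jpg', 'checksum': '...'})]
# One failed download:
#   [(False, Failure(...))]
success, info = results[0]   # hence results[0][0] is the success flag
```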
In settings.py
```python
ITEM_PIPELINES = {
    'spider_test.pipelines.SaveImagePipeline': 300,
}
# Root directory for downloaded files; the paths returned by
# file_path are relative to this directory
IMAGES_STORE = './image'
```
DownloaderMiddlewares: set a random User-Agent
```python
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # Attach a random User-Agent to every outgoing request
        request.headers.setdefault("User-Agent", self.ua.random)
```
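The middleware only takes effect once it is registered in the settings. A minimal sketch, assuming the class lives in spider_test/middlewares.py (the module path is my assumption, not stated in the post):

```python
DOWNLOADER_MIDDLEWARES = {
    # path assumed; point it at wherever RandomUserAgentMiddlware actually lives
    'spider_test.middlewares.RandomUserAgentMiddlware': 543,
    # disable the built-in middleware so it cannot overwrite the random UA
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
```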