import os
import time
import requests
from scrapy import Selector
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
}


class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    """A pool whose submit() blocks once max_workers * 2 tasks are queued,
    so a long list of URLs cannot pile up in memory."""

    def __init__(self, max_workers=None, thread_name_prefix=''):
        super().__init__(max_workers, thread_name_prefix)
        # Swap the stock unbounded work queue for a bounded one
        self._work_queue = Queue(max_workers * 2)


def get_href(page):
    hrefs = []
    for page_no in range(1, page + 1):
        print('Parsing listing page {0}'.format(page_no))
        url = 'https://www.2717.com/ent/meinvtupian/list_11_{0}.html'.format(page_no)
        res = requests.get(url=url, headers=headers)
        res.encoding = 'gb2312'  # re-decode the response (site serves GB2312)
        html = res.text
        # Collect every gallery link on the listing page
        sel = Selector(text=html)
        lis = sel.xpath("//div[@class='MeinvTuPianBox']//li")
        for li in lis:
            a = li.xpath('.//a')[0]
            href = a.css('::attr(href)').extract()[0]
            href = 'https://www.2717.com' + href
            hrefs.append(href)
    return hrefs


def parser_img(href):
    # Gallery pages are paginated as <id>_1.html, <id>_2.html, ...
    href = href.replace('.html', '')
    i = 1
    title = ''
    while True:
        url = href + '_{0}.html'.format(i)
        res = requests.get(url=url, headers=headers)
        res.encoding = 'gb2312'  # re-decode the response (site serves GB2312)
        html = res.text
        sel = Selector(text=html)
        title = sel.css('#picBody img::attr(alt)').extract()[0]
        if i == 1:
            create_folder(title)
            print(title + ': download started')
        img_href = sel.css('#picBody img::attr(src)').extract()[0]
        img = requests.get(img_href, headers=headers, timeout=(5, 5))
        with open('./image/' + title + '/' + str(i) + '.jpg', 'wb') as f:
            f.write(img.content)
        i += 1
        # The "#pop_cl" popup only exists on a gallery's last page
        if sel.xpath('//*[@id="pop_cl"]'):
            break
        time.sleep(0.1)
    print(title + ': download finished')


def create_folder(title):
    # Create the image folder (and the per-gallery subfolder) if missing
    if not os.path.exists('image'):
        os.makedirs('image')
    if not os.path.exists('image/' + title):
        os.makedirs('image/' + title)


if __name__ == '__main__':
    page = int(input('Number of listing pages to download: '))
    hrefs = get_href(page)
    executor = BoundedThreadPoolExecutor(max_workers=8)
    for href in hrefs:
        executor.submit(parser_img, href)
    executor.shutdown(wait=True)  # block until every gallery finishes

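A note on BoundedThreadPoolExecutor: the stock ThreadPoolExecutor uses an unbounded work queue, so submitting every gallery URL at once would queue them all in memory immediately; the bounded queue makes submit() block once the backlog reaches max_workers * 2. A self-contained sketch of the effect (the task duration and counts are illustrative):

import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    def __init__(self, max_workers=None, thread_name_prefix=''):
        super().__init__(max_workers, thread_name_prefix)
        self._work_queue = Queue(max_workers * 2)

def slow_task(n):
    time.sleep(0.2)

executor = BoundedThreadPoolExecutor(max_workers=2)
start = time.time()
for n in range(20):
    executor.submit(slow_task, n)  # blocks whenever 4 tasks are already queued
print('20 submits took {0:.1f}s'.format(time.time() - start))  # noticeably > 0
executor.shutdown(wait=True)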
When the crawler appears to hang:

  1. Sprinkle print statements to pinpoint where it is stuck
  2. Set a timeout on every request (a retry sketch follows this list)
  3. urlretrieve takes no timeout argument; use a socket-level timeout instead (see the sketch at the end)

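As a minimal sketch of point 2 (the fetch helper and the retry count are illustrative, not part of the original script):

import requests

def fetch(url, headers=None, retries=3):
    # Separate (connect, read) timeouts so neither phase can hang forever
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=headers, timeout=(5, 5))
        except requests.exceptions.Timeout:
            print('timeout ({0}/{1}): {2}'.format(attempt, retries, url))
    return None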
Downloading an image from a URL

import urllib.request

url = '......'

urllib.request.urlretrieve(url, path)  # path: local file name to save to
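Point 3 from the list refers to the fact that urlretrieve accepts no timeout argument; the usual workaround is a process-wide socket timeout. A minimal sketch (the 10-second value and the URL are arbitrary):

import socket
import urllib.error
import urllib.request

socket.setdefaulttimeout(10)  # applies to every new socket, urlretrieve included
url = 'https://www.2717.com/favicon.ico'  # illustrative URL
try:
    urllib.request.urlretrieve(url, 'test.jpg')
except (socket.timeout, urllib.error.URLError) as e:
    print('download failed or timed out:', e)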