import os
import time
import requests
from scrapy import Selector
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
}


class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    """A pool whose submit() blocks once max_workers * 2 tasks are queued,
    so a long list of URLs cannot pile up in memory."""

    def __init__(self, max_workers=None, thread_name_prefix=''):
        super().__init__(max_workers, thread_name_prefix)
        # Swap the stock unbounded work queue for a bounded one
        self._work_queue = Queue(max_workers * 2)


def get_href(page):
    hrefs = []
    for page_no in range(1, page + 1):
        print('Parsing listing page {0}'.format(page_no))
        url = 'https://www.2717.com/ent/meinvtupian/list_11_{0}.html'.format(page_no)
        res = requests.get(url=url, headers=headers)
        res.encoding = 'gb2312'  # re-decode the response (site serves GB2312)
        html = res.text
        # Collect every gallery link on the listing page
        sel = Selector(text=html)
        lis = sel.xpath("//div[@class='MeinvTuPianBox']//li")
        for li in lis:
            a = li.xpath('.//a')[0]
            href = a.css('::attr(href)').extract()[0]
            href = 'https://www.2717.com' + href
            hrefs.append(href)
    return hrefs


def parser_img(href):
    # Gallery pages are paginated as <id>_1.html, <id>_2.html, ...
    href = href.replace('.html', '')
    i = 1
    title = ''
    while True:
        url = href + '_{0}.html'.format(i)
        res = requests.get(url=url, headers=headers)
        res.encoding = 'gb2312'  # re-decode the response (site serves GB2312)
        html = res.text
        sel = Selector(text=html)
        title = sel.css('#picBody img::attr(alt)').extract()[0]
        if i == 1:
            create_folder(title)
            print(title + ': download started')
        img_href = sel.css('#picBody img::attr(src)').extract()[0]
        img = requests.get(img_href, headers=headers, timeout=(5, 5))
        with open('./image/' + title + '/' + str(i) + '.jpg', 'wb') as f:
            f.write(img.content)
        i += 1
        # The "#pop_cl" popup only exists on a gallery's last page
        if sel.xpath('//*[@id="pop_cl"]'):
            break
        time.sleep(0.1)
    print(title + ': download finished')


def create_folder(title):
    # Create the image folder (and the per-gallery subfolder) if missing
    if not os.path.exists('image'):
        os.makedirs('image')
    if not os.path.exists('image/' + title):
        os.makedirs('image/' + title)


if __name__ == '__main__':
    page = int(input('Number of listing pages to download: '))
    hrefs = get_href(page)
    executor = BoundedThreadPoolExecutor(max_workers=8)
    for href in hrefs:
        executor.submit(parser_img, href)
    executor.shutdown(wait=True)  # block until every gallery finishes

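A note on BoundedThreadPoolExecutor: the stock ThreadPoolExecutor uses an unbounded work queue, so submitting every gallery URL at once would queue them all in memory immediately; the bounded queue makes submit() block once the backlog reaches max_workers * 2. A self-contained sketch of the effect (the task duration and counts are illustrative):

import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    def __init__(self, max_workers=None, thread_name_prefix=''):
        super().__init__(max_workers, thread_name_prefix)
        self._work_queue = Queue(max_workers * 2)

def slow_task(n):
    time.sleep(0.2)

executor = BoundedThreadPoolExecutor(max_workers=2)
start = time.time()
for n in range(20):
    executor.submit(slow_task, n)  # blocks whenever 4 tasks are already queued
print('20 submits took {0:.1f}s'.format(time.time() - start))  # noticeably > 0
executor.shutdown(wait=True)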
When the crawler appears to hang:

  1. Sprinkle print statements to pinpoint where it is stuck
  2. Set a timeout on every request (a retry sketch follows this list)
  3. urlretrieve takes no timeout argument; use a socket-level timeout instead (see the sketch at the end)

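As a minimal sketch of point 2 (the fetch helper and the retry count are illustrative, not part of the original script):

import requests

def fetch(url, headers=None, retries=3):
    # Separate (connect, read) timeouts so neither phase can hang forever
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=headers, timeout=(5, 5))
        except requests.exceptions.Timeout:
            print('timeout ({0}/{1}): {2}'.format(attempt, retries, url))
    return None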
Downloading an image from a URL

import urllib.request

url = '......'

urllib.request.urlretrieve(url, path)  # path: local file name to save to
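Point 3 from the list refers to the fact that urlretrieve accepts no timeout argument; the usual workaround is a process-wide socket timeout. A minimal sketch (the 10-second value and the URL are arbitrary):

import socket
import urllib.error
import urllib.request

socket.setdefaulttimeout(10)  # applies to every new socket, urlretrieve included
url = 'https://www.2717.com/favicon.ico'  # illustrative URL
try:
    urllib.request.urlretrieve(url, 'test.jpg')
except (socket.timeout, urllib.error.URLError) as e:
    print('download failed or timed out:', e)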