import os
import time
import requests
from scrapy import Selector
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
}
class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    """ThreadPoolExecutor whose submit() blocks once its work queue is full."""

    def __init__(self, max_workers=None, thread_name_prefix=''):
        super().__init__(max_workers, thread_name_prefix)
        # Replace the executor's unbounded internal work queue with a bounded
        # one, so submit() applies backpressure instead of queueing every
        # pending task in memory. Note: max_workers must be given explicitly,
        # or max_workers * 2 raises a TypeError.
        self._work_queue = Queue(max_workers * 2)
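The subclass works by swapping the executor's private _work_queue attribute for a bounded queue.Queue, so submit() blocks once max_workers * 2 tasks are waiting. This relies on a private attribute of CPython's implementation, so treat it as a recipe rather than a supported API. A minimal sketch of the backpressure effect (slow_task is a hypothetical stand-in, not part of the scraper):

    import time

    def slow_task(n):
        time.sleep(0.5)
        return n

    pool = BoundedThreadPoolExecutor(max_workers=2)
    for n in range(10):
        pool.submit(slow_task, n)  # blocks once 4 tasks are already queued
        print('submitted task', n)
    pool.shutdown(wait=True)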
def get_href(page):
    """Collect the gallery detail-page links from the first `page` list pages."""
    hrefs = []
    for page_num in range(1, page + 1):
        print('Parsing list page {0}'.format(page_num))
        url = 'https://www.2717.com/ent/meinvtupian/list_11_{0}.html'.format(page_num)
        res = requests.get(url=url, headers=headers)
        res.encoding = 'gb2312'  # re-encode the response; the site serves gb2312
        html = res.text
        # Extract every gallery link on the list page
        sel = Selector(text=html)
        lis = sel.xpath("//div[@class='MeinvTuPianBox']//li")
        for li in lis:
            a = li.xpath('.//a')[0]
            href = a.css('::attr(href)').extract()[0]
            hrefs.append('https://www.2717.com' + href)
    return hrefs
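Before launching the full crawl, it is worth sanity-checking the link extraction on a single page (assuming the site's list-page layout is unchanged):

    hrefs = get_href(1)
    print('{0} gallery links found'.format(len(hrefs)))
    print(hrefs[:3])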
def parser_img(href):
    """Download every image in one gallery, paging through href_1.html, href_2.html, ..."""
    href = href.replace('.html', '')
    i = 1
    title = ''
    while True:
        url = href + '_{0}.html'.format(i)
        res = requests.get(url=url, headers=headers)
        res.encoding = 'gb2312'  # re-encode the response; the site serves gb2312
        sel = Selector(text=res.text)
        title = sel.css('#picBody img::attr(alt)').extract()[0]
        if i == 1:
            create_folder(title)
            print(title + ' download started')
        img_href = sel.css('#picBody img::attr(src)').extract()[0]
        img = requests.get(img_href, headers=headers, timeout=(5, 5))
        with open('./image/' + title + '/' + str(i) + '.jpg', 'wb') as f:
            f.write(img.content)  # the with block closes the file; no f.close() needed
        i += 1
        # The '#pop_cl' popup only appears on the last page of a gallery,
        # so finding it means this gallery is finished.
        if sel.xpath('//*[@id="pop_cl"]').extract():
            break
        time.sleep(0.1)
    print(title + ' download finished')
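One caveat: a single timed-out request aborts the whole gallery, and the exception vanishes into the thread pool unless the future's result is checked. A hedged sketch of a retry wrapper that the two requests.get calls above could go through; get_with_retry is a hypothetical helper, not part of the original script:

    def get_with_retry(url, retries=3, **kwargs):
        # Hypothetical helper: retry transient network failures before giving up.
        for attempt in range(retries):
            try:
                return requests.get(url, headers=headers, timeout=(5, 5), **kwargs)
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
                time.sleep(2 ** attempt)  # simple exponential backoff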
def create_folder(title):
    # Create the image folder, and the per-gallery subfolder, if missing
    if not os.path.exists('image'):
        os.makedirs('image')
    if not os.path.exists('image/' + title):
        os.makedirs('image/' + title)
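Since Python 3.2, os.makedirs accepts exist_ok=True, which collapses both existence checks into one call and avoids the race between the check and the creation. An equivalent one-liner, as a sketch:

    def create_folder(title):
        # Creates image/<title> and any missing parents in one call.
        os.makedirs(os.path.join('image', title), exist_ok=True)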
if __name__ == '__main__':
    page = int(input('Enter the number of pages to download: '))
    hrefs = get_href(page)
    executor = BoundedThreadPoolExecutor(max_workers=8)
    for href in hrefs:
        executor.submit(parser_img, href)
    executor.shutdown(wait=True)  # wait for all downloads to finish
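executor.submit returns a Future, and any exception a worker raises is stored there silently. A sketch of a main loop that surfaces failed galleries instead (same behaviour otherwise, assuming the functions defined above):

    futures = {executor.submit(parser_img, href): href for href in hrefs}
    for fut, href in futures.items():
        try:
            fut.result()  # re-raises any exception from the worker thread
        except Exception as exc:
            print('download failed for {0}: {1}'.format(href, exc))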