喜欢斗图, 图不够? 爬了一下斗图网, 自动下载图片. 斗图, 从来没怕过谁.
环境: Ubuntu 16.04, Python 3.5
直接上代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import random
import re
import urllib
import urllib.request
from urllib.request import urlopen

# Pool of User-Agent strings; getPageHTML picks one at random per request.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two agents into one bogus value.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
]

# The page number is appended to these prefixes by the driver code.
url_prefix = "http://www.doutula.com/article/list/?page="
referer_prefix = "http://www.doutula.com/article/list/?page="


def getFile(path):
    """Ensure the save directory *path* exists.

    Returns True when *path* is a usable directory afterwards (created now or
    already present) and False when it could not be created, e.g. because of
    missing permissions or because *path* is an existing regular file.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        print('建立目录 %s 失败' % path)
        return False
    print('成功建立目录 %s' % path)
    return True


def getPageHTML(url, referer, my_headers):
    """Download *url* and return the raw response body as bytes.

    *referer* is sent as the Referer header and the User-Agent is drawn at
    random from *my_headers*. Errors raised by urllib propagate to the caller.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random.choice(my_headers))
    req.add_header("Host", "www.doutula.com")
    req.add_header("Referer", referer)
    # A Request without data is already sent as GET, so no header named
    # "GET" is added. The context manager closes the response even when
    # read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def getHTMLElements(pageHTML):
    """Extract the image URLs from a downloaded listing page.

    *pageHTML* is the page body as bytes. Each ``data-original="...jpg!dta``
    attribute contributes the URL up to and including ``jpg`` (the ``!dta``
    suffix is dropped). Returns the URLs in page order, or an empty list when
    none are found.
    """
    print('正在获取图片列表')
    # Raw string so that \S reaches the regex engine unchanged.
    pattern = re.compile(r'data-original="(\S*?)jpg!dta')
    # Replace undecodable bytes rather than aborting the whole page; the
    # URLs matched by the pattern are unaffected by stray bytes elsewhere.
    text = pageHTML.decode('utf-8', errors='replace')
    result = [prefix + 'jpg' for prefix in pattern.findall(text)]
    if result:
        print('成功获取图片列表')
    else:
        print('获取图片列表失败')
    return result
# Fetch an image by URL and save it into the given folder.
def getImg(path, imgUrl, filename):
    """Download *imgUrl* and write its bytes to *filename* inside *path*.

    The target is addressed with a joined path rather than by changing the
    process working directory, so the caller's cwd is left untouched whatever
    *path* looks like (nested, absolute, ...). *path* must already exist.
    Errors from urllib or from opening/writing the file propagate.
    """
    print('正在写入图片%s' % imgUrl)
    with urllib.request.urlopen(imgUrl) as response:
        data = response.read()
    with open(os.path.join(path, filename), 'wb') as out:
        out.write(data)


def log(path, url, referer, my_headers, start_id):
    """Crawl one listing page and save every distinct image found on it.

    Images are stored in *path* as ``<n>.jpg`` with n counting up from
    *start_id*. Returns the next unused id so that consecutive pages continue
    the numbering.
    """
    print("开始爬取%s页面" % url)
    # Raw HTML of the listing page.
    pageHTML = getPageHTML(url, referer, my_headers)
    # Image URLs found on the page.
    imageUrl = getHTMLElements(pageHTML)
    # Drop duplicates but keep page order, so the numbering of the saved
    # files is the same on every run (a plain set() iterates in hash order).
    seen = set()
    imgs = []
    for img in imageUrl:
        if img not in seen:
            seen.add(img)
            imgs.append(img)
    # Save each image under the next free number.
    for img in imgs:
        getImg(path, img, filename=str(start_id) + '.jpg')
        start_id += 1
    print('成功爬取%s页面, 获取%d张图片' % (url, len(imgs)))
    return start_id


if __name__ == "__main__":
    # Create the save directory.
    path = "斗图"
    f = getFile(path)
    page_number = 2
    if f:
        next_start_id = 1
        for i in range(1, page_number + 1):
            url = url_prefix + str(i)
            referer = referer_prefix + str(i - 1)
            next_start_id = log(path, url, referer, my_headers, next_start_id)
        print('爬虫完成, 共获取%d张图片' % (next_start_id - 1))
    else:
        print('建立目录%s失败, 中止爬虫' % path)
结果:
/opt/wwwroot/python/bin/python /opt/wwwroot/doutu.py 成功建立目录 斗图 开始爬取http://www.doutula.com/article/list/?page=1页面 正在获取图片列表 成功获取图片列表 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893055_FDICGk.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677576_mgtABy.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677574_yHlqZN.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460735_ZetOWV.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893056_ZNTWxH.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677576_fnZCtR.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893057_tLrIZN.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372281_EVKcFZ.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/18/20190418592845_RyTfwB.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372284_JUoZWm.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893054_gySZih.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372283_MhQrcu.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460736_mkvpPA.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460734_JXsBdD.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372851_XwcOSk.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372852_HQXMkq.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677575_ZWnGrU.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460735_cfWPas.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/18/20190418592850_OcIBWP.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/21/20190421861952_gENdjV.jpg 
正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372850_hySiUj.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372852_UbWHos.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/21/20190421861953_vOqjSn.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372282_DGUWTr.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372805_iDyHdU.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/18/20190418592853_btACyW.jpg 成功爬取http://www.doutula.com/article/list/?page=1页面, 获取26张图片 开始爬取http://www.doutula.com/article/list/?page=2页面 正在获取图片列表 成功获取图片列表 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206133_uQsHqG.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943483_sDBjCg.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770790_dNScVk.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681856_hetJiL.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681858_DJmujK.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943482_XLdQza.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681908_iOuyXg.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770790_sLTbja.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770792_knUdlr.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030067_tWBAzg.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303523_PbsrpF.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855261_FjYcIW.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681901_loACKy.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681857_TXbrzq.jpg 
正在写入图片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303524_moMfct.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943484_FucDtC.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206134_WyVdgc.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681857_FMapOI.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855259_kKgBZS.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030066_OtbdXm.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770792_TPhezJ.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855260_UKQzfX.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855258_CTXjym.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030066_SxMcbA.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681902_cEAQju.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206132_pOBhSb.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303525_SzCjYs.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206133_VWOxBf.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943482_ZuwNnQ.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681900_eGkLNj.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303525_OPsDGl.jpg 正在写入图片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030067_JYaPnj.jpg 成功爬取http://www.doutula.com/article/list/?page=2页面, 获取32张图片 爬虫完成, 共获取58张图片 Process finished with exit code 0