"""Download every matching ``.jpg`` image referenced by a web page.

Run as a script, it prompts for a page URL (falling back to a default
Tieba thread when nothing is entered), fetches the page, and saves the
images it finds into the current working directory as ``0.jpg``,
``1.jpg``, ...
"""
import re
import urllib.request

# Page scraped when the user just presses Enter at the prompt.
_DEFAULT_URL = "https://tieba.baidu.com/p/1753935195"

# Captures the value of a src="..." attribute that ends in ".jpg" and is
# immediately followed by a width attribute.  The capture excludes double
# quotes so a match can never run past the end of its own attribute and
# swallow neighbouring tags.
_IMG_SRC_RE = re.compile(r'src="([^"]+\.jpg)" width')


def getHtmlCode(url):
    """Fetch the page at *url* and return its body as text.

    The body is decoded with the charset declared in the response
    headers, falling back to UTF-8 when none is declared.

    Args:
        url: Address of the page to fetch; anything accepted by
            ``urllib.request.urlopen``.

    Returns:
        The decoded page content as a ``str``.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as page:
        rawBody = page.read()
        charset = page.headers.get_content_charset() or 'utf-8'
    return rawBody.decode(charset)


def getImg(htmlCode):
    """Download every matching ``.jpg`` image referenced in *htmlCode*.

    An image matches when it appears as ``src="<url>.jpg" width``.  The
    images are written to the current working directory and named by
    their position in the page: ``0.jpg``, ``1.jpg``, ...

    Args:
        htmlCode: The page's HTML as a ``str``.
    """
    for index, imgUrl in enumerate(_IMG_SRC_RE.findall(htmlCode)):
        urllib.request.urlretrieve(imgUrl, '%s.jpg' % index)


def main():
    """Prompt for a URL, fetch the page, and download its images."""
    print('---------网页图片抓取------------')
    print('请输入url:')
    url = input()
    if not url:
        print('---------没有输入地址,使用默认地址。--------')
        url = _DEFAULT_URL
    print('-------正在抓取网页----------')
    htmlCode = getHtmlCode(url)
    print('-------正在下载图片---------')
    getImg(htmlCode)
    print('-------下载图片完成-------')
    input('Press Enter to exit')
    print('hello world')


# Only run the interactive flow when executed as a script, so importing
# this module does not block on input().
if __name__ == "__main__":
    main()
# 学习来源 (learning source): https://www.cnblogs.com/Axi8/p/5757270.html — 贴吧图片爬取 html (Tieba image scraping from HTML)