#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Download the picture galleries listed under ivsky.com's nature category.

The crawl has three stages:

1. Fetch the category page and read the links in the ``tpmenu`` list
   (the first entry, "all categories", is skipped).
2. For every category, read the sub-category links in its ``sline`` block.
3. For every sub-category, walk the paginated listing and save each listed
   image under ``images/<category>/<sub-category>/``.

Pages or images that cannot be fetched are reported on stdout and skipped,
so a single bad link does not abort the whole crawl.
"""

# os handles directories and paths.
import os

# requests fetches page source and binary payloads (images, video, audio...).
import requests
# lxml parses the page source so it can be queried with XPath.
from lxml import etree

SITE_ROOT = "http://www.ivsky.com"
START_URL = "http://www.ivsky.com/tupian/ziranfengguang/"
IMAGE_ROOT = "images"
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30
# Characters that cannot appear in a file or directory name on common systems.
_UNSAFE_CHARS = '\\/:*?"<>|'


def _absolute(link):
    """Return *link* as an absolute URL on the site."""
    if link.startswith("//"):
        # Protocol-relative link: only the scheme is missing.
        return "http:" + link
    if link.startswith("http"):
        return link
    return SITE_ROOT + link


def _safe_name(text):
    """Return *text* stripped, with path-hostile characters replaced by '_'."""
    return "".join("_" if ch in _UNSAFE_CHARS else ch for ch in text.strip())


def _first(node, expression):
    """Return the first XPath match of *expression* on *node*, or ''.

    ``xpath`` always returns a list; indexing it blindly raises IndexError
    whenever an attribute or text node is missing.
    """
    found = node.xpath(expression)
    return found[0] if found else ""


def _fetch(address):
    """GET *address*; return the response, or None when it is unusable."""
    try:
        reply = requests.get(address, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print("request failed for %s: %s" % (address, error))
        return None
    if reply.status_code != 200:
        # An error page must not be parsed as a listing or saved as an image.
        print("no usable page at %s (HTTP %s)" % (address, reply.status_code))
        return None
    return reply


def _load_tree(address):
    """Fetch *address* and parse it into an lxml root node, or return None."""
    reply = _fetch(address)
    if reply is None:
        return None
    try:
        # ``content`` and ``text`` hold the same data, as bytes and as str.
        return etree.HTML(reply.content)
    except etree.LxmlError as error:
        print("could not parse %s: %s" % (address, error))
        return None


def _save_image(src, target):
    """Download the image at *src* and write it to the file *target*."""
    reply = _fetch(_absolute(src))
    if reply is None:
        return
    # ``with`` closes the file even if the write fails.
    with open(target, "wb") as out:
        out.write(reply.content)


def _download_gallery(gallery_url, folder):
    """Save every image of a paginated gallery into *folder*.

    Pages are requested in order until one cannot be fetched or lists no
    images.
    """
    # Page N lives at <gallery>/index_N.html.  It is always built from the
    # first page's URL: appending to the previous page's URL would nest the
    # paths (.../index_2.html/index_3.html).  The trailing slash is dropped
    # first so the join never produces a double slash.
    base_url = gallery_url.rstrip("/")
    page = 1
    page_url = gallery_url
    while True:
        tree = _load_tree(page_url)
        if tree is None:
            break
        images = tree.xpath("//div[@class='il_img']/a/img")
        if not images:
            break
        for idx, img in enumerate(images):
            src = _first(img, "@src")
            if not src:
                continue
            # alt text + page number + position keeps names unique per folder.
            # The ".jpg" suffix is fixed, whatever the real image type is.
            name = _safe_name(_first(img, "@alt")) + str(page) + "_" + str(idx) + ".jpg"
            _save_image(src, os.path.join(folder, name))
        page += 1
        page_url = base_url + "/index_%s.html" % page


def main():
    """Crawl every category and sub-category and download their images."""
    root = _load_tree(START_URL)
    if root is None:
        return
    # Skip the first menu entry, which is the "all categories" link.
    for a in root.xpath("//ul[@class='tpmenu']/li/a")[1:]:
        # text() selects the text between the tags; @name selects an attribute.
        big_title = _first(a, "text()")
        big_href = _first(a, "@href")
        big_dir = _safe_name(big_title)
        if not big_dir or not big_href:
            continue
        big_root = _load_tree(_absolute(big_href))
        if big_root is None:
            continue
        for big_a in big_root.xpath("//div[@class='sline']/div/a"):
            small_title = _first(big_a, "text()")
            small_href = _first(big_a, "@href")
            small_dir = _safe_name(small_title)
            if not small_dir or not small_href:
                continue
            small_url = _absolute(small_href)
            print(small_title, small_url)
            # os.path.join sidesteps hand-written separators: in a string
            # literal a backslash starts an escape sequence ("\n" is a
            # newline), which makes Windows-style paths easy to get wrong.
            path = os.path.join(IMAGE_ROOT, big_dir, small_dir)
            # Only create the folder when it is missing, otherwise makedirs
            # fails with "directory already exists".
            if not os.path.isdir(path):
                os.makedirs(path)
            _download_gallery(small_url, path)


if __name__ == "__main__":
    main()