#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from lxml import etree

if __name__ == "__main__":
    url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # Fetch the page text.
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text

    # Parse the page (collect the image cards).
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')

    # Extract each image's URL and name.
    for div in div_list:
        image_url = div.xpath('.//img/@src')
        image_name = div.xpath('.//img/@alt')
        print(image_url)   # image link (comes back empty — see the lazy-load discussion below)
        print(image_name)  # image name
运行结果观察发现,咱们能够获取图片的名称,可是连接获取的为空,检查后发现xpath表达式也没有问题,究其缘由出在了哪里呢?
图片懒加载概念:
图片懒加载是一种网页优化技术。图片做为一种网络资源,在被请求时也与普通静态资源同样,将占用网络资源,而一次性将整个页面的全部图片加载完,将大大增长页面的首屏加载时间。为了解决这种问题,经过先后端配合,使图片仅在浏览器当前视窗内出现时才加载该图片,达到减小首屏图片请求数的技术就被称为“图片懒加载”。
网站通常如何实现图片懒加载技术呢?
在网页源码中,在img标签中首先会使用一个“伪属性”(一般使用src2,original......)去存放真正的图片连接而并不是是直接存放在src属性中。当图片出现到页面的可视化区域中,会动态将伪属性替换成src属性,完成图片的加载。
站长素材案例后续分析:经过细致观察页面的结构后发现,网页中图片的连接是存储在了src2这个伪属性中
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from lxml import etree

if __name__ == "__main__":
    url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # Fetch the page text.
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text

    # Parse the page (collect the image cards).
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')

    # Extract each image's URL and name.
    for div in div_list:
        # The real link is lazy-loaded: it lives in the src2 pseudo-attribute,
        # not in src (fixed: the quote was misplaced — './/img/@src'2 is a SyntaxError).
        image_url = div.xpath('.//img/@src2')
        image_name = div.xpath('.//img/@alt')
        print(image_url)   # image link
        print(image_name)  # image name
二. 利用selenium 模拟浏览器滑动到底部, 加载数据.
class ProductSpider(scrapy.Spider):
    """Scrape a 1688 product page whose images are lazy-loaded on scroll.

    Drives a real Chrome browser via Selenium and scrolls to the bottom of
    the page until its height stops growing, so every lazy-loaded image has
    a real ``src`` attribute before it is read.
    """

    name = "Product1688"
    # Fixed: the original line was garbled as `start_urls = []scrapy`
    # (a fused extraction artifact), which is a SyntaxError.
    start_urls = []

    def __init__(self, **kwargs):
        # chromedriver download: https://sites.google.com/a/chromium.org/chromedriver/
        super().__init__(**kwargs)
        self.driver = webdriver.Chrome('/path/to/your/chromedriver')
        self.wait = WebDriverWait(self.driver, 10)

    def parse(self, response):
        """Open the product page, trigger lazy loading, and yield a ProductItem."""
        self.driver.get(response.url)
        # Scroll to the page bottom so all lazy images get loaded.
        self.scroll_until_loaded()
        # Product title.
        title = self.driver.find_element_by_xpath('//*[@id="mod-detail-title"]/h1')
        # Main (gallery) images.
        main_images_elements = self.driver.find_elements_by_xpath('//*[@id="dt-tab"]/div/ul/li/div/a/img')
        # Detail-section images.
        detail_images_elements = \
            self.driver.find_elements_by_xpath('//*[@id="desc-lazyload-container"]/p/span/strong/img')
        item = ProductItem()
        main_images = []
        detail_images = []
        # Main images: strip the "60x60." thumbnail marker to get the full-size URL.
        for image in main_images_elements:
            main_images.append(image.get_attribute('src').replace('60x60.', ''))
        # Detail images are used as-is.
        for image in detail_images_elements:
            detail_images.append(image.get_attribute('src'))
        item['title'] = title.text
        item['main_image_count'] = len(main_images)
        item['image_urls'] = main_images + detail_images
        return item

    def scroll_until_loaded(self):
        """Simulate scrolling to the bottom until the page height stops growing."""
        check_height = self.driver.execute_script("return document.body.scrollHeight;")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                # Wait (up to the WebDriverWait timeout) for new content to
                # extend the page; a timeout means nothing more loaded.
                self.wait.until(
                    lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
                check_height = self.driver.execute_script("return document.body.scrollHeight;")
            except TimeoutException:
                break