This article is taking part in the "Python Theme Month" event; see the event link for details.
What is a crawler? A web crawler (also known as a web spider or web robot, and in the FOAF community more often called a web chaser) is a program or script that automatically fetches information from the World Wide Web according to certain rules. Other, less common names include ant, auto-indexer, emulator, and worm. See the Baidu Baike entry for details.

With the arrival of the big data era, the demand for data keeps growing, and crawlers are an excellent means of collecting data automatically. I also recommend this walkthrough of a Python web crawler learning path.
Here are a few Python learning links:

1. 请叫我汪海's CSDN blog
2. 廖雪峰's tutorial and its video version
3. A beginner's introduction to crawlers
4. The crawler framework Scrapy

Download Python from the official site.

Those who came before planted the trees; those who come after enjoy the shade. Thanks!
Install the dependencies with `pip install`. Note that `urllib` is part of the Python standard library; what the script actually needs from PyPI is BeautifulSoup and the lxml parser:

```bash
pip install beautifulsoup4 lxml
```
In my case, pip prompted me to upgrade itself first: `python -m pip install --upgrade pip`.
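Before launching the full download loop, it can help to confirm that urllib and an unverified SSL context can reach the site at all. This is an optional sanity-check sketch, mirroring the kind of request the script makes (the shortened User-Agent here is a placeholder; the script itself uses a full Chrome UA string):

```python
# Optional one-off connectivity check, mirroring the requests the script makes.
import ssl
import urllib.request

url = "https://www.mzitu.com"               # same site the script targets
headers = {"user-agent": "Mozilla/5.0"}     # placeholder UA for the sketch
context = ssl._create_unverified_context()  # skip certificate verification, as the script does

req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req, context=context) as resp:
    print(resp.status)  # 200 means the page is reachable
```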
```bash
python main.py
```
Run it, and the images download successfully into `../images/<serial_id>/`, each file named after the last segment of its image URL. Try it yourself; you may be in for a surprise, or a shock (practice makes perfect!).
```python
# -*- coding:utf-8 -*-
import os
import random
import ssl
import time
import urllib.request

from bs4 import BeautifulSoup

# Request header configuration
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
# Base URL of the site to crawl
BASE_URL = "https://www.mzitu.com"
# Directory where downloaded images are saved
BASE_DIR = "../images"


def start_work(serial_id):
    # Create the target directory for this gallery, then download every page
    picture_dir = BASE_DIR + os.sep + serial_id
    if not os.path.exists(picture_dir):
        os.makedirs(picture_dir)
    page_count = get_page_count(serial_id)
    print("%s has %d images in total" % (serial_id, page_count))
    get_image_for_serial(picture_dir, serial_id, page_count)


# Get the number of pages in a gallery
def get_page_count(serial_id):
    header = {"user-agent": USER_AGENT}
    # Skip certificate verification; acceptable for a toy script, not for production
    context = ssl._create_unverified_context()
    url = "%s/%s" % (BASE_URL, serial_id)
    req = urllib.request.Request(url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    str_content = content.decode("utf-8")
    total_count = __get_counts(str_content)
    return total_count


# Parse the page count out of the pagination bar
def __get_counts(html_content):
    page_count = 0
    soup = BeautifulSoup(html_content, 'lxml')
    data = soup.select("body > div.main > div.content > div.pagenavi > a > span")
    # The last <span> is the "next page" link, so the total page count
    # sits in the second-to-last one
    if data and len(data) >= 3:
        page_count = int(data[-2].get_text())
    return page_count


# Extract the image URL from a detail page
def get_image_url(html_content):
    soup = BeautifulSoup(html_content, 'lxml')
    data = soup.select("body > div.main > div.content > div.main-image > p > a > img")
    url = None
    try:
        url = data[0].get("src")
    except Exception as ex:
        print("exception occurred: %s" % ex)
    return url


# Collect the image URLs of every page in a gallery
def get_all_image_urls(serial_id, page_count):
    url_list = list()
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    if page_count <= 1:
        return url_list
    for x in range(1, page_count + 1):
        print("Fetching the address of image %d" % x)
        url = "%s/%s/%s" % (BASE_URL, serial_id, x)
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req, context=context)
        content = resp.read()
        str_content = content.decode("utf-8")
        img_url = get_image_url(str_content)
        if img_url:
            url_list.append(img_url)
            print("The address of image %d is: %s" % (x, img_url))
        # Random pause between requests to avoid hammering the server
        time.sleep(random.randint(1, 2))
    return url_list


# Download every image in a gallery
def get_image_for_serial(dir_path, serial_id, total_count):
    for i in range(1, total_count + 1):
        print("Start fetching image %d" % i)
        get_image_for_index(dir_path, serial_id, i)
        sleep_seconds = random.randint(1, 10) / 10
        time.sleep(sleep_seconds)


# Download the image on one specific page
def get_image_for_index(dir_path, serial_id, page_index):
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    print("Fetching the address of image %d" % page_index)
    ref_url = "%s/%s/%s" % (BASE_URL, serial_id, page_index)
    req = urllib.request.Request(ref_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    str_content = content.decode("utf-8")
    img_url = get_image_url(str_content)
    if img_url:
        print("The address of image %d is: %s" % (page_index, img_url))
        print("Trying to save image %s" % img_url)
        save_img(dir_path, img_url, ref_url)


# Save a batch of images
def save_imgs(dir_path, img_urls, ref_url):
    for img_addr in img_urls:
        save_img(dir_path, img_addr, ref_url)


# Save one image to disk
def save_img(dir_path, img_url, ref_url):
    # The site appears to check the Referer header (hotlink protection),
    # so pass the detail page URL along with the request
    header = {
        "user-agent": USER_AGENT,
        "Referer": ref_url
    }
    context = ssl._create_unverified_context()
    req = urllib.request.Request(img_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    # Name the file after the last segment of the image URL
    with open(dir_path + os.sep + img_url.split('/')[-1], 'wb') as f:
        f.write(content)
    print("Saved to directory %s: %s" % (dir_path, img_url.split('/')[-1]))
    time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    vol_list = ["204061"]
    for serial_id in vol_list:
        start_work(serial_id)
```
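To see what the pagination parsing in `__get_counts` actually does, here is a small self-contained sketch. The HTML snippet is made up for illustration (the real page markup may differ), but the selector and indexing are the ones the script uses: the last `<span>` in the pagination bar is the "next page" link, so the total page count sits in the second-to-last one.

```python
# Stand-alone sketch of the page-count parsing; the HTML is fabricated for illustration.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<body><div class="main"><div class="content">
  <div class="pagenavi">
    <a><span>1</span></a>
    <a><span>2</span></a>
    <a><span>47</span></a>
    <a><span>next</span></a>
  </div>
</div></div></body>
"""

soup = BeautifulSoup(SAMPLE_HTML, "lxml")
spans = soup.select("body > div.main > div.content > div.pagenavi > a > span")
print(int(spans[-2].get_text()))  # prints 47, the total page count
```

To download more than one gallery, just add more serial IDs to `vol_list` in the `__main__` block.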
So, have I learned Python? Not at all!! I have only managed to install Python and get one example running, which counts as a first hands-on crawler exercise. All I actually know so far are a few basic types and functions.