最近开始学习爬虫,因而找了几个简单的小项目练习。
本篇文章主要是对今日头条图集的爬取。
参考资料:http://blog.csdn.net/gx864102252/article/details/73479803
因为头条源码进行了改动,导致原教程的代码无法正常爬取,在此进行了改进,使爬虫能够正常运行。
下面是源代码:
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup


def get_one_html(kd, offset):
    """Fetch one page of Toutiao image-gallery search results.

    kd:     search keyword.
    offset: pagination offset (multiple of 20).
    Returns the response body (JSON text) or None on any request error.
    """
    # These are URL query parameters, not HTTP headers; the User-Agent
    # belongs in the real request headers instead.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': kd,
        'cur_tab': 3,
        'autoload': 'true'
    }
    # urlencode turns the dict into a query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        r.encoding = 'utf-8'
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Catch only request-related failures, not every exception.
        print('HTML ERROR')
        return None


def get_info(html):
    """Yield the article URL of each gallery in one result page.

    Accepts None (the upstream fetch may have failed) and yields nothing
    in that case instead of crashing in json.loads.
    """
    if not html:
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data['data']:
            # Generator: hand out one gallery URL at a time.
            yield item.get('article_url')


def get_detail(url):
    """Fetch a single gallery page; return its HTML text or None on error."""
    try:
        r = requests.get(url, timeout=30)
        r.encoding = 'utf-8'
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        print('HTML_Detail ERROR')
        return None


def parse_detail(html, url):
    """Extract the title and image URLs from a gallery page, download each image.

    Returns {'title', 'groups_url', 'images'} for the gallery, or None when
    the page does not embed gallery JSON.
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].string
    pat = re.compile(r'gallery: JSON.parse\((.*?)\)', re.S)
    result = re.search(pat, html)
    if result:
        # The captured group is a JSON string literal whose *content* is
        # itself JSON, so decode twice with json.loads.  Never eval() text
        # scraped from a web page: it executes arbitrary code, and it
        # would also fail on JSON's true/false/null literals.
        data = json.loads(json.loads(result.group(1)))
        if data and 'sub_images' in data:
            sub_images = data['sub_images']
            images = [item.get('url').replace('\\', '') for item in sub_images]
            for image in images:
                download_images(image)
            return {
                'title': title,
                'groups_url': url,
                'images': images
            }
    return None


def download_images(url):
    """Download one image and hand its bytes to save_images()."""
    print("Downloading: ", url)
    try:
        r = requests.get(url, timeout=30)
        if r.status_code == 200:
            save_images(r.content)
    except requests.RequestException:
        # The request itself failed, so no response object exists here;
        # the old handler's r.status_code raised UnboundLocalError.
        print("Dont Get images")


def save_images(content):
    """Write image bytes into DIR, named by the MD5 of the content.

    Hash-based names deduplicate identical images for free.
    """
    file_path = '{0}/{1}.{2}'.format(DIR, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # 'with' closes the file automatically; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one result page: list its galleries, then parse each one."""
    html = get_one_html(KEYWORD, offset)
    # Drain the generator of gallery URLs.
    for url in get_info(html):
        detail_html = get_detail(url)
        if detail_html:
            parse_detail(detail_html, url)


if __name__ == "__main__":
    # Module-level settings shared by the worker processes (inherited on
    # fork; note they would not exist in spawned workers on Windows).
    KEYWORD = input("请输入搜索关键词:")
    DIR = os.path.join(os.getcwd(), KEYWORD)
    # exist_ok avoids crashing when the keyword folder already exists.
    os.makedirs(DIR, exist_ok=True)
    # Pagination control: pages START..END, 20 items per page.
    START = 0
    END = 5
    offsets = [i * 20 for i in range(START, END + 1)]
    # Fan the offsets out over a process pool.
    pool = Pool()
    pool.map(main, offsets)
说明:加入全局变量DIR是为了在爬取以前生成以关键词命名的目录,便于分类。