爬虫 day6
关于 <https://www.aqistudy.cn/html/city_detail.html> 中国空气质量在线监测分析平台
js加密和混淆的解析
刚开始数据没有重新加载,是因为当前日期的数据已经加载完了,换一个日期即可。
可以看到数据是动态加载的
可是数据加密了
是post请求,并且请求的数据也加密了
可能是谷歌浏览器看不到js的事件监听,因此换成比较好用的火狐浏览器
事件监听
getData请求数据
找到element(谷歌浏览器)(定义处) 能够看到执行了getAQI和getweather
/**
 * Requests the weather series for the currently selected city/period and
 * refills the chart arrays `dataTemp`, `dataHumi` and `dataWind`.
 *
 * Reads the page-level variables `city`, `type`, `startTime` and `endTime`,
 * and relies on the page-level helpers `getServerData`, `converTimeFormat`,
 * `getWindDirectionUrl` and `showCurrentTab` (none of them defined here).
 *
 * When the response reports at least one row, the three arrays are emptied in
 * place and refilled, the shared `state` counter is incremented, and
 * `showCurrentTab()` runs once `state` reaches 2.
 * NOTE(review): `state >= 2` presumably means "both the AQI and the weather
 * request have finished" — confirm against the sibling loader.
 *
 * Unlike the original, the response payload and the loop counter are local:
 * the original leaked them as the implicit globals `data` and `i`, which any
 * other callback could overwrite (and which throw in strict mode).
 */
function getWeatherData() {
  const method = 'GETCITYWEATHER';
  const param = { city, type, startTime, endTime };

  getServerData(
    method,
    param,
    function (obj) {
      const { data } = obj;
      if (data.total > 0) {
        // Empty the arrays in place so existing references to them stay valid.
        dataTemp.splice(0, dataTemp.length);
        dataHumi.splice(0, dataHumi.length);
        dataWind.splice(0, dataWind.length);

        for (const row of data.rows) {
          // One timestamp per row, shared by the three series.
          const x = converTimeFormat(row.time).getTime();
          dataTemp.push({ x, y: parseInt(row.temp, 10) });
          dataHumi.push({ x, y: parseInt(row.humi, 10) });
          dataWind.push({
            x,
            y: parseInt(row.wse, 10),
            d: row.wd,
            w: row.tq,
            marker: { symbol: getWindDirectionUrl(row.wd) },
          });
        }

        state++;
        if (state >= 2) {
          showCurrentTab();
        }
      }
    },
    // Fourth argument passed through unchanged; its meaning is defined by getServerData.
    0.5,
  );
}
以及在搜索里找到getWeatherData,其中调用getServerData时传了4个参数:method, param, function(obj), 0.5
中间的是function函数参数;getServerData 后面以 0.5); 结尾,说明这里是在调用函数
去找getServerData函数定义处
又是乱码...加密了,不对,是混淆了,
到解密平台去解密一下: <http://www.bm8.com.cn/jsConfusion/>
object 由 city、type、startTime、endTime 等组成,然后进行md5加密
url:对那个php文件发送请求,得到动态的数据
自己写一个假的函数去调用,得到param请求参数
/**
 * Packs the query fields into a parameter object and hands it, together with
 * the API method name, to `getParam` (defined elsewhere in the site script).
 *
 * @param {string} method - API method name, e.g. 'GETDETAIL'.
 * @param {string} city - City name.
 * @param {string} type - Granularity, e.g. 'HOUR'.
 * @param {string} startTime - Start of the period.
 * @param {string} endTime - End of the period.
 * @returns {*} Whatever `getParam(method, param)` returns.
 */
function getPostParamCode(method, city, type, startTime, endTime) {
  // Keys are listed in the same order the fields were originally assigned.
  const query = { city, type, startTime, endTime };
  return getParam(method, query);
}
然后通过这个参数发送post请求,爬取内容
代码如下(将字符串转换成js代码执行):
import requests
# Additional dependency: PyExecJS, used to run the site's JavaScript from Python.
import execjs

node = execjs.get()

# Query parameters
method = 'GETDETAIL'
city = '北京'
type = 'HOUR'
start_time = '2018-01-25 00:00:00'
end_time = '2018-01-25 23:00:00'

# Compile the JavaScript file; the `with` block closes the file handle
# (the original left it open).
file = 'jsCode.js'
with open(file, encoding='utf-8') as js_file:
    ctx = node.compile(js_file.read())

# Build the POST parameter. Arguments are passed through ctx.call() rather
# than formatted into a JavaScript source string, so quotes or backslashes in
# a value cannot break (or inject into) the evaluated code.
params = ctx.call('getPostParamCode', method, city, type, start_time, end_time)

url = 'https://www.aqistudy.cn/apinew/aqistudyapi.php'
data = {
    'd': params
}
# A timeout keeps the script from hanging forever if the server never answers.
page_text = requests.post(url, data=data, timeout=10).text

# Decode the response with the site's own decodeData function. The response
# body is untrusted text, so it is passed as an argument and never spliced
# into JavaScript source.
decrypted_data = ctx.call('decodeData', page_text)
print(decrypted_data)
然后再通过decodeData解密,获取最终数据
params
tdgHOYxwKdDSgYXe+RLPzYCgLvrddahasI5XXklB4gVLYqab+XRPpMD/oSqnJ/aEmFwzVEUhLnPzRy03+X1BIzLvxQKwu4A3YsqR3OemYgNnHqPdBwvJlbxia99YeK+xhYnh+pXoudhbw1bJHi/H1n7o0PGXMb60NrW7f/Yd0Y+H4hNSDHVYyZnBsxJh6kkarSTzqNharSCvztTU3b95na/jKrVddatUdH5CVexOuKjxjdT0C1swsJBH7bdn3Sga7wXZ20GcktH39BwkMaScAudbM3yYSgDrJkCmV4i6ZZlU54+aR4MY7r7J9IpW1TSy93gC24xTvjiaa5Apo2c77/b7gcIiTvSc14c2AnLDI5oOfgIl4J2hRMFfqr4g4Lfuq1cRlOQg5c5uZrQjyIsIFicregIDGNu4fluOdSLC+Pg+OQDMIlqLzHtwgZ2MW0HuoL8o/copcJu1ClHTCk0y+g==
decrypted_data
{"success":true,"errcode":0,"errmsg":"success","result":{"success":true,"data":{"total":24,"rows":[{"time":"2018-01-25 00:00:00","aqi":"43","pm2_5":"29","pm10":"43","co":"0.7","no2":"56","o3":"23","so2":"6","rank":null},{"time":"2018-01-25 01:00:00","aqi":"25","pm2_5":"15","pm10":"25","co":"0.6","no2":"45","o3":"34","so2":"6","rank":null},{"time":"2018-01-25 02:00:00","aqi":"25","pm2_5":"9","pm10":"25","co":"0.5","no2":"38","o3":"39","so2":"5","rank":null},{"time":"2018-01-25 03:00:00","aqi":"22","pm2_5":"9","pm10":"22","co":"0.5","no2":"40","o3":"37","so2":"5","rank":null},{"time":"2018-01-25 04:00:00","aqi":"16","pm2_5":"8","pm10":"15","co":"0.5","no2":"32","o3":"45","so2":"4","rank":null},{"time":"2018-01-25 05:00:00","aqi":"16","pm2_5":"8","pm10":"14","co":"0.4","no2":"25","o3":"51","so2":"4","rank":null},{"time":"2018-01-25 06:00:00","aqi":"17","pm2_5":"7","pm10":"15","co":"0.4","no2":"24","o3":"53","so2":"4","rank":null},{"time":"2018-01-25 07:00:00","aqi":"18","pm2_5":"5","pm10":"18","co":"0.4","no2":"26","o3":"52","so2":"3","rank":null},{"time":"2018-01-25 08:00:00","aqi":"19","pm2_5":"5","pm10":"19","co":"0.4","no2":"27","o3":"51","so2":"4","rank":null},{"time":"2018-01-25 09:00:00","aqi":"20","pm2_5":"6","pm10":"20","co":"0.5","no2":"28","o3":"50","so2":"4","rank":null},{"time":"2018-01-25 10:00:00","aqi":"21","pm2_5":"6","pm10":"21","co":"0.4","no2":"22","o3":"58","so2":"4","rank":null},{"time":"2018-01-25 11:00:00","aqi":"27","pm2_5":"9","pm10":"27","co":"0.4","no2":"17","o3":"63","so2":"5","rank":null},{"time":"2018-01-25 12:00:00","aqi":"25","pm2_5":"9","pm10":"25","co":"0.4","no2":"15","o3":"66","so2":"4","rank":null},{"time":"2018-01-25 13:00:00","aqi":"22","pm2_5":"9","pm10":"21","co":"0.4","no2":"14","o3":"68","so2":"4","rank":null},{"time":"2018-01-25 14:00:00","aqi":"23","pm2_5":"6","pm10":"18","co":"0.3","no2":"13","o3":"71","so2":"4","rank":null},{"time":"2018-01-25 
15:00:00","aqi":"23","pm2_5":"7","pm10":"17","co":"0.3","no2":"13","o3":"71","so2":"4","rank":null},{"time":"2018-01-25 16:00:00","aqi":"23","pm2_5":"7","pm10":"19","co":"0.4","no2":"14","o3":"71","so2":"4","rank":null},{"time":"2018-01-25 17:00:00","aqi":"22","pm2_5":"8","pm10":"19","co":"0.3","no2":"17","o3":"68","so2":"4","rank":null},{"time":"2018-01-25 18:00:00","aqi":"20","pm2_5":"6","pm10":"20","co":"0.4","no2":"23","o3":"62","so2":"3","rank":null},{"time":"2018-01-25 19:00:00","aqi":"24","pm2_5":"7","pm10":"24","co":"0.4","no2":"29","o3":"54","so2":"4","rank":null},{"time":"2018-01-25 20:00:00","aqi":"23","pm2_5":"8","pm10":"23","co":"0.4","no2":"31","o3":"48","so2":"4","rank":null},{"time":"2018-01-25 21:00:00","aqi":"25","pm2_5":"11","pm10":"25","co":"0.5","no2":"39","o3":"39","so2":"4","rank":null},{"time":"2018-01-25 22:00:00","aqi":"26","pm2_5":"12","pm10":"26","co":"0.5","no2":"45","o3":"32","so2":"5","rank":null},{"time":"2018-01-25 23:00:00","aqi":"31","pm2_5":"13","pm10":"31","co":"0.6","no2":"50","o3":"25","so2":"5","rank":null}]}}}
引擎根据数据流判断哪一个执行了 ! 触发对应的事务!
twisted 支持异步 ,下载器下载
调度器, 过滤器(其实是一个类,set等)去除重复的URL
管道进行存储 到数据库或者是文本
spider 爬取
趁热吃,凉了就不好吃了。——家长名言
做简单的
基于scrapy的 爬取 与存储到文本
item (写字段,传输的item)
first (写爬虫)
setting.py (写配置)
pipeline(写存储) #
出错了许多:
1. open_spider()、close_spider() 这两个方法名是固定的,分别在爬虫开始和结束时执行,不能写成其他名字;管道还要在setting里配置。 2. 写文件的 fp.write() 应该放在 process_item 里。 3. 在 first 的 for 循环里给 item 赋值时,要在循环内部实例化 item 类,不能在 for 循环外面实例化,否则可能会出错;逐个实例化效率低一些,但是每一条数据都能传给管道,在 pipeline 里用同样的方法(item['字段'])取值,也挺方便。 4. item 的类里要先写上字段(属性),才可以传值、往属性里加内容,不然会报错。
setting.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36' # Obey robots.txt rules ROBOTSTXT_OBEY = False LOG_LEVEL = 'ERROR' ITEM_PIPELINES = { 'BoosPro.pipelines.BoosproPipeline': 300, }
item.py
import scrapy


class BoosproItem(scrapy.Item):
    """Item declaring the two fields the spider fills: ``title_text`` and ``socre_text``."""

    title_text = scrapy.Field()
    # NOTE(review): "socre" looks like a typo of "score"; the spider and the
    # pipeline use the same spelling, so it must be renamed everywhere or not at all.
    socre_text = scrapy.Field()
爬虫对应的py
# -*- coding: utf-8 -*-
import scrapy
from BoosPro.items import BoosproItem


# Originally written for Boss Zhipin, which blocked the crawler; this spider is
# the same as the Douban one further below.
class BoosSpider(scrapy.Spider):
    """Scrapes the first Douban Top 250 page and yields one item holding two lists."""

    name = 'douban'  # Boss Zhipin blocked us, so the target was switched to Douban
    start_urls = ['https://movie.douban.com/top250']
    url_next = 'https://movie.douban.com/top250?start=25&filter='

    def parse(self, response):
        """Extract all titles and scores from the list page.

        Yields a single ``BoosproItem`` whose ``title_text`` and ``socre_text``
        fields are the complete lists returned by the two XPath queries.
        """
        titles_xpath = '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()'
        scores_xpath = '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[2]/text()'

        titles = response.xpath(titles_xpath).extract()
        scores = response.xpath(scores_xpath).extract()
        print(titles, scores)

        # The fields hold whole lists; the pipeline iterates over them.
        yield BoosproItem(title_text=titles, socre_text=scores)
有个错误: # 第一个错误:没写字段,NightprItem 里没有 job_text 字段;第二个错误:下面这种写法可以运行,但是后面的值会覆盖前面的,最后只剩 {'job_text': 'Python高级工程师/专家', 'salary_text': '30-50K'} # for index, job in enumerate(job_text): # item['job_text'] = job # item['salary_text'] = salary_text[index]
pipeline 存储到文件里
import pymysql
from redis import Redis


class BoosproPipeline(object):
    """Collects ``title -> ': ' + score`` pairs and writes them to ``book.text``.

    The mapping is written once, as ``str(dict)``, when the spider closes.
    """

    def open_spider(self, spider):
        """Open the output file and start with an empty mapping."""
        self.fp = open('book.text', 'w', encoding='utf-8')
        # Created here rather than in process_item so that close_spider always
        # has a mapping to write (even if no item arrives) and so that earlier
        # items are not discarded when a later one is processed.
        self.title = {}

    def process_item(self, item, spider):
        """Record every title/score pair carried by ``item`` and return the item.

        ``item['title_text']`` and ``item['socre_text']`` are parallel lists.
        """
        print('*****************************************************', item['title_text'])
        title_text = item['title_text']
        socre_text = item['socre_text']

        for title in title_text:
            print(title.replace('\n', ''))

        # zip() stops at the shorter list instead of raising IndexError when a
        # score is missing.
        for title, score in zip(title_text, socre_text):
            self.title[title] = ': ' + score
        return item

    def close_spider(self, spider):
        """Write the collected mapping and close the file."""
        try:
            self.fp.write(str(self.title))
        finally:
            # The original never closed the handle.
            self.fp.close()
items
# Item field declarations (the enclosing item class header is not shown in this excerpt).
title_text = scrapy.Field()
socre_text = scrapy.Field()
# Assigned in the spider's detail_parse callback.
title_desc = scrapy.Field()
爬虫.py
# -*- coding: utf-8 -*-
import scrapy

# Handles writing to the file, crawling two list pages, and manually requesting
# each detail page to scrape its description text (deep crawl).
from doubanPro.items import DoubanproItem


class DianyingSpider(scrapy.Spider):
    """Crawls Douban Top 250 list pages and each movie's detail page.

    One ``DoubanproItem`` is created per movie in ``parse`` and completed with
    ``title_desc`` in ``detail_parse``.
    """

    name = 'dianying'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://movie.douban.com/top250']
    url_next = 'https://movie.douban.com/top250?start=25&filter='
    page = 1

    def parse(self, response):
        """Yield one detail-page request per movie, then one request for the next list page."""
        title_text = response.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()').extract()
        socre_text = response.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[2]/text()').extract()
        detail_urls = response.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href').extract()

        # zip() stops at the shortest list, so a missing score or link can no
        # longer raise IndexError halfway through the page.
        for text, score, detail_url in zip(title_text, socre_text, detail_urls):
            # A fresh item per movie: a single item created outside the loop
            # would be shared by every pending request and overwritten.
            item = DoubanproItem()
            item['title_text'] = text
            item['socre_text'] = score
            print('数据', item['title_text'], item['socre_text'])
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'item': item})

        # Follow the second list page exactly once: after the increment the
        # condition is false for every later call.
        if self.page < 2:
            self.page += 2
            yield scrapy.Request(url=self.url_next, callback=self.parse)

    # Deep crawl of the detail page.
    def detail_parse(self, response):
        """Attach the description text to the item passed via ``meta`` and yield it."""
        item = response.meta['item']
        # extract() always returns a list.
        title_desc = response.xpath('//*[@id="link-report"]//text()').extract()
        # The third text node is used, as before. Fall back to an empty string
        # when the page has fewer nodes (e.g. a block page) so the item is
        # still delivered instead of being lost to an IndexError.
        item['title_desc'] = title_desc[2] if len(title_desc) > 2 else ''
        yield item
先走 start_urls 里的地址,进入 parse(self, response),然后用 xpath 解析,给 item 添值,再 yield item 返回;本质上走的是 scrapy.Request(url, callback=parse) # 不用写header
或者
在 parse 里循环几个URL,用 if 判断;因为是递归,所以效果好像 for 循环一样
yield scrapy.Request(url = deurl , callback= parse) # 递归,
深度爬取,
在 parse 里给 item 添值并用 if 判断翻页,在手动发起的 url 请求中添加深度爬取:爬取标签点开后详情页里面的内容
def parse(): for ... in response.xpath().extract(): item[''] = ...; detail_url = 解析[index](或者是别的标签,或者需要用到管道符); yield scrapy.Request(url=url, callback=深层的函数, meta={'item': item}) # 返回给解析的函数,因为 callback 会执行,然后 callback 把新解析的属性和已经解析的属性拼接起来,再 yield item 给 pipeline; if page < 5: page += 1; yield scrapy.Request(url=page, callback=parse) # 递归执行,中间又会去执行深度解析 def 深度解析(): xpath = ...; yield item
出的错误
# 解决了只有一个键值对文件中 的问题吗 上面的东西解决了! # print(title_text, socre_text) # item = DoubanproItem() # 每次都执行 ? 放在外面能够不 不能够!
pipeline 存储
class DoubanproPipeline(object):
    """Appends one ``"<title> <score>:<description>"`` line per item to ``book.text``."""

    datas = []

    def open_spider(self, spider):
        """Announce the start and open the output file for writing."""
        print('开始爬虫')
        self.fp = open('book.text', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one line for ``item`` and return it unchanged.

        This runs once per item, so the per-item write belongs here rather
        than in ``close_spider``.
        """
        print('***********3333333333333333****', item)
        pieces = [
            item['title_text'],
            ' ',
            item['socre_text'],
            ':',
            item['title_desc'],
            '\n',
        ]
        self.fp.write(''.join(pieces))
        return item

    def close_spider(self, spider):
        """Close the output file (runs once; nothing is written here)."""
        self.fp.close()
        print('结束爬虫!')
setting.py
# Enabled item pipelines mapped to their order value. Both the MySQL and the
# Redis pipeline must be listed here to run.
# NOTE(review): presumably the lower value (300) runs before the higher (301),
# per Scrapy convention — confirm.
ITEM_PIPELINES = {
    'DatabasePro.pipelines.DatabaseproPipeline': 300,
    'DatabasePro.pipelines.RedisPipeline': 301,
}
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
from redis import Redis


class DatabaseproPipeline(object):
    """Inserts each item's title and score into the MySQL table ``douban``."""

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Connect to MySQL and create the cursor reused for every insert."""
        # The port must be an int (3306), not a string.
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='spider', charset='utf8')
        # One cursor for the whole crawl; the original opened a new cursor per
        # item and only ever closed the last one.
        self.cursor = self.conn.cursor()
        print('conn', self.conn)

    def process_item(self, item, spider):
        """Insert one row for ``item``; roll back and print the error on failure.

        ``spider`` gives access to the spider instance that produced the item.
        """
        # Parameterized query: the driver escapes the values, so scraped text
        # containing quotes can neither break nor inject into the statement.
        sql = 'insert into douban values (%s, %s)'
        try:
            self.cursor.execute(sql, (item['title_text'], item['socre_text']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection (runs once; nothing is written here)."""
        # Guarded so a failed open_spider does not raise AttributeError on None.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
        print('结束爬虫!')


class RedisPipeline(object):
    """Pushes each item's title and score onto the Redis list ``movie``."""

    conn = None

    def open_spider(self, spider):
        """Connect to Redis; the port must be an int (6379)."""
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(spider.start_urls)  # shows that spider attributes are reachable from the pipeline

    def process_item(self, item, spider):
        """Serialize the item to JSON, push it onto ``movie`` and return the item."""
        print('item',item['title_text'],item['socre_text'],'(((((((((((((((((((((((((((')
        dic = {
            'title': item['title_text'],
            'socre': item['socre_text']
        }
        # redis-py 3.x only accepts bytes/str/int/float values and raises
        # DataError for a dict, so the record is stored as a JSON string.
        self.conn.lpush('movie', json.dumps(dic, ensure_ascii=False))
        return item
犯的错误:
1 端口号要写对(是数字,不是字符串):mysql 3306、redis 6379、mongodb 27017 2 没在设置里开第二个管道
import json

from redis import Redis

# Recap of the Redis calls used inside the pipeline (`self` and `dic` come from
# the surrounding pipeline method).
self.conn = Redis(host='127.0.0.1', port=6379)
# redis-py 3.x rejects dict values (DataError), so push a JSON string instead.
self.conn.lpush('movie', json.dumps(dic, ensure_ascii=False))
sql语句
import pymysql

# Recap of the MySQL calls used inside the pipeline (`self` and `item` come
# from the surrounding pipeline method).
self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='spider', charset='utf8')
self.cursor = self.conn.cursor()
# Parameterized query: values are escaped by the driver instead of being
# formatted into the SQL text, so quotes in scraped data cannot break or
# inject into the statement.
sql = 'insert into douban values (%s, %s)'
try:
    self.cursor.execute(sql, (item['title_text'], item['socre_text']))
    self.conn.commit()
except Exception as e:
    print(e)
    self.conn.rollback()