First, the link, so you can see whether this is to your taste:

Industrial control system vulnerabilities: http://ics.cnvd.org.cn/
As you can see, the page is static HTML, so the problem becomes quite simple:
all we need to do is fetch each page with requests.
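
As a quick sanity check (a minimal sketch of my own, not part of the original script), fetching the first listing page with requests and pulling the detail-page links straight out of the raw HTML shows that no JavaScript rendering is involved. The URL and XPath here mirror the ones used in the full script below:

```python
# Minimal sketch: confirm the listing page is plain static HTML.
# Assumes the same table structure (tbody id="tr") that the full
# script below relies on.
import requests
from lxml import etree

url = 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

html = etree.HTML(requests.get(url, headers=headers).text)
# Each table row links to one vulnerability detail page.
for row in html.xpath('//tbody[@id="tr"]/tr'):
    print(row.xpath('td/a/@href')[0])
```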
Without further ado, here's the code:
```python
import time
from urllib.parse import urlencode

import pymysql  # unused here; see the MySQL sketch at the end
import requests
import xlrd
import xlwt
from lxml import etree


def makeurl():
    # Listing pages look like:
    # http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    baseurl = 'http://ics.cnvd.org.cn/?'
    params = {
        'tdsourcetag': 's_pctim_aiomsg',
        'max': '20',
    }
    for page in range(MAX_PAGE):  # MAX_PAGE is set in __main__
        params['offset'] = page * 20
        url = baseurl + urlencode(params)
        print('url is ', url)
        yield url


def get_page_urllist(url):
    """Fetch one listing page."""
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_urllist(content):
    """Yield the detail-page URL of every vulnerability on a listing page."""
    html = etree.HTML(content)
    for li in html.xpath('//tbody[@id="tr"]/tr'):
        yield li.xpath('td/a/@href')[0]


def get_page(url):
    """Fetch one vulnerability detail page."""
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_page(content, url):
    """Extract every field of the vulnerability detail table into a dict."""
    html = etree.HTML(content)
    item = {}
    item['url'] = url
    item['标题'] = str(html.xpath('//div[@class="blkContainerSblk"]/h1/text()')[0])
    item['CNVD_ID'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()')])
    item['公开日期'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="公开日期"]/following-sibling::*[1]//text()')])
    item['危害级别'] = ''.join(
        [i.strip().replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
         for i in html.xpath('//tbody/tr/td[text()="危害级别"]/following-sibling::*[1]//text()')])
    item['影响产品'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="影响产品"]/following-sibling::*[1]//text()')])
    try:
        item['BUGTRAQ_ID'] = ''.join(
            [i.strip() for i in html.xpath('//tbody/tr/td[text()="BUGTRAQ ID"]/following-sibling::*[1]//text()')])
    except Exception:
        item['BUGTRAQ_ID'] = ''
    # CVE ID cell carries both text and a link, so keep both.
    item['CVE_ID'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//text()')]) \
        + ' ' + ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//@href')])
    item['漏洞描述'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞描述"]/following-sibling::*[1]//text()')])
    item['漏洞类型'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞类型"]/following-sibling::*[1]//text()')])
    item['参考连接'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="参考连接"]/following-sibling::*[1]//text()')])
    item['漏洞解决方案'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞解决方案"]/following-sibling::*[1]//text()')])
    # Patch links are site-relative, so prepend the site root.
    item['厂商补丁'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//text()')]) \
        + ' http://www.cnvd.org.cn' + ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//@href')])
    item['验证信息'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="验证信息"]/following-sibling::*[1]//text()')])
    item['报送时间'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="报送时间"]/following-sibling::*[1]//text()')])
    item['收录时间'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="收录时间"]/following-sibling::*[1]//text()')])
    item['更新时间'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="更新时间"]/following-sibling::*[1]//text()')])
    item['漏洞附件'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞附件"]/following-sibling::*[1]//text()')])
    return item


def save_data(index, item, workbook):
    """Write one item into row `index` and save the workbook."""
    sheet = workbook.get_sheet('sheet1')  # reuse the existing sheet
    for col, value in enumerate(item.values()):
        sheet.write(index, col, value)
    workbook.save(filename)
    print('saved successfully')


def excel_prepare(heads):
    """Create a new workbook with a header row."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)  # create the sheet
    for col, value in enumerate(heads):
        sheet.write(0, col, value)
    return workbook


def urlisexist(url, urlset):
    """Return True if this detail page was already scraped."""
    return url in urlset


def getallurl(filename):
    """Read all previously saved URLs (column 0, skipping the header row)."""
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    results = sheet1.col_values(0, 1)
    return results


def read_old(filename):
    """Load every existing row so it can be copied into the new workbook."""
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    alloldset = []
    for index in range(sheet1.nrows):
        alloldset.append(sheet1.row_values(index))
    return alloldset, sheet1.nrows


def save_old(index, olditem):
    """Copy one old row into the freshly created workbook."""
    sheet = workbook.get_sheet('sheet1')
    for col, value in enumerate(olditem):
        sheet.write(index, col, value)
    workbook.save(filename)


if __name__ == '__main__':
    # Listing URL pattern:
    # http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    TIMESLEEP = 0  # seconds to sleep between detail-page requests
    filename = '工程控制系统漏洞.xls'
    MAX_PAGE = 96
    heads = ['url', '标题', 'CNVD_ID', '公开日期', '危害级别', '影响产品',
             'BUGTRAQ_ID', 'CVE_ID', '漏洞描述', '漏洞类型', '参考连接',
             '漏洞解决方案', '厂商补丁', '验证信息', '报送时间', '收录时间',
             '更新时间', '漏洞附件']

    # Resume support: reload rows from a previous run if the file exists.
    try:
        alloldset, length = read_old(filename)
    except Exception:
        alloldset = []
        length = 1

    workbook = excel_prepare(heads)
    for index, olditem in enumerate(alloldset):
        save_old(index, olditem)

    try:
        urlset = getallurl(filename)
    except Exception:
        urlset = []

    index = length
    for urlofpage in makeurl():
        pagelistcontent = get_page_urllist(urlofpage)
        for url in parse_urllist(pagelistcontent):
            print('url is >>>', url)
            if not urlisexist(url, urlset):
                time.sleep(TIMESLEEP)
                result = get_page(url)
                item = parse_page(result, url)
                print('item is >>>', item)
                save_data(index, item, workbook)
                index = index + 1
    workbook.save(filename)
```
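
Excel is fine for a one-off dump, but the script also imports pymysql, which suggests a MySQL backend was on the roadmap. Here is a minimal sketch of what a save_to_mysql replacement for save_data could look like; the credentials, the database name (cnvd), and the table schema (ics_vuln and its columns) are all hypothetical placeholders of mine, not anything from CNVD or the script above:

```python
# Hypothetical sketch: persist a parsed item to MySQL instead of Excel.
# Credentials, database name ('cnvd'), and table schema ('ics_vuln') are
# placeholders -- adapt them to your environment. Create the table first:
#   CREATE TABLE ics_vuln (url VARCHAR(255) PRIMARY KEY,
#                          title TEXT, cnvd_id VARCHAR(32));
import pymysql


def save_to_mysql(item):
    conn = pymysql.connect(host='localhost', user='root', password='secret',
                           database='cnvd', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            # INSERT IGNORE keeps re-runs idempotent, mirroring the
            # urlisexist() dedup check in the Excel version.
            cursor.execute(
                'INSERT IGNORE INTO ics_vuln (url, title, cnvd_id) '
                'VALUES (%s, %s, %s)',
                (item['url'], item['标题'], item['CNVD_ID']))
        conn.commit()
    finally:
        conn.close()
```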
If anything is unclear, ask in the comments below.