Site: http://glidedsky.com

Challenge 1: sum all the numbers on the page

Register and log in, and you can see the HTML right away. Click through to the page to be scraped and it's nothing but numbers. This first challenge really is simple; there isn't much to say.
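Since there isn't much to say, here is roughly what the whole thing comes down to. A minimal sketch, where the cookie value is a placeholder you'd copy from your own logged-in browser session (the full class-based version appears under challenge 2 below):

import requests
from lxml import etree

# The challenge pages require a logged-in session; paste your own cookie here.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Cookie": "glidedsky_session=...",  # placeholder, not a real value
}

res = requests.get("http://glidedsky.com/level/web/crawler-basic-1", headers=headers)
html = etree.HTML(res.text)
# Every number sits in its own <div class="col-md-1"> cell.
print(sum(int(n.strip()) for n in html.xpath('//div[@class="col-md-1"]/text()')))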
Challenge 2: the same task, but requested 1000 times

This one works the same way: the simplest approach is to take the code above and tweak it slightly, but that is far too slow. It's worth optimizing it yourself; adding threads or switching to coroutines both work well (a thread-pool sketch follows the code below). I'd expect coroutines to be a bit faster, though I haven't benchmarked it.

The straight modification, with no threads or coroutines added, takes quite a while to run. It's basic, but how fast it gets depends on how you optimize.

It could be optimized further. I took the lazy route and wrote my version with coroutines:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/8/18 0:33
# @Author : zhao.jia
# @Site :
# @File : glide_test.py
# @Software: PyCharm
import asyncio
import datetime

import aiohttp
import requests
from lxml import etree
from requests.adapters import HTTPAdapter

import tools  # local helper module; a sketch of headers_to_dict follows below


class TestGlidedsky:
    def __init__(self):
        # Raw headers copied from the browser; the glidedsky_session cookie keeps us logged in.
        self.headers = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Cache-Control: max-age=0
        Connection: keep-alive
        Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
        Host: glidedsky.com
        Referer: http://glidedsky.com/login
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
        """
        self.sess = requests.session()
        self.sess.headers = tools.headers_to_dict(self.headers)
        self.sum_count_2 = 0
        # Retry failed connections up to three times.
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    # Challenge 1: one page, sum every number in the col-md-1 cells.
    def basic_one(self):
        sum_count = 0
        res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
        res_html = etree.HTML(res.text)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        for num in nums:
            sum_count += int(num.strip())
        print(f"sum={sum_count}")

    # Challenge 2: same parsing, pages 1..1000 fetched one after another.
    def basic_two(self):
        count = 1
        sum_count = 0
        while True:
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            for num in nums:
                sum_count += int(num.strip())
            count += 1
            if count == 1001:
                break
        print(sum_count)

    # Challenge 2 again, one coroutine per page.
    async def basic_two_2(self, url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
                res_html = etree.HTML(res)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                for num in nums:
                    self.sum_count_2 += int(num.strip())

    # Drive the coroutines in two batches of roughly 500 pages each.
    def sum_async_count(self):
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
            for i in range(1, 500)]
        loop.run_until_complete(asyncio.gather(*tasks))
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
            for i in range(500, 1001)]
        loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)


if __name__ == '__main__':
    # Challenge 2, sequential version:
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)

    # Challenge 2, coroutine version, timed:
    starttime_2 = datetime.datetime.now()
    TestGlidedsky().sum_async_count()
    endtime_2 = datetime.datetime.now()
    count_time_2 = (endtime_2 - starttime_2).seconds
    print(count_time_2)
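A note on the tools import: the post never shows that module, but all the code needs from it is headers_to_dict, which splits a raw header block copied out of the browser's DevTools into a dict. A minimal sketch of what that helper presumably looks like:

# Hypothetical reconstruction of tools.headers_to_dict; the real module isn't shown in the post.
def headers_to_dict(raw_headers: str) -> dict:
    # Turn a "Key: Value" block, one header per line, into a requests-style dict.
    headers = {}
    for line in raw_headers.strip().splitlines():
        line = line.strip()
        if not line:
            continue
        key, _, value = line.partition(":")
        headers[key.strip()] = value.strip()
    return headers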
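As for the thread option mentioned earlier, it isn't in the original code, but a minimal sketch with concurrent.futures could look like this (the cookie is again a placeholder for your own session):

# Hypothetical thread-pool take on challenge 2, not from the original post.
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

HEADERS = {"Cookie": "glidedsky_session=..."}  # placeholder, use your own cookie

def page_sum(page: int) -> int:
    # Fetch one page and return the sum of its numbers.
    res = requests.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={page}",
                       headers=HEADERS, timeout=10)
    html = etree.HTML(res.text)
    return sum(int(n.strip()) for n in html.xpath('//div[@class="col-md-1"]/text()'))

# 20 workers keep the 1000 requests overlapping without hammering the site too hard.
with ThreadPoolExecutor(max_workers=20) as pool:
    print(sum(pool.map(page_sum, range(1, 1001))))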
Challenge 3: still summing numbers

This time, though, IPs get banned: each IP can only make one request, which makes the challenge fairly nasty. The only way through is proxy IPs; free ones will do, as long as you retry persistently enough.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/8/27 11:00
# @Author : Andrew
# @Site :
# @File : python-abu.py
# @Software: PyCharm
import base64
from urllib import request

import requests
from lxml import etree
from requests.adapters import HTTPAdapter

import tools  # same local helper module as in challenge 2


class test:
    def __init__(self):
        self.sess = requests.session()
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def abu_test(self):
        # Abuyun dynamic proxy tunnel
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9020"
        # Tunnel credentials (masked in the original post)
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        # Basic proxy auth, sent as a Proxy-Authorization header.
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    # Download pages 1..1000 through the proxy, saving each one to disk.
    def get_html(self, proxy_dict, proxy_header):
        count = 1
        sum_count = 0
        headers = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Cache-Control: max-age=0
        Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
        Host: glidedsky.com
        Proxy-Connection: keep-alive
        Referer: http://glidedsky.com/login
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
        """
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        while True:
            # if count == 37 or count == 38:
            #     continue
            try:
                res = self.sess.get(f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}",
                                    headers=headers, proxies=proxy_dict, timeout=10)
            except Exception as e:
                print("request failed, retrying")
                print(e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                # Save every successful response so the pages can be re-parsed offline.
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("found numbers")
                    for num in nums:
                        sum_count += int(num.strip())
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count
                # If nums is empty the page was blocked; the loop retries the same page.
                # time.sleep(3)

    # Re-parse the saved pages offline and total them up.
    def parse_html(self):
        count = 1
        sum_count = 0
        while True:
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("page", count, "running total", sum_count)
                if count == 1000:  # page 1000 is the last file get_html saves
                    break
            else:
                print("no numbers in", file_name)
                count += 1  # move on so an empty file can't loop forever; re-download it later
                continue
            count += 1
        print("total", sum_count)


if __name__ == '__main__':
    # Run abu_test() first to download the pages, then parse_html() to total them.
    # test().abu_test()
    test().parse_html()
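The code above relies on Abuyun's paid dynamic tunnel. If you take the free-proxy route suggested earlier, the core pattern stays the same: pick a fresh proxy for each request and retry until the page actually parses. A rough sketch, where the proxy list and cookie are placeholders you'd fill in yourself:

# Hypothetical free-proxy rotation for challenge 3; the proxy list below is made up.
import random

import requests
from lxml import etree

PROXIES = ["1.2.3.4:8080", "5.6.7.8:3128"]  # placeholder; scrape these from a free-proxy site
HEADERS = {"Cookie": "glidedsky_session=..."}  # placeholder, use your own cookie

def fetch_page(page: int) -> int:
    # Keep switching proxies until one of them returns a parseable page.
    while True:
        proxy = random.choice(PROXIES)
        try:
            res = requests.get(f"http://glidedsky.com/level/web/crawler-ip-block-1?page={page}",
                               headers=HEADERS,
                               proxies={"http": f"http://{proxy}"},
                               timeout=10)
            nums = etree.HTML(res.text).xpath('//div[@class="col-md-1"]/text()')
            if nums:  # blocked pages come back without numbers, so treat them as failures
                return sum(int(n.strip()) for n in nums)
        except requests.RequestException:
            pass  # dead proxy; try another one

print(sum(fetch_page(p) for p in range(1, 1001)))

Free proxies die constantly, which is why the author saves every successful page to disk and totals them offline with parse_html.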
Result: