I have written a few small crawlers out of personal interest, so here is a brief look at some basic crawling operations.
Whether the target is an API, an HTML page, or some other kind of data, the first step is to pin down the URL to crawl with a packet-capture tool.
Next, for the target site or API, mimic what a human would do: construct the corresponding request and set the appropriate request headers to get the data you are after.
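As a minimal sketch of this step (the header value is just an illustrative desktop User-Agent, not something the site specifically requires), sending such a request with requests might look like this:

import requests

# The Sina news search URL captured below; the keyword 春节 is already URL-encoded.
url = 'https://search.sina.com.cn/?q=%E6%98%A5%E8%8A%82&range=all&c=news&sort=time'

headers = {
    # Pretend to be an ordinary desktop browser; illustrative value only.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
print(response.text[:200])  # a first look at the returned HTML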
The third step: once the data has been crawled, shape it into structured records and persist them, either to a file or to a database.
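For instance, here is a minimal sketch that appends one structured item to a JSON Lines file; the field names simply mirror the spider further down, and a database insert would replace the file write:

import json

item = {
    'title': 'example title',
    'news_url': 'https://news.sina.com.cn/...',
    'release_time': '2020-12-01 00:00:00',
    'content': '<div id="artibody">...</div>',
}

# One JSON object per line keeps the file easy to append to and to re-parse later.
with open('news.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(item, ensure_ascii=False) + '\n')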
Beyond that there are topics such as crawler error handling, distributed crawling, and anti-crawling countermeasures.
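Those topics deserve their own write-up, but as a small illustration of error handling, network failures are usually dealt with by wrapping the request in a retry loop with a timeout, roughly like this (a sketch, not tied to any particular site):

import time
import requests

def fetch(url, retries=3, timeout=10):
    """Fetch a URL, retrying a few times on network errors."""
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print('request failed:', exc)
            time.sleep(2 ** attempt)  # simple exponential back-off
    return None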
The URL to crawl (q is the URL-encoded search keyword "春节"):

https://search.sina.com.cn/?q=%E6%98%A5%E8%8A%82&range=all&c=news&sort=time
First, build the search URL and send the paged request:

def __send_page_req(self, page_num: int = 1):
    """
    Send the search request for one page and wrap the response in a Selector.
    :return:
    """
    search_url = self.__init_search_url(page_num)
    response = requests.get(search_url)
    return Selector(text=response.text)

def __init_search_url(self, page: int = 1):
    """
    Build the request URL for the given page.
    :param page:
    :return:
    """
    params = {
        'q': self.search_keyword,
        'range': "all",
        'c': 'news',
        'sort': "time",
        'page': page
    }
    str_params = urllib.parse.urlencode(params)
    return self.search_url.format(params=str_params)
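urllib.parse.urlencode takes care of percent-encoding the Chinese keyword, so the generated URL matches the one captured above; a quick standalone check:

import urllib.parse

params = {'q': '春节', 'range': 'all', 'c': 'news', 'sort': 'time', 'page': 1}
print(urllib.parse.urlencode(params))
# q=%E6%98%A5%E8%8A%82&range=all&c=news&sort=time&page=1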
The page-parsing method starts out as a stub and is filled in in the complete spider below:

def parse_page_req(self, page_num: int = 1):
    pass
Putting it all together, the complete spider:

import requests
from scrapy.selector import Selector
import urllib.parse
import datetime
import re


class SinaNewsSpider:
    """
    Sina news search spider.
    """
    search_url = 'https://search.sina.com.cn/?{params}'
    spider_source = 'sina'
    title_compile = re.compile(r'<a.*?>([\s\S]*?)</a>')
    article_min_date = '2020-12-01 00:00:00'  # earliest news time to crawl back to

    def __init__(self, search_keyword: str):
        self.search_keyword = search_keyword

    def go(self):
        page_num = 1
        while True:
            news_data, min_date = self.parse_page_req(page_num)
            for item in self.parse_data(news_data):
                self.__save(item)
            if min_date > self.article_min_date:
                page_num += 1
            else:
                break

    def __save(self, data):
        """
        Persist one item (just printed here).
        :param data:
        :return:
        """
        print(data)

    def parse_data(self, news_data):
        """
        Build structured items from the result list.
        :param news_data:
        :return:
        """
        for news in news_data:
            content = self.__get_content(news['detail_url'])
            if content is None:
                print('error:', news)
            else:
                item = {}
                item['content'] = content
                item['source'] = 'sina'
                item['keyword'] = self.search_keyword
                item['news_url'] = news['detail_url']
                item['insert_time'] = str(datetime.datetime.today())
                item['title'] = news['title']
                item['release_time'] = news['release_time']
                item['author'] = news['author']
                yield item

    def __get_content(self, url):
        # Article pages use one of several body containers, so try them in turn.
        response = requests.get(url)
        response = Selector(text=response.content.decode('utf-8'))
        content = response.xpath('//div[@id="article"]').extract_first()
        content_artibody = response.xpath('//div[@id="artibody"]').extract_first()
        content_section = response.xpath('//section[@class="art_pic_card art_content"]').extract_first()
        return content or content_artibody or content_section

    def parse_page_req(self, page_num: int = 1):
        """
        Request one result page and parse the news list.
        :param page_num:
        :return:
        """
        response = self.__send_page_req(page_num)
        news_list = response.xpath('//div[@id="result"]/div[@class="box-result clearfix"]')
        news_data = []
        for news in news_list:
            item = {}
            title = news.xpath(".//h2/a").extract_first()
            item['title'] = self.title_compile.findall(title)[0]
            item['detail_url'] = news.xpath(".//h2/a/@href").extract_first()
            source_time_str = news.xpath(".//h2/span/text()").extract_first().strip()
            item['author'], item['release_time'] = source_time_str.split(" ", maxsplit=1)
            news_data.append(item)
        return news_data, min(x['release_time'] for x in news_data)

    def __send_page_req(self, page_num: int = 1):
        """
        Send the search request for one page.
        :return:
        """
        search_url = self.__init_search_url(page_num)
        response = requests.get(search_url)
        return Selector(text=response.text)

    def __init_search_url(self, page: int = 1):
        """
        Build the request URL.
        :param page:
        :return:
        """
        params = {
            'q': self.search_keyword,
            'range': "all",
            'c': 'news',
            'sort': "time",
            'page': page
        }
        str_params = urllib.parse.urlencode(params)
        return self.search_url.format(params=str_params)


sina = SinaNewsSpider("春节")
sina.go()
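One design choice worth noting: the spider keeps requesting the next page while the oldest release_time on the current page is still newer than article_min_date; since both are 'YYYY-MM-DD HH:MM:SS' strings, plain string comparison matches chronological order.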
Python really is a very handy tool for this kind of work.