同学们可以去各大招聘网站查看一下爬虫工程师的要求,大多是招 JAVA、PYTHON,甚至还有 NODEJS、C++;再或者去开源中国查询 C# 的爬虫项目,仅有几个非常简单或是几年没有更新的项目。
而单纯性能上 .NET 对比 JAVA、PYTHON 并不处于弱势,反而有开发上的优势(得益于世界上最强大的 IDE)。爬虫性能瓶颈大多在并发下载(网速)、IP 池,那么为何 .NET 没有一个强大的爬虫框架呢?说真的我不知道,可能爬虫框架核心上比较简单,也可能 .NET 的开发人员没有别的语言的开发人员勤奋,或是 .NET 的开源氛围没有别的语言高。直到 .NET 要出开源版的消息传来,我认为是时候开发一个跨平台、跨语言的爬虫框架了。但一开始是比较忐忑的,担心自身水平不够,无法完全重新设计一个新的框架出来,因此参考了 JAVA 的一个轻量级爬虫框架 webmagic,并加入了我自己的理解和改进。如果设计或写得不好,请大家指正海涵。
由于我参考的是 webmagic,因此整体架构上没有什么大的变化,设计图如下(图片直接取自 webmagic)。
基本使用只需要引用 DotnetSpider2.Core(从 Nuget 中获取)。
DotnetSpider 实现一个完整爬虫需要 4 个模块:Scheduler、Downloader、PageProcessor、Pipeline。由于 Downloader 和 Scheduler 都有基本实现,因此只需要实现 PageProcessor 和 Pipeline 就可以实现一个基本爬虫了,这种方式也是最自由的方式。
完全自定义的例子如下:
/// <summary>
/// Entry point: runs the fully-customized processor/pipeline sample,
/// then blocks until the user presses a key.
/// </summary>
public static void Main(string[] args)
{
    // Run the sample that hand-wires page parsing and the data pipeline.
    CustmizeProcessorAndPipeline();

    Console.WriteLine("Press any key to continue...");
    Console.Read();
}
/// <summary>
/// Fully-customized crawler sample: configures the target site, the scheduler,
/// a custom page processor and a custom pipeline, then runs the spider.
/// </summary>
public static void CustmizeProcessorAndPipeline()
{
    // Configure encoding, headers, cookies, proxy etc. on the Site object.
    var site = new Site { EncodingName = "UTF-8", RemoveOutboundLinks = true };
    for (int i = 1; i < 5; ++i)
    {
        // Add the initial feed urls (list pages 1..4).
        site.AddStartUrl($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{i}.html");
    }

    Spider spider = Spider.Create(site,
        // Use the in-memory queue scheduler with duplicate removal.
        new QueueDuplicateRemovedScheduler(),
        // Custom processor that parses Youku list pages.
        new YoukuPageProcessor())
        // Custom pipeline that consumes the parsed results.
        .AddPipeline(new YoukuPipeline());
    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;

    // Start the crawler (blocks until finished).
    spider.Run();
}

/// <summary>
/// Pipeline that prints every collected video name to the console.
/// Replace the body to persist results to a database or file instead.
/// </summary>
public class YoukuPipeline : BasePipeline
{
    // Running total of processed videos across all pipeline invocations.
    private static long count = 0;

    public override void Process(params ResultItems[] resultItems)
    {
        foreach (var resultItem in resultItems)
        {
            StringBuilder builder = new StringBuilder();
            foreach (YoukuVideo entry in resultItem.Results["VideoResult"])
            {
                // Pipelines may run on multiple download threads, so increment
                // the shared counter atomically instead of with count++.
                long current = System.Threading.Interlocked.Increment(ref count);
                builder.Append($" [YoukuVideo {current}] {entry.Name}");
            }
            Console.WriteLine(builder);
        }
        // Other actions like saving data to a DB could go here.
    }
}

/// <summary>
/// Processor that extracts video entries from a Youku list page
/// using XPath selectors over the downloaded HTML.
/// </summary>
public class YoukuPageProcessor : BasePageProcessor
{
    protected override void Handle(Page page)
    {
        // Query the page via Selectable and build the result objects.
        var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-pack pack-film']")).Nodes();
        List<YoukuVideo> results = new List<YoukuVideo>();
        foreach (var videoElement in totalVideoElements)
        {
            var video = new YoukuVideo();
            video.Name = videoElement.Select(Selectors.XPath(".//img[@class='quic']/@alt")).GetValue();
            results.Add(video);
        }

        // Store results under a custom key so the pipeline can read them.
        page.AddResultItem("VideoResult", results);

        // Add follow-up target requests to the scheduler (disabled in this sample):
        //foreach (var url in page.Selectable.SelectList(Selectors.XPath("//ul[@class='yk-pages']")).Links().Nodes())
        //{
        //    page.AddTargetRequest(new Request(url.GetValue(), null));
        //}
    }
}

/// <summary>Data object holding one collected video name.</summary>
public class YoukuVideo
{
    public string Name { get; set; }
}
配置式爬虫需要额外引用 DotnetSpider2.Extension(从 Nuget 中获取)。
大部分情况下只需要配置式来实现一个采集任务。相对于基本使用方式,配置式爬虫只需要短短几行代码就可以实现一个爬虫。但凡事有利就有弊,配置式爬虫的自由度相对低一些。
使用配置式爬虫的步骤如下:
完整代码如下,感受一下就好,后面章节会详细介绍如何实现:
/// <summary>
/// Configuration-style crawler sample: collects JD SKU data from list pages
/// and stores it into MySQL via the entity pipeline.
/// </summary>
public class JdSkuSampleSpider : EntitySpider
{
    public JdSkuSampleSpider() : base("JdSkuSample", new Site
    {
        //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
    })
    {
    }

    protected override void MyInit(params string[] arguments)
    {
        Identity = Identity ?? "JD SKU SAMPLE";
        ThreadNum = 1;

        // Download html via HttpClient.
        Downloader = new HttpClientDownloader();

        // Store data to MySQL. This is the default entity pipeline, so this line
        // could be omitted. Don't miss sslmode in the connection string.
        AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"));

        // Seed url plus environment values ("name", "cat3") consumed by Product below.
        AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main",
            new Dictionary<string, object> { { "name", "手机" }, { "cat3", "655" } });

        AddEntityType<Product>();
    }

    /// <summary>
    /// Entity describing one SKU row; selectors map HTML fragments and
    /// environment values onto table columns.
    /// </summary>
    [EntityTable("test", "jd_sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
    [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
    [TargetUrlsSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Product : SpiderEntity
    {
        [PropertyDefine(Expression = "./@data-sku", Length = 100)]
        public string Sku { get; set; }

        // Filled from the "name" environment value set on the start url.
        [PropertyDefine(Expression = "name", Type = SelectorType.Enviroment, Length = 100)]
        public string Category { get; set; }

        // Filled from the "cat3" environment value set on the start url.
        [PropertyDefine(Expression = "cat3", Type = SelectorType.Enviroment)]
        public int CategoryId { get; set; }

        [PropertyDefine(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        [PropertyDefine(Expression = "./div[5]/strong/a")]
        public long CommentsCount { get; set; }

        [PropertyDefine(Expression = ".//div[@class='p-shop']/@data-shop_name", Length = 100)]
        public string ShopName { get; set; }

        [PropertyDefine(Expression = ".//div[@class='p-name']/a/em", Length = 100)]
        public string Name { get; set; }

        [PropertyDefine(Expression = "./@venderid", Length = 100)]
        public string VenderId { get; set; }

        [PropertyDefine(Expression = "./@jdzy_shop_id", Length = 100)]
        public string JdzyShopId { get; set; }

        [PropertyDefine(Expression = "Monday", Type = SelectorType.Enviroment)]
        public DateTime RunId { get; set; }
    }
}
/// <summary>Console entry point: constructs and runs the JD SKU sample spider.</summary>
public class Program
{
    public static void Main(string[] args)
    {
        new JdSkuSampleSpider().Run();
    }
}
https://github.com/zlzforever/DotnetSpider 望各位大佬加星 :)
博文写得比较早, 框架修改有时会来不及更新博文中的代码, 请查看DotnetSpider.Sample项目中的样例爬虫
QQ群: 477731655