HTTP请求工具类(功能:一、获取网页html;二、下载网络图片;):javascript
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP request helpers: fetch a page's HTML and download remote images.
    /// </summary>
    public class HttpRequestUtil
    {
        // User-Agent header sent with every request made by this class.
        private const string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Downloads the document at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS address of the page.</param>
        /// <returns>The response body, always decoded as UTF-8.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        /// <exception cref="InvalidCastException"><paramref name="url"/> does not use an HTTP scheme.</exception>
        public static string GetPageHtml(string url)
        {
            // A direct cast fails loudly for non-HTTP schemes instead of producing a
            // null reference that only blows up on the next line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = DefaultUserAgent;

            // Nothing is sent until GetResponse() is called; the method defaults to GET.
            // The using blocks release the connection even when reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the image at <paramref name="url"/> into the "download" folder under
        /// the application start-up path, but only when the decoded image is at least
        /// <paramref name="minWidth"/> by <paramref name="minHeight"/> pixels.
        /// </summary>
        /// <remarks>
        /// The file name is the part of the URL after the last '/'. If a file with that
        /// name already exists the method returns without issuing a request.
        /// </remarks>
        /// <param name="url">Absolute HTTP/HTTPS address of the image.</param>
        /// <param name="minWidth">Minimum accepted width in pixels.</param>
        /// <param name="minHeight">Minimum accepted height in pixels.</param>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        /// <exception cref="ArgumentException">The downloaded data is not a valid image.</exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            int pos = url.LastIndexOf('/') + 1;
            string fileName = url.Substring(pos);
            string path = Application.StartupPath + "\\download";
            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(path);

            string filePathName = path + "\\" + fileName;
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = DefaultUserAgent;
            request.Proxy = null;

            using (MemoryStream memoryStream = new MemoryStream())
            {
                // Buffer the whole body first so the image can be inspected before
                // anything is written to disk. The response is released as soon as the
                // copy finishes, and also when the copy throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                {
                    responseStream.CopyTo(memoryStream);
                }

                // Rewind so that decoding starts at the first byte rather than at the
                // end-of-stream position left behind by the copy.
                memoryStream.Position = 0;

                bool largeEnough;
                // The stream must stay open while the Image is alive, so the image is
                // disposed first (inner using) and the stream afterwards (outer using).
                using (Image tempImage = Image.FromStream(memoryStream, true))
                {
                    largeEnough = tempImage.Height >= minHeight && tempImage.Width >= minWidth;
                }

                if (largeEnough)
                {
                    using (FileStream fs = new FileStream(filePathName, FileMode.Create))
                    {
                        // WriteTo copies the entire buffer regardless of the current position.
                        memoryStream.WriteTo(fs);
                    }
                }
            }
        }
    }
}
VisitedHelper类:html
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// Process-wide record of the URLs that have already been visited.
    /// </summary>
    /// <remarks>
    /// Each individual call is synchronized, because the crawler calls this class from
    /// several thread-pool threads at once. A caller that needs "check, then add" to be
    /// atomic must still hold its own lock around the two calls.
    /// </remarks>
    public class VisitedHelper
    {
        // A hash set gives O(1) membership tests; its default string comparer is
        // ordinal, the same semantics as the '==' comparison used previously.
        private static readonly HashSet<string> m_VisitedList = new HashSet<string>();

        // Guards every access to m_VisitedList.
        private static readonly object m_SyncRoot = new object();

        /// <summary>
        /// Reports whether <paramref name="url"/> has been recorded by <see cref="Add"/>.
        /// </summary>
        /// <param name="url">The URL to look up (compared ordinally, case-sensitive).</param>
        /// <returns><c>true</c> if the URL was added before; otherwise <c>false</c>.</returns>
        public static bool IsVisited(string url)
        {
            lock (m_SyncRoot)
            {
                return m_VisitedList.Contains(url);
            }
        }

        /// <summary>
        /// Records <paramref name="url"/> as visited. Adding the same URL again has no effect.
        /// </summary>
        /// <param name="url">The URL to record.</param>
        public static void Add(string url)
        {
            lock (m_SyncRoot)
            {
                m_VisitedList.Add(url);
            }
        }
    }
}
多线程爬取网页代码:java
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Main window of the image crawler: starts a crawl from the entered URL and
    /// downloads every sufficiently large image found on pages of the same host.
    /// </summary>
    public partial class Form1 : Form
    {
        // Upper bound requested for the thread pool; timer1_Tick compares the number
        // of available workers against this same value to detect an idle pool.
        private const int MaxPoolThreads = 100;

        // Compiled once and shared; Regex instances are safe for concurrent matching.
        private static readonly Regex m_RegA = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegImg = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegHost = new Regex(@"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);

        // Makes "is it visited? if not, mark it" a single atomic step across crawl threads.
        private static readonly object m_VisitedLock = new object();

        private static int m_MinWidth = 300;
        private static int m_MinHeight = 300;
        private static int m_CompletedCount = 0;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts crawling from the URL in the text box on a dedicated thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(MaxPoolThreads, MaxPoolThreads);

            // Note: TryParse writes 0 to the field when the text is not a number, which
            // effectively disables that size filter.
            int.TryParse(txtMinWidth.Text, out m_MinWidth);
            int.TryParse(txtMinHeight.Text, out m_MinHeight);

            button1.Enabled = false;
            lblMsg.Text = "正在爬取图片…";
            timer1.Start();

            // Controls may only be touched from the UI thread, so the start URL is
            // captured here instead of being read inside the worker delegate.
            string startUrl = txtUrl.Text;
            new Thread(new ThreadStart(delegate()
            {
                try
                {
                    Crawling(startUrl, null);
                }
                catch (Exception ex)
                {
                    // An unhandled exception on this thread would terminate the process.
                    LogFailure(startUrl, ex);
                }
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images, then queues every link
        /// that stays on <paramref name="host"/> for crawling on the thread pool.
        /// </summary>
        /// <param name="url">Absolute URL of the page to crawl.</param>
        /// <param name="host">
        /// Host prefix (as produced by <see cref="GetHost"/>) the crawl is restricted to;
        /// pass <c>null</c> on the first call to derive it from <paramref name="url"/>.
        /// </param>
        private void Crawling(string url, string host)
        {
            lock (m_VisitedLock)
            {
                if (VisitedHelper.IsVisited(url))
                {
                    return;
                }
                VisitedHelper.Add(url);
            }

            if (host == null)
            {
                host = GetHost(url);
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);

            foreach (Match mImg in m_RegImg.Matches(pageHtml))
            {
                string imageSrc = mImg.Groups[1].Value;
                try
                {
                    // Both dimensions are declared on the <img> tag itself, not in the URL.
                    int imageWidth = GetImageWidthOrHeight(mImg.Value, true);
                    int imageHeight = GetImageWidthOrHeight(mImg.Value, false);
                    if (imageWidth < m_MinWidth || imageHeight < m_MinHeight)
                    {
                        continue;
                    }

                    string imageUrl = ResolveUrl(url, imageSrc);
                    if (imageUrl != null)
                    {
                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one broken image must not abort the rest of the page.
                    LogFailure(imageSrc, ex);
                }
            }

            // Follow the links found on this page.
            foreach (Match mA in m_RegA.Matches(pageHtml))
            {
                string href = mA.Groups[1].Value;
                try
                {
                    // Declared inside the loop body so each queued delegate captures its own copy.
                    string nextUrl = ResolveUrl(url, href);
                    if (nextUrl == null)
                    {
                        continue;
                    }

                    // Stay on the starting host: test the link's host, not the current page's.
                    if (!string.Equals(GetHost(nextUrl), host, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                    {
                        try
                        {
                            Crawling(nextUrl, host);
                            // Several pool threads finish concurrently; '++' is not atomic.
                            Interlocked.Increment(ref m_CompletedCount);
                        }
                        catch (Exception ex)
                        {
                            LogFailure(nextUrl, ex);
                        }
                    }));
                }
                catch (Exception ex)
                {
                    LogFailure(href, ex);
                }
            }
        }

        /// <summary>
        /// Resolves a link found on <paramref name="pageUrl"/> to an absolute HTTP/HTTPS
        /// URL without its fragment.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the page containing the link.</param>
        /// <param name="link">The raw href/src value; may be absolute or relative.</param>
        /// <returns>
        /// The absolute URL, or <c>null</c> when the link cannot be parsed or uses
        /// another scheme (javascript:, mailto:, ...).
        /// </returns>
        private static string ResolveUrl(string pageUrl, string link)
        {
            Uri pageUri;
            if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri))
            {
                return null;
            }

            Uri resolved;
            if (!Uri.TryCreate(pageUri, link.Trim(), out resolved))
            {
                return null;
            }

            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // HttpRequestUrl omits the fragment, so "page.html#a" and "page.html#b" are
            // treated as the same page. SafeUnescaped keeps names readable when the last
            // path segment is later used as a file name.
            return resolved.GetComponents(UriComponents.HttpRequestUrl, UriFormat.SafeUnescaped);
        }

        /// <summary>
        /// Extracts the "scheme://host[:port]/" prefix of <paramref name="url"/>.
        /// </summary>
        /// <returns>The prefix with a trailing slash, or just "/" when nothing matches.</returns>
        private string GetHost(string url)
        {
            Match mHost = m_RegHost.Match(url);
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer callback: shows "finished" once every pool worker is available again
        /// and at least one queued page has completed; otherwise shows "crawling".
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            int workerThreads;
            int completionPortThreads;
            ThreadPool.GetAvailableThreads(out workerThreads, out completionPortThreads);

            if (workerThreads == MaxPoolThreads && m_CompletedCount > 0)
            {
                lblMsg.Text = "已结束";
            }
            else
            {
                lblMsg.Text = "正在爬取图片…";
            }
        }

        /// <summary>
        /// Reads the declared width or height from an <c>&lt;img&gt;</c> tag, looking
        /// first at the attribute (<c>width="120"</c>) and then at an inline style
        /// (<c>width: 120px;</c>).
        /// </summary>
        /// <param name="imageTagString">The full text of the tag.</param>
        /// <param name="isWidth"><c>true</c> to read the width, <c>false</c> for the height.</param>
        /// <returns>
        /// The declared size truncated to an integer, or <see cref="int.MaxValue"/> when
        /// the tag declares none, so that the caller's minimum-size test passes and the
        /// real size is checked after download.
        /// </returns>
        private int GetImageWidthOrHeight(string imageTagString, bool isWidth)
        {
            string tag = isWidth ? "width" : "height";
            string[] patterns =
            {
                string.Format(@"{0}=""([\d\.]+)""", tag),
                string.Format(@"{0}[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", tag)
            };

            foreach (string pattern in patterns)
            {
                Match match = Regex.Match(imageTagString, pattern, RegexOptions.IgnoreCase);
                if (!match.Success)
                {
                    continue;
                }

                // HTML numbers always use '.', whatever the current UI culture is.
                double value;
                if (double.TryParse(
                        match.Groups[1].Value,
                        System.Globalization.NumberStyles.Float,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out value))
                {
                    // Clamp so that an absurdly large value cannot overflow the cast.
                    return value >= int.MaxValue ? int.MaxValue : (int)value;
                }
            }

            return int.MaxValue;
        }

        /// <summary>
        /// Records a failed download or crawl in the debug output instead of silently dropping it.
        /// </summary>
        private static void LogFailure(string url, Exception ex)
        {
            System.Diagnostics.Debug.WriteLine(string.Format("Crawl failed for {0}: {1}", url, ex.Message));
        }
    }

    /// <summary>
    /// Delegate type intended for marshalling calls onto the UI thread.
    /// </summary>
    public delegate void InvokeDelegate();
}
截图:网络