爬虫需要抓取网站价格。与通常抓取网页不同的是,抓取的内容是通过 AJAX 加载的,并且价格是通过 CSS 背景图片显示的。
每个数字对应一个样式,如 'p_h57_5':
/* Example rule quoted from the page: the price digit is rendered by a background GIF shown at a -590px offset. */
.p_h57_5 { background: url('http://pic.c-ctrip.com/priceblur/h57/3713de5c594648529f39d031243966dd.gif') no-repeat -590px; padding: 0 6px; font-size: 18px; }
数字对应的样式和对应的背景图片(background image)都是动态变化的,而我们需要获取到每个房型的房价。虽然后来有了其它渠道获取房价,这里还是记录一下用 Selenium 和 Emgu 抓取的方式。
流程:
1. Selenium 访问网址 2. 全屏截图 3. 用 Selenium 选择器获取房型等信息 4. 用 Selenium 选择器获取价格 DOM 元素,计算出价格元素的相对位置,截取价格图片,使用 Emgu 识别价格并输出
/// <summary>
/// Opens the hotel page in Chrome, takes a full-page screenshot, and prints the
/// room type, type name and OCR-recognised price of every <c>tr[expand]</c> row.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Each Chrome switch must be passed as its own argument; a single
    // space-separated string is handed to Chrome as one unrecognised switch.
    ChromeOptions options = new ChromeOptions();
    options.AddArguments("--start-maximized", "--disable-popup-blocking");

    var driver = new ChromeDriver(options);
    try
    {
        driver.Navigate().GoToUrl("http://hotels.ctrip.com/hotel/992765.html");

        // The room table is filled in by AJAX. Until() returns as soon as the
        // element exists, so the larger timeout only matters on slow loads.
        new WebDriverWait(driver, TimeSpan.FromSeconds(10)).Until(
            ExpectedConditions.ElementExists(By.ClassName("htl_room_table")));

        ReadOnlyCollection<IWebElement> elementsList = driver.FindElementsByCssSelector("tr[expand]");

        // Hide the <dfn> elements (the ¥ symbol) so they do not end up in the
        // cropped price images.
        driver.ExecuteScript(@" var arr = document.getElementsByTagName('dfn'); for(var i=0;i<arr.length;i++){ arr[i].style.display = 'none'; } ");

        // Full-page screenshot; the per-row price images are cropped from it.
        using (Bitmap pageShot = GetEntereScreenshot(driver))
        {
            if (pageShot == null)
            {
                Console.Error.WriteLine("Full-page screenshot failed; no prices can be recognised.");
                return;
            }

            pageShot.Save(@"Z:\111.jpg");

            Console.WriteLine("{0,-20}{1,-20}{2,-20}", "房型", "类型", "房价");
            foreach (IWebElement row in elementsList)
            {
                string roomType = "";
                try
                {
                    roomType = row.FindElement(By.CssSelector(".room_unfold")).Text;
                }
                catch (NoSuchElementException)
                {
                    // Best effort: a row without a .room_unfold element keeps an empty room type.
                }

                var roomTypeText = regRoomType.Match(roomType);
                var roomTypeName = row.FindElement(By.CssSelector("span.room_type_name")).Text;

                // Crop the price element out of the full-page screenshot and OCR it.
                using (Image priceImage = row.FindElement(By.CssSelector("span.base_price")).SnapshotV2(pageShot))
                {
                    var price = ORC_((Bitmap)priceImage);
                    Console.WriteLine("{0,-20}{1,-20}{2,-20}", roomTypeText.Value, roomTypeName, price);
                }
            }
        }

        // Keep the console window open until a key is pressed.
        Console.Read();
    }
    finally
    {
        // Always shut down the browser and the chromedriver process.
        driver.Quit();
    }
}
图片识别方法:
// Shared Tesseract engine, loaded with the English data from the Emgu tessdata folder.
private static readonly Tesseract _ocr = new Tesseract(@"C:\Emgu\emgucv-windows-universal-cuda 2.9.0.1922\bin\tessdata", "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);

/// <summary>
/// Restricts recognition to the digits 0-9. Runs after the static field
/// initializer above, so <c>_ocr</c> is already constructed.
/// </summary>
static Program()
{
    _ocr.SetVariable("tessedit_char_whitelist", "0123456789");
}

/// <summary>
/// Runs OCR on the given image and returns the recognised text.
/// </summary>
/// <param name="img">Image to recognise; may be null.</param>
/// <returns>
/// The text reported by the OCR engine, or an empty string when
/// <paramref name="img"/> is null or recognition fails.
/// </returns>
public static string ORC_(Bitmap img)
{
    // "" marks a failed (or skipped) OCR call.
    if (img == null)
    {
        return "";
    }

    try
    {
        // Both Emgu images own native memory, so dispose them deterministically.
        using (Image<Bgr, byte> image = new Image<Bgr, byte>(img))
        using (Image<Gray, byte> gray = image.Convert<Gray, byte>())
        {
            _ocr.Recognize(gray);
            return _ocr.GetText();
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure but keep the "" contract for callers.
        Console.Error.WriteLine("OCR failed: " + ex.Message);
        return "";
    }
}
Selenium 内置了截图方法,但只能截取浏览器中当前显示的内容,于是找到了一个全屏截图的方式(内置截图 + 控制滚动条 + 图片拼接):
/// <summary>
/// Builds a screenshot of the whole page by scrolling the viewport tile by tile,
/// taking a viewport screenshot at each position and stitching the tiles together.
/// </summary>
/// <param name="_driver">
/// Driver showing the page; it must also implement <c>IJavaScriptExecutor</c>
/// and <c>ITakesScreenshot</c>.
/// </param>
/// <returns>
/// The stitched bitmap. Null when the page or viewport size is unusable or the
/// bitmap could not be created; possibly only partially drawn when a later step fails.
/// </returns>
public static Bitmap GetEntereScreenshot(IWebDriver _driver)
{
    Bitmap stitchedImage = null;
    try
    {
        IJavaScriptExecutor js = (IJavaScriptExecutor)_driver;

        // Size of the whole document.
        int totalWidth = (int)(long)js.ExecuteScript("return document.body.offsetWidth");
        int totalHeight = (int)(long)js.ExecuteScript("return document.body.parentNode.scrollHeight");

        // Size of the visible viewport.
        int viewportWidth = (int)(long)js.ExecuteScript("return document.body.clientWidth");
        int viewportHeight = (int)(long)js.ExecuteScript("return window.innerHeight");

        // A zero step would make the tiling loops below run forever.
        if (viewportWidth <= 0 || viewportHeight <= 0)
        {
            Console.Error.WriteLine("Full-page screenshot failed: viewport size is " + viewportWidth + "x" + viewportHeight + ".");
            return null;
        }

        // Split the page into viewport-sized tiles, row by row; tiles on the
        // right and bottom edges are shrunk to what is left of the page.
        List<Rectangle> rectangles = new List<Rectangle>();
        for (int top = 0; top < totalHeight; top += viewportHeight)
        {
            int tileHeight = Math.Min(viewportHeight, totalHeight - top);
            for (int left = 0; left < totalWidth; left += viewportWidth)
            {
                int tileWidth = Math.Min(viewportWidth, totalWidth - left);
                rectangles.Add(new Rectangle(left, top, tileWidth, tileHeight));
            }
        }

        stitchedImage = new Bitmap(totalWidth, totalHeight);

        // The first tile assumes the viewport is at the page origin.
        js.ExecuteScript("window.scrollTo(0, 0)");

        using (Graphics g = Graphics.FromImage(stitchedImage))
        {
            Rectangle previous = Rectangle.Empty;
            foreach (Rectangle rectangle in rectangles)
            {
                // Scroll from the previous tile to this one (nothing to do for the first tile).
                if (previous != Rectangle.Empty)
                {
                    int xDiff = rectangle.Right - previous.Right;
                    int yDiff = rectangle.Bottom - previous.Bottom;
                    js.ExecuteScript(String.Format("window.scrollBy({0}, {1})", xDiff, yDiff));

                    // Give the page a moment to repaint after scrolling.
                    System.Threading.Thread.Sleep(200);
                }

                var screenshot = ((ITakesScreenshot)_driver).GetScreenshot();

                // GDI+ needs the stream to stay open for as long as the image
                // is in use, so the image is drawn inside the stream's scope.
                using (MemoryStream memStream = new MemoryStream(screenshot.AsByteArray))
                using (Image screenshotImage = Image.FromStream(memStream))
                {
                    // For a shrunken edge tile the wanted pixels sit at the
                    // right/bottom of the viewport screenshot.
                    Rectangle sourceRectangle = new Rectangle(
                        viewportWidth - rectangle.Width,
                        viewportHeight - rectangle.Height,
                        rectangle.Width,
                        rectangle.Height);

                    g.DrawImage(screenshotImage, rectangle, sourceRectangle, GraphicsUnit.Pixel);
                }

                previous = rectangle;
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure and return whatever was built so far.
        Console.Error.WriteLine("Full-page screenshot failed: " + ex.Message);
    }

    return stitchedImage;
}
最后是根据传入的元素和全屏截图,获取价格元素的图片:
/// <summary>
/// Crops the area occupied by <paramref name="element"/> out of a full-page screenshot.
/// </summary>
/// <param name="element">Element whose <c>Location</c> and <c>Size</c> define the crop area.</param>
/// <param name="bitmap">Full-page screenshot to crop from.</param>
/// <returns>
/// A new image containing the part of the element that lies inside
/// <paramref name="bitmap"/>, or null when the element has no visible area
/// inside the screenshot.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="element"/> or <paramref name="bitmap"/> is null.
/// </exception>
public static Image SnapshotV2(this IWebElement element, Bitmap bitmap)
{
    if (element == null)
    {
        throw new ArgumentNullException("element");
    }

    if (bitmap == null)
    {
        throw new ArgumentNullException("bitmap");
    }

    // Bitmap.Clone throws when the rectangle reaches outside the bitmap, so
    // clip the element area to the screenshot bounds (location included).
    Rectangle elementArea = new Rectangle(element.Location, element.Size);
    Rectangle crop = Rectangle.Intersect(elementArea, new Rectangle(Point.Empty, bitmap.Size));

    if (crop.Width <= 0 || crop.Height <= 0)
    {
        return null;
    }

    return bitmap.Clone(crop, bitmap.PixelFormat);
}
运行效果如下:
网站