HtmlParser使用心得

最近因工作需要,要检查html中哪些标签不合理或者没有闭合。在网上找了很久都没有找到比较合适的工具,于是就试着用HtmlParser来实现。

获取html的代码:

/// <summary>
/// Fetches <paramref name="url"/> with an HTTP GET (following redirects) and
/// returns the response body as text.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <returns>
/// The response body, decoded with the character set reported by the response
/// (UTF-8 when none is reported or the reported name is not a known encoding).
/// If any exception is thrown, the exception's message is returned instead.
/// </returns>
string GetContentFromUrl(string url)
{
    string content = string.Empty;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.AllowAutoRedirect = true;

        // The using blocks release the response and its stream even when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, GetEncodingOrUtf8(response.CharacterSet)))
        {
            // StreamReader decodes incrementally, so a multi-byte character that is
            // split across two network reads is still decoded correctly. Decoding
            // each raw buffer on its own would corrupt such characters.
            content = reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best effort: callers receive the failure text in place of the page content.
        content = ex.Message;
    }
    return content;
}

/// <summary>
/// Resolves a charset name to an <see cref="Encoding"/>, falling back to UTF-8
/// when the name is null, empty, or not recognised.
/// </summary>
private static Encoding GetEncodingOrUtf8(string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Encoding.GetEncoding rejects names it does not know.
        return Encoding.UTF8;
    }
}


解析html代码,以下代码在网上都能找到:


/// <summary>
/// Recursively copies a parsed HTML node, its descendants and (optionally) its
/// following siblings into a <see cref="TreeNode"/> hierarchy.
/// </summary>
/// <param name="treeNode">Tree node that receives the nodes created for <paramref name="htmlNode"/>.</param>
/// <param name="htmlNode">HTML node to process; nothing happens when it is null.</param>
/// <param name="siblingRequired">
/// When true, every following sibling of <paramref name="htmlNode"/> is also
/// added under <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (htmlNode == null || treeNode == null)
    {
        return;
    }

    // Nodes that are not tags, and end tags, get no tree node of their own:
    // their children are attached directly to the incoming parent.
    TreeNode current = treeNode;

    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        current = new TreeNode(BuildTagLabel(tag));
        treeNode.Nodes.Add(current);
    }

    // Children: start at the first child and let that call walk its siblings.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
    }

    // Siblings: each one is processed with siblingRequired = false so that the
    // loop below is the only place that advances along the sibling chain.
    if (siblingRequired)
    {
        INode sibling = htmlNode.NextSibling;
        while (sibling != null)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
            sibling = sibling.NextSibling;
        }
    }
}

/// <summary>
/// Builds the display text for a tag: the tag name, a space, then each
/// non-null attribute as key="value", separated by single spaces.
/// </summary>
private static string BuildTagLabel(ITag tag)
{
    StringBuilder attributes = new StringBuilder();
    if (tag.Attributes != null && tag.Attributes.Count > 0)
    {
        foreach (string key in tag.Attributes.Keys)
        {
            // Keys containing "<TAGNAME>" are skipped; presumably this is the
            // parser's own entry for the tag name rather than a real attribute.
            if (key.Contains("<TAGNAME>"))
            {
                continue;
            }

            if (tag.Attributes[key] == null)
            {
                continue;
            }

            // Separate attributes so the label does not read class="a"id="b".
            if (attributes.Length > 0)
            {
                attributes.Append(' ');
            }
            attributes.Append(key + "=\"" + tag.Attributes[key].ToString() + "\"");
        }
    }

    return tag.TagName + " " + attributes.ToString();
}


调用代码如下:

/// <summary>
/// Parses the text held in txtContent and rebuilds treeView1 from the result,
/// placing every top-level node under a single "root" tree node.
/// Does nothing when the text is null or empty.
/// </summary>
void ParseHTml()
{
    string html = this.txtContent.Text;
    if (!string.IsNullOrEmpty(html))
    {
        Parser htmlParser = new Parser(new Lexer(html));
        NodeList topLevelNodes = htmlParser.Parse(null);

        // Start from an empty tree that contains only the "root" node.
        this.treeView1.Nodes.Clear();
        TreeNode rootNode = this.treeView1.Nodes.Add("root");

        // Siblings are visited by this loop, so each call handles one node only.
        int index = 0;
        while (index < topLevelNodes.Count)
        {
            this.RecursionHtmlNode(rootNode, topLevelNodes[index], false);
            index++;
        }
    }
}

运行结果如图:


网上有关HtmlParser的源代码下载比较麻烦,我把该部分代码也放在此次demo中了,下载地址:http://download.csdn.net/detail/dz45693/4374572