程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> C#入門知識 >> HTML解析類 ,讓你不使用正則也能輕松獲取HTML相關元素,解析html

HTML解析類 ,讓你不使用正則也能輕松獲取HTML相關元素,解析html

編輯:C#入門知識

HTML解析類 ,讓你不使用正則也能輕松獲取HTML相關元素,解析html


功能:

1、輕松獲取指定的HTML元素。

2、可以根據屬性標簽進行篩選

3、返回的都是List強類型,無需轉換。

 

用過XElement的都知道 用來解析XML非常的方便,但是對於HTML的格式多樣化實在是沒辦法兼容。

所以我就寫了這麼一個類似XElement的 XHTMLElement

 

 

用法:

            // Read the sample page from disk into a single HTML string.
            string htmlPath = Server.MapPath("~/file/test.htm");
            string pageHtml = FileHelper.FileToString(htmlPath);

            XHtmlElement xh = new XHtmlElement(pageHtml);

            // <a> elements directly under <body> whose class attribute equals "icon".
            var link = (from el in xh.Descendants("body").ChildDescendants("a")
                        where el.Attributes.Any(attr => attr.Key == "class" && attr.Value == "icon")
                        select el).ToList();

            // <a> elements that carry an href attribute.
            var links = (from el in xh.Descendants("a")
                         where el.Attributes.Any(attr => attr.Key == "href")
                         select el).ToList();

            // Write out the href value of every such link.
            foreach (var hrefValue in links.Select(item => item.Attributes.Single(pair => pair.Key == "href").Value))
            {
                Response.Write(hrefValue);
            }

            // Nearest group of <img> elements (returned as a list).
            var img = xh.Descendants("img");

            // Nearest <p> element together with the other <p> elements on the same level.
            var ps = xh.Descendants("p");

 

 

代碼:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;

namespace SyntacticSugar
{
    /// <summary>
    /// Regex-based HTML reader that offers an XElement-like way to pick elements out of an HTML string.
    /// It is a best-effort scanner, not a conforming HTML parser: markup whose same-name opening and
    /// closing tags do not balance is rejected with a <see cref="FormatException"/>.
    /// Created: 2015-4-23. Author: sunkaixuan (QQ 610262374); suggestions on naming and syntax are welcome.
    /// </summary>
    public class XHtmlElement
    {
        private const string NullHtmlMessage = "html不能為空!";
        private const string ParseErrorMessage = "SORRY,您的HTML格式不能解析!!!";
        private const string ConditionalCommentBegin = @"\s{0,10}<!--\[if";
        private const string ConditionalCommentEnd = @"\[endif\]-->";
        private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

        // Name of the first tag in a string. Digits are allowed so h1-h6 are read in full, and '/' is an
        // accepted terminator so "<br/>" is recognised. "<!..." and "</..." never match.
        private static readonly Regex TagNameRegex = new Regex(@"(?<=<\s{0,10})[a-zA-Z][a-zA-Z0-9\-]*(?=[\s/>])");

        // First tag of an element; [^>] (instead of '.') lets the tag span several lines.
        private static readonly Regex OpeningTagRegex = new Regex(@"<[^>]+>");

        // Everything between the first '>' and the last '<' of an element.
        private static readonly Regex InnerHtmlRegex = new Regex(@"(?<=>).+(?=<)", RegexOptions.Singleline);

        // name="value" or name='value'. The value is captured between the quotes, so it may be empty
        // and may contain '='. Names may contain '-', ':' and '.' (data-id, xml:lang, ...).
        private static readonly Regex AttributeRegex = new Regex(@"(?<name>[a-zA-Z_:][\w:.\-]*)\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)')");

        private static readonly Regex DocTypeRegex = new Regex(@"<!DOCTYPE[^>]*>", TagOptions);
        private static readonly Regex ConditionalCommentRegex = new Regex(ConditionalCommentBegin, TagOptions);

        private readonly string _html;

        /// <summary>
        /// Wraps an HTML string for querying.
        /// </summary>
        /// <param name="html">The HTML to read. It is only validated when a query method is called.</param>
        public XHtmlElement(string html)
        {
            _html = html;
        }

        /// <summary>
        /// Returns the nearest group of same-level elements with the given tag name. Top-level elements
        /// are checked first; if none match, the tree is searched depth-first and the first level that
        /// contains a match is returned.
        /// </summary>
        /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
        /// <returns>The matching elements, or an empty list when nothing matches (never null).</returns>
        /// <exception cref="ArgumentNullException">The instance was created with a null HTML string.</exception>
        /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
        public List<HtmlInfo> Descendants(string elementName = null)
        {
            if (_html == null)
            {
                throw new ArgumentNullException("html", NullHtmlMessage);
            }

            List<HtmlInfo> roots = RootDescendants(_html);
            List<HtmlInfo> matches = FilterByName(roots, elementName);
            return matches.Count > 0 ? matches : FindNearestDescendants(roots, elementName);
        }

        /// <summary>
        /// Splits HTML into its first-level elements and describes each one.
        /// </summary>
        /// <param name="html">The HTML to split; null means the HTML passed to the constructor.</param>
        /// <returns>One <see cref="HtmlInfo"/> per first-level element, in document order.</returns>
        /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor HTML are null.</exception>
        /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
        public List<HtmlInfo> RootDescendants(string html = null)
        {
            if (html == null)
            {
                html = _html;
            }
            if (html == null)
            {
                throw new ArgumentNullException("html", NullHtmlMessage);
            }

            List<HtmlInfo> result = new List<HtmlInfo>();
            foreach (string element in SplitTopLevelElements(html))
            {
                HtmlInfo info = new HtmlInfo();
                info.OldFullHtml = element;
                info.SameLeveHtml = html;
                info.TagName = TagNameRegex.Match(element).Value;
                info.InnerHtml = InnerHtmlRegex.Match(element).Value;
                info.Attributes = ParseAttributes(OpeningTagRegex.Match(element).Value);
                result.Add(info);
            }
            return result;
        }

        #region private
        /// <summary>
        /// Keeps the elements whose tag name equals <paramref name="elementName"/> (ordinal, case-insensitive);
        /// a null name keeps everything.
        /// </summary>
        private static List<HtmlInfo> FilterByName(IEnumerable<HtmlInfo> elements, string elementName)
        {
            return elements
                .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        /// <summary>
        /// Depth-first search below <paramref name="level"/>: returns the first group of sibling elements
        /// that contains a match, or an empty list.
        /// </summary>
        private List<HtmlInfo> FindNearestDescendants(List<HtmlInfo> level, string elementName)
        {
            foreach (HtmlInfo node in level)
            {
                // Nothing to descend into when the element has no markup inside it.
                if (string.IsNullOrEmpty(node.InnerHtml) || node.InnerHtml.IndexOf('<') < 0)
                {
                    continue;
                }

                // Parse the children once and reuse them for both the filter and the recursion.
                List<HtmlInfo> children = RootDescendants(node.InnerHtml);
                List<HtmlInfo> matches = FilterByName(children, elementName);
                if (matches.Count == 0)
                {
                    matches = FindNearestDescendants(children, elementName);
                }
                if (matches.Count > 0)
                {
                    return matches;
                }
            }
            return new List<HtmlInfo>();
        }

        /// <summary>
        /// Reads the quoted attributes of an opening tag. When a name is repeated the first value wins.
        /// </summary>
        private static Dictionary<string, string> ParseAttributes(string openingTag)
        {
            Dictionary<string, string> attributes = new Dictionary<string, string>();
            foreach (Match match in AttributeRegex.Matches(openingTag))
            {
                string name = match.Groups["name"].Value;
                if (!attributes.ContainsKey(name))
                {
                    attributes.Add(name, match.Groups["value"].Value);
                }
            }
            return attributes;
        }

        /// <summary>
        /// Cuts the first-level elements out of <paramref name="html"/> one after another, in document order.
        /// Text and ordinary comments between elements are ignored.
        /// </summary>
        private static List<string> SplitTopLevelElements(string html)
        {
            List<string> elements = new List<string>();
            string rest = DocTypeRegex.Replace(html, "");

            while (!string.IsNullOrWhiteSpace(rest))
            {
                string tagName = TagNameRegex.Match(rest).Value;
                if (tagName.Length == 0)
                {
                    break; // only text/comments are left
                }

                string element = ExtractFirstElement(rest, tagName);
                if (element.Length == 0)
                {
                    // Unbalanced or unterminated markup: the element cannot be delimited.
                    throw new FormatException(ParseErrorMessage);
                }

                elements.Add(element);

                // Remove only the occurrence that was just extracted, so identical siblings survive.
                int at = rest.IndexOf(element, StringComparison.Ordinal);
                rest = rest.Remove(at, element.Length);
            }
            return elements;
        }

        /// <summary>
        /// Returns the markup of the first <paramref name="tagName"/> element in <paramref name="html"/>,
        /// or an empty string when it cannot be delimited.
        /// Case 1: self-closing tag (&lt;a/&gt;). Case 2: balanced pair (&lt;a&gt;&lt;/a&gt;).
        /// Case 3: no closing tag at all (&lt;a&gt;). Case 4: a tag inside an IE conditional comment
        /// (&lt;!--[if ...]&gt; ... [endif]--&gt;), where the whole comment is returned.
        /// </summary>
        private static string ExtractFirstElement(string html, string tagName)
        {
            string name = Regex.Escape(tagName);
            // The look-ahead stops "a" from matching <abbr>, "p" from matching <pre>, and so on.
            string openPattern = @"<\s{0,10}" + name + @"(?=[\s/>])[^>]*>";
            string closePattern = @"</\s*" + name + @"\s*>";

            Match opening = Regex.Match(html, openPattern, TagOptions);
            if (!opening.Success)
            {
                return string.Empty;
            }

            if (opening.Value.EndsWith("/>", StringComparison.Ordinal))
            {
                return opening.Value;
            }

            if (!Regex.IsMatch(html, closePattern, TagOptions))
            {
                // An unclosed tag that sits inside a conditional comment is represented by the whole comment.
                Match conditional = ConditionalCommentRegex.Match(html);
                if (conditional.Success && conditional.Index < opening.Index)
                {
                    return ExtractBalanced(html, conditional.Index, ConditionalCommentBegin, ConditionalCommentEnd);
                }
                return opening.Value;
            }

            return ExtractBalanced(html, opening.Index, openPattern, closePattern);
        }

        /// <summary>
        /// Starting at <paramref name="startIndex"/> (the position of an opening match), walks the opening
        /// and closing matches in order and returns the text up to the closing match that brings the
        /// nesting depth back to zero. Returns an empty string when the depth never returns to zero.
        /// </summary>
        private static string ExtractBalanced(string html, int startIndex, string openPattern, string closePattern)
        {
            Regex boundary = new Regex("(?<open>" + openPattern + ")|(?<close>" + closePattern + ")", TagOptions);
            int depth = 0;

            for (Match match = boundary.Match(html, startIndex); match.Success; match = match.NextMatch())
            {
                if (match.Groups["open"].Success)
                {
                    // A nested self-closing tag has no closing partner, so it must not add depth.
                    if (!match.Value.EndsWith("/>", StringComparison.Ordinal))
                    {
                        depth++;
                    }
                    continue;
                }

                depth--;
                if (depth == 0)
                {
                    return html.Substring(startIndex, match.Index + match.Length - startIndex);
                }
                if (depth < 0)
                {
                    break;
                }
            }
            return string.Empty;
        }
        #endregion
    }
    /// <summary>
    /// Extension methods that continue a query from a list of already-parsed elements.
    /// Every method works from the FIRST element of the list only; an empty or null list yields an
    /// empty result instead of an exception.
    /// </summary>
    public static class XHtmlElementExtendsion
    {
        /// <summary>
        /// Returns the nearest group of same-level elements with the given tag name inside the first
        /// element of the list.
        /// </summary>
        /// <param name="htmlInfoList">Elements to continue from; only the first one is searched.</param>
        /// <param name="elementName">Tag name to look for; null matches every element.</param>
        /// <returns>The matching elements, or an empty list (never null).</returns>
        public static List<HtmlInfo> Descendants(this  IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
        {
            string html = FirstInnerHtml(htmlInfoList);
            if (string.IsNullOrEmpty(html))
            {
                return new List<HtmlInfo>();
            }
            // Normalise a null result so callers can always chain LINQ operators.
            return new XHtmlElement(html).Descendants(elementName) ?? new List<HtmlInfo>();
        }

        /// <summary>
        /// Returns the direct children of the first element of the list, optionally filtered by tag name.
        /// </summary>
        /// <param name="htmlInfoList">Elements to continue from; only the first one is used.</param>
        /// <param name="elementName">Tag name to keep (case-insensitive); null keeps every child.</param>
        /// <returns>The matching children, or an empty list.</returns>
        public static List<HtmlInfo> ChildDescendants(this  IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
        {
            string html = FirstInnerHtml(htmlInfoList);
            if (string.IsNullOrEmpty(html))
            {
                return new List<HtmlInfo>();
            }
            return new XHtmlElement(html).RootDescendants(html)
                .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        /// <summary>
        /// Finds the element of <paramref name="fullHtml"/> that directly encloses the level the first
        /// element of the list was read from.
        /// </summary>
        /// <param name="htmlInfoList">Elements to continue from; only the first one is used.</param>
        /// <param name="fullHtml">The complete document the elements were read from.</param>
        /// <returns>The parent element, or an empty list when no enclosing element is found.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="fullHtml"/> is null.</exception>
        public static List<HtmlInfo> ParentDescendant(this  IEnumerable<HtmlInfo> htmlInfoList,string fullHtml)
        {
            if (fullHtml == null)
            {
                throw new ArgumentNullException("fullHtml");
            }

            HtmlInfo first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
            if (first == null || string.IsNullOrEmpty(first.SameLeveHtml))
            {
                return new List<HtmlInfo>();
            }

            string sameLevelHtml = first.SameLeveHtml;
            // Swap the level's markup for a unique token so the regex only has to find the tag pair around it.
            string placeholder = Guid.NewGuid().ToString();
            string marked = fullHtml.Replace(sameLevelHtml, placeholder);
            string parentHtml = Regex.Match(marked, @"<[^<]+?>[^<]*?" + placeholder + @".*?<\/.+?>").Value;
            parentHtml = parentHtml.Replace(placeholder, sameLevelHtml);
            return new XHtmlElement(parentHtml).RootDescendants();
        }

        /// <summary>
        /// Inner HTML of the first element, or null when the list is null or empty.
        /// </summary>
        private static string FirstInnerHtml(IEnumerable<HtmlInfo> htmlInfoList)
        {
            if (htmlInfoList == null)
            {
                return null;
            }
            HtmlInfo first = htmlInfoList.FirstOrDefault();
            return first == null ? null : first.InnerHtml;
        }
    }
    /// <summary>
    /// Describes one HTML element: its tag name, attributes and the markup around it.
    /// </summary>
    public class HtmlInfo
    {
        /// <summary>
        /// Tag name of the element.
        /// </summary>
        public string TagName { get; set; }
        /// <summary>
        /// Attributes of the element, keyed by attribute name.
        /// </summary>
        public Dictionary<string, string> Attributes { get; set; }
        /// <summary>
        /// Markup inside the element.
        /// </summary>
        public string InnerHtml { get; set; }

        /// <summary>
        /// Markup of the element as it was found in the source (set by XHtmlElement.RootDescendants).
        /// </summary>
        public string OldFullHtml { get; set; }

        /// <summary>
        /// The HTML of the level this element was read from (set by XHtmlElement.RootDescendants).
        /// </summary>
        public string SameLeveHtml { get; set; }

        /// <summary>
        /// Rebuilds the element's markup from <see cref="TagName"/>, <see cref="Attributes"/> and
        /// <see cref="InnerHtml"/>. Attribute values are written as stored, without escaping.
        /// </summary>
        public string FullHtml
        {
            get
            {
                StringBuilder html = new StringBuilder();
                html.Append('<').Append(TagName);
                if (Attributes != null)
                {
                    // Each attribute brings its own leading space, so an element without
                    // attributes renders as "<p>" rather than "<p >".
                    foreach (KeyValuePair<string, string> attribute in Attributes)
                    {
                        html.Append(' ').Append(attribute.Key).Append("=\"").Append(attribute.Value).Append('"');
                    }
                }
                html.Append('>').Append(InnerHtml).Append("</").Append(TagName).Append('>');
                return html.ToString();
            }
        }
    }
}

 

 

前台HTML:

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title></title>
</head>
<body>
    <a id="1">我是1</a> 
    <a id="2" class="icon">icon</a>
    <img />
</body>
</html>

 

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved