程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> 關於C# >> C#編程 Html table 解析器

C#編程 Html table 解析器

編輯:關於C#
 

html中的表格一般都存儲著比較重要的信息。雖然現在css+div已是主流,但筆者還是不建議把信息添加到div中,過度地使用div標籤一樣是一場災難^_^。下面的代碼可以解析html table,感興趣的朋友可以看看。

using System;
using System.Collections.Generic;
using System.Xml;
using System.Text;
using Sgml;
using System.IO;
using System.Net;
using System.ComponentModel;

namespace Freemouse.Base.parser
{
/// <summary>
/// html table 解析器
/// </summary>
public class HTMLTableReader : IDisposable
{
// Tables collected so far; getTable() appends one HTMLTable per table element found in doc.
private List<HTMLTable> table;
// DOM loaded from the normalized XML text produced by Load(Stream, Encoding).
private XmlDocument doc;
// Encoding exposed through the Encode property; used to decode the normalized bytes into text.
private Encoding objEncoding;
// SGML reader that Load(Stream, Encoding) reads the HTML input through.
private SgmlReader reader;
// Bytes of the normalized document taken from outStream by Load(Stream, Encoding).
private byte[] htmlBytes;

// True once Dispose(bool) has run.
private bool disposed = false;
// Helper component; disposed by Dispose(bool) when called from Dispose().
private Component component = new Component();

/// <summary>
/// Gets or sets the text encoding held by this reader.
/// </summary>
public Encoding Encode
{
    get
    {
        return this.objEncoding;
    }
    set
    {
        this.objEncoding = value;
    }
}
/// <summary>
/// Gets the list of tables found in the loaded document.
/// </summary>
public List<HTMLTable> Tables
{
    get
    {
        return this.table;
    }
}
/// <summary>
/// Indexed access to the tables of the document.
/// </summary>
/// <param name="index">Zero-based table index.</param>
/// <returns>The table at <paramref name="index"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="index"/> is outside the table list.
/// </exception>
public HTMLTable this[int index]
{
    get
    {
        // No try/catch: the previous "catch (Exception e) { throw e; }" only
        // reset the stack trace of the exception that propagates anyway.
        return table[index];
    }
}
/// <summary>
/// Indexed access to a row of a table in the document.
/// </summary>
/// <param name="index">Zero-based table index.</param>
/// <param name="subindex">Zero-based row (tr) index inside that table.</param>
/// <returns>The row at the given position.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Either index is outside its list.
/// </exception>
public HTMLTr this[int index, int subindex]
{
    get
    {
        // No try/catch: the previous "catch (Exception e) { throw e; }" only
        // reset the stack trace of the exception that propagates anyway.
        return table[index].Rows[subindex];
    }
}
/// <summary>
/// Indexed access to a cell of a table in the document.
/// </summary>
/// <param name="index">Zero-based table index.</param>
/// <param name="subindex">Zero-based row (tr) index inside that table.</param>
/// <param name="ssubindex">Zero-based cell (td) index inside that row.</param>
/// <returns>The cell at the given position.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Any of the indexes is outside its list.
/// </exception>
public HTMLTd this[int index, int subindex, int ssubindex]
{
    get
    {
        // No try/catch: the previous "catch (Exception e) { throw e; }" only
        // reset the stack trace of the exception that propagates anyway.
        return table[index].Rows[subindex].Cells[ssubindex];
    }
}

private Stream outStream;

/// <summary>
/// Gets or sets the stream that receives the normalized, well-formed
/// document, exposed so that callers can save it.
/// </summary>
public Stream OutputStream
{
    get
    {
        return this.outStream;
    }
    set
    {
        this.outStream = value;
    }
}
/// <summary>
/// Creates a reader with an empty table list, an empty DOM, an in-memory
/// output stream and the system default encoding.
/// </summary>
public HTMLTableReader()
{
    this.objEncoding = Encoding.Default;
    this.outStream = new MemoryStream();
    this.table = new List<HTMLTable>();
    this.doc = new XmlDocument();

    SgmlReader sgml = new SgmlReader();
    sgml.DocType = "strict";
    this.reader = sgml;
}
/// <summary>
/// Finalizer. Only routes to <c>Dispose(false)</c>: a finalizer must not
/// touch other managed objects (they may already have been finalized) and
/// must not let an exception escape.
/// </summary>
~HTMLTableReader()
{
    Dispose(false);
}

/// <summary>
/// Releases all resources held by this reader: the SGML reader, the output
/// stream and the helper component.
/// </summary>
public void Dispose()
{
    Dispose(true);
    GC.SuppressFinalize(this);
}

/// <summary>
/// Shared cleanup routine.
/// </summary>
/// <param name="disposing">
/// True when called from <see cref="Dispose()"/>; false when called from the
/// finalizer, in which case managed fields are left alone.
/// </param>
private void Dispose(bool disposing)
{
    if (disposed)
    {
        return;
    }

    if (disposing)
    {
        // Previously these two were closed only by the finalizer, so an
        // explicit Dispose() left them open.
        if (reader != null)
        {
            reader.Close();
        }
        if (outStream != null)
        {
            outStream.Close();
        }
        component.Dispose();
    }

    disposed = true;
}

/// <summary>
/// Loads the document from a local file or from an absolute URL.
/// </summary>
/// <param name="filename">File path or absolute URL.</param>
/// <param name="encoding">Encoding of the source text.</param>
/// <exception cref="FileNotFoundException">
/// <paramref name="filename"/> is neither a well-formed absolute URI nor an
/// existing file.
/// </exception>
public void Load(string filename, Encoding encoding)
{
    if (Uri.IsWellFormedUriString(filename, UriKind.Absolute))
    {
        using (WebClient web = new WebClient())
        {
            web.Encoding = encoding;
            // Open the response stream directly inside a using block; the
            // previous code first allocated a MemoryStream that was never used.
            using (Stream readstream = web.OpenRead(filename))
            {
                Load(readstream, encoding);
            }
        }
    }
    else if (File.Exists(filename))
    {
        using (FileStream fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
        {
            Load(fs, encoding);
        }
    }
    else
    {
        // FileNotFoundException derives from Exception, so callers that
        // caught the former bare Exception keep working.
        throw new FileNotFoundException("試圖訪問不存在的文件", filename);
    }
}
/// <summary>
/// Loads the document from a stream: every table element of the HTML input
/// is copied under a synthetic <c>root</c> element into
/// <see cref="OutputStream"/>, the result is parsed into the DOM and the
/// table list is filled.
/// </summary>
/// <param name="stream">Stream containing the HTML text.</param>
/// <param name="encode">Encoding of <paramref name="stream"/>.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="stream"/> or <paramref name="encode"/> is null.
/// </exception>
/// <exception cref="InvalidCastException">
/// <see cref="OutputStream"/> was replaced by a stream that is not a MemoryStream.
/// </exception>
/// <remarks>
/// The output stream is closed when this method returns, because disposing
/// the XmlTextWriter closes the stream it writes to.
/// </remarks>
public void Load(Stream stream, Encoding encode)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }
    if (encode == null)
    {
        throw new ArgumentNullException("encode");
    }

    // A previous Load closed the output stream; start a fresh one so the
    // reader can be used for more than one document.
    if (outStream == null || !outStream.CanWrite)
    {
        outStream = new MemoryStream();
    }

    using (XmlTextWriter writer = new XmlTextWriter(outStream, encode))
    {
        reader.InputStream = new StreamReader(stream, encode);
        reader.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument();
        // Synthetic root element so that several tables form one well-formed document.
        writer.WriteStartElement("root");

        bool hasNode = reader.Read();
        while (hasNode)
        {
            bool isTable = reader.NodeType == XmlNodeType.Element
                && string.Equals(reader.Name, "table", StringComparison.OrdinalIgnoreCase);
            if (isTable)
            {
                // WriteNode leaves the reader on the node that follows the
                // copied element. Calling Read() again here would skip that
                // node, which loses a table placed directly after another.
                writer.WriteNode(reader, true);
                hasNode = !reader.EOF;
            }
            else
            {
                hasNode = reader.Read();
            }
        }

        writer.WriteEndElement();
        writer.WriteEndDocument();
        writer.Flush();

        // ToArray returns only the bytes written; GetBuffer also returned the
        // unused capacity as trailing zero bytes.
        htmlBytes = ((MemoryStream)outStream).ToArray();
    }

    // Drop the encoding preamble (byte-order mark) the writer may have
    // emitted, so that it does not end up as a character before the XML
    // declaration.
    int offset = GetPreambleLength(htmlBytes, encode);
    if (!encode.Equals(objEncoding))
    {
        // Re-encode into the encoding used to decode the text below.
        htmlBytes = Encoding.Convert(encode, objEncoding, htmlBytes, offset, htmlBytes.Length - offset);
    }
    else if (offset > 0)
    {
        byte[] trimmed = new byte[htmlBytes.Length - offset];
        Buffer.BlockCopy(htmlBytes, offset, trimmed, 0, trimmed.Length);
        htmlBytes = trimmed;
    }

    string xmltext = ReplaceLowOrderASCIICharacters(htmlBytes);
    doc.LoadXml(xmltext);
    getTable();
}

/// <summary>
/// Returns the length of the preamble of <paramref name="encoding"/> when
/// <paramref name="data"/> starts with it, otherwise 0.
/// </summary>
private static int GetPreambleLength(byte[] data, Encoding encoding)
{
    byte[] preamble = encoding.GetPreamble();
    if (preamble.Length == 0 || data.Length < preamble.Length)
    {
        return 0;
    }
    for (int i = 0; i < preamble.Length; i++)
    {
        if (data[i] != preamble[i])
        {
            return 0;
        }
    }
    return preamble.Length;
}
/// <summary>
/// Loads the document from a string.
/// </summary>
/// <param name="xmltext">HTML text to parse.</param>
/// <param name="encode">Encoding used to turn the text into bytes and to read it back.</param>
public void LoadXml(string xmltext, Encoding encode)
{
    // The using block releases the temporary stream; the former
    // "catch (Exception e) { throw e; }" only reset the stack trace.
    using (MemoryStream inputStream = new MemoryStream(encode.GetBytes(xmltext)))
    {
        Load(inputStream, encode);
    }
}
/// <summary>
/// Collects every table element of the DOM into the table list.
/// </summary>
private void getTable()
{
    // Match the element name without regard to case: the loader accepts
    // "table" case-insensitively, so "<TABLE>" markup must be found here too.
    // "*" makes GetElementsByTagName return all elements in document order.
    foreach (XmlNode element in doc.GetElementsByTagName("*"))
    {
        if (string.Equals(element.Name, "table", StringComparison.OrdinalIgnoreCase))
        {
            table.Add(new HTMLTable(element));
        }
    }
}
/// <summary>
/// Decodes the buffer with the reader's encoding and replaces low-order
/// characters that the XML parser cannot handle with a space.
/// </summary>
/// <param name="buffer">Encoded document bytes.</param>
/// <returns>
/// The decoded text in which every character up to U+0020 except tab,
/// line feed and carriage return has been replaced by a space.
/// </returns>
private string ReplaceLowOrderASCIICharacters(byte[] buffer)
{
    string decoded = objEncoding.GetString(buffer);
    StringBuilder cleaned = new StringBuilder(decoded.Length);
    foreach (char current in decoded)
    {
        // Same set as the original ranges 0-8, 11-12 and 14-32.
        bool isKeptWhitespace = current == '\t' || current == '\n' || current == '\r';
        if (current <= ' ' && !isKeptWhitespace)
        {
            // Plain Append: the former AppendFormat(" ", ss) had no
            // placeholder and always produced this single space.
            cleaned.Append(' ');
        }
        else
        {
            cleaned.Append(current);
        }
    }
    return cleaned.ToString();
}
}
/// <summary>
/// Represents one HTML table.
/// </summary>
public class HTMLTable : HTMLTag
{
    private List<HTMLTr> tr;

    /// <summary>
    /// Rows (tr elements) collected for this table.
    /// </summary>
    public List<HTMLTr> Rows
    {
        get
        {
            return this.tr;
        }
    }

    /// <summary>
    /// Wraps a table node and builds its row list.
    /// </summary>
    /// <param name="node">XML node of the table element.</param>
    public HTMLTable(XmlNode node)
        : base(node)
    {
        this.tr = new List<HTMLTr>();

        // getDeep still holds its initial value (true) at this point, so the
        // search descends through all descendants of the table node.
        this.getSubElement(node, Tag.tr);

        // NOTE(review): the flag is cleared only after the search above; it
        // therefore affects later searches on this instance, not this one.
        this.getDeep = false;

        for (int i = 0; i < this.subnodelist.Count; i++)
        {
            this.tr.Add(new HTMLTr(this.subnodelist[i]));
        }
    }
}
/// <summary>
/// Represents one row of a table.
/// </summary>
public class HTMLTr : HTMLTag
{
    private List<HTMLTd> td;

    /// <summary>
    /// Cells (td elements) collected for this row.
    /// </summary>
    public List<HTMLTd> Cells
    {
        get
        {
            return this.td;
        }
    }

    /// <summary>
    /// Wraps a row node and builds its cell list.
    /// </summary>
    /// <param name="node">XML node of the tr element.</param>
    public HTMLTr(XmlNode node)
        : base(node)
    {
        this.td = new List<HTMLTd>();

        // getDeep still holds its initial value (true) at this point, so the
        // search descends through all descendants of the row node.
        this.getSubElement(node, Tag.td);

        // NOTE(review): the flag is cleared only after the search above; it
        // therefore affects later searches on this instance, not this one.
        this.getDeep = false;

        for (int i = 0; i < this.subnodelist.Count; i++)
        {
            this.td.Add(new HTMLTd(this.subnodelist[i]));
        }
    }
}
/// <summary>
/// Represents one cell of a table.
/// </summary>
public class HTMLTd : HTMLTag
{
    /// <summary>
    /// Wraps a cell node; all behavior comes from the base class.
    /// </summary>
    /// <param name="node">XML node of the td element.</param>
    public HTMLTd(XmlNode node)
        : base(node)
    {
    }
}
/// <summary>
/// 代表一個html標簽節點
/// </summary>
public class HTMLTag : IDisposable
{
// XML node wrapped by this tag; set by the constructor.
protected XmlNode node;
// When true, getSubElement also descends into the children of each child node.
protected bool getDeep = true;
// Nodes collected by getSubElement; it appends to this list.
protected List<XmlNode> subnodelist;
/// <summary>
/// Gets the text contained in the tag.
/// </summary>
/// <returns>The inner text of the node, or an empty string when there is no node.</returns>
public string Value
{
    get
    {
        // Explicit null check instead of catching the NullReferenceException
        // the former blanket try/catch relied on.
        if (node == null)
        {
            return "";
        }
        return node.InnerText;
    }
}
/// <summary>
/// Gets the markup contained in the tag.
/// </summary>
/// <returns>The inner XML of the node, or an empty string when there is no node.</returns>
public string innerHTML
{
    get
    {
        // Explicit null check instead of catching the NullReferenceException
        // the former blanket try/catch relied on.
        if (node == null)
        {
            return "";
        }
        return node.InnerXml;
    }
}
/// <summary>
/// Attribute indexer.
/// </summary>
/// <param name="attribute">Name of the attribute.</param>
/// <returns>
/// The attribute value, or an empty string when the node, its attribute
/// collection or the attribute itself is missing.
/// </returns>
public string this[string attribute]
{
    get
    {
        // Null checks replace the former blanket try/catch that turned a
        // NullReferenceException into the empty result.
        if (node == null || attribute == null)
        {
            return "";
        }

        // Attributes is null for node types that cannot carry attributes.
        XmlAttributeCollection attributes = node.Attributes;
        if (attributes == null)
        {
            return "";
        }

        XmlAttribute found = attributes[attribute];
        if (found == null)
        {
            return "";
        }
        return found.Value;
    }
}
/// <summary>
/// Finds the sub-nodes that have the given tag name.
/// </summary>
/// <param name="tagname">Tag name to look for.</param>
/// <returns>
/// One HTMLTag per matching node; an empty list when nothing matches or
/// when this tag wraps no node.
/// </returns>
public virtual List<HTMLTag> this[Tag tagname]
{
    get
    {
        List<HTMLTag> tags = new List<HTMLTag>();
        if (node == null)
        {
            return tags;
        }

        // getSubElement appends to subnodelist. Without clearing it first,
        // every call also returned the matches of all earlier searches
        // (including the ones made by derived constructors).
        subnodelist.Clear();
        getSubElement(node, tagname);

        foreach (XmlNode match in subnodelist)
        {
            tags.Add(new HTMLTag(match));
        }
        return tags;
    }
}
/// <summary>
/// Finds elements by tag name; forwards to the Tag indexer.
/// </summary>
/// <param name="tagname">Tag name to look for.</param>
/// <returns>The list produced by the Tag indexer for <paramref name="tagname"/>.</returns>
public virtual List<HTMLTag> GetElementsByTagName(Tag tagname)
{
    List<HTMLTag> found = this[tagname];
    return found;
}

// Kept because the constructor assigns it; the search below no longer uses
// it as a cursor.
private XmlNode nodecopy;

/// <summary>
/// Finds the first descendant, in document order, whose <c>id</c> attribute
/// equals <paramref name="id"/> (compared without regard to case).
/// </summary>
/// <param name="id">The id to look for.</param>
/// <returns>
/// A tag wrapping the matching node. When nothing matches, a tag wrapping
/// this tag's own node is returned, so the result is never null.
/// </returns>
public virtual HTMLTag GetElementByID(string id)
{
    // The previous implementation followed only the first child that carried
    // an id attribute, never looked inside children without one, and left its
    // cursor field pointing at a descendant, which corrupted later calls.
    XmlNode match = null;
    if (node != null && id != null)
    {
        match = FindDescendantById(node, id);
    }

    if (match == null)
    {
        // Same fallback the original produced when no child had an id.
        return new HTMLTag(node);
    }
    return new HTMLTag(match);
}

/// <summary>
/// Depth-first search below <paramref name="parent"/> for a node with the given id.
/// </summary>
private static XmlNode FindDescendantById(XmlNode parent, string id)
{
    foreach (XmlNode child in parent.ChildNodes)
    {
        // Attributes is null for node types that cannot carry attributes.
        XmlAttributeCollection attributes = child.Attributes;
        if (attributes != null)
        {
            XmlAttribute idAttribute = attributes["id"];
            if (idAttribute != null
                && string.Equals(idAttribute.Value, id, StringComparison.OrdinalIgnoreCase))
            {
                return child;
            }
        }

        XmlNode nested = FindDescendantById(child, id);
        if (nested != null)
        {
            return nested;
        }
    }
    return null;
}
/// <summary>
/// Wraps the given node and starts with an empty sub-node list.
/// </summary>
/// <param name="node">The XML node this tag represents.</param>
public HTMLTag(XmlNode node)
{
    subnodelist = new List<XmlNode>();
    nodecopy = node;
    this.node = node;
}
/// <summary>
/// Appends to <c>subnodelist</c> every child of <paramref name="parentnode"/>
/// whose name equals the tag name; when <c>getDeep</c> is set, the search
/// continues through the children of each child.
/// </summary>
/// <param name="parentnode">Node whose children are examined.</param>
/// <param name="tagname">Tag name to look for.</param>
protected void getSubElement(XmlNode parentnode, Tag tagname)
{
    // Hoisted out of the loop: the name does not change per child.
    string wanted = tagname.ToString();

    foreach (XmlNode subnode in parentnode.ChildNodes)
    {
        // Compare without regard to case: the enum members Base and Object are
        // capitalized and could never equal a lower-case element name, and
        // upper-case markup such as "TR" would not match either.
        if (string.Equals(subnode.Name, wanted, StringComparison.OrdinalIgnoreCase))
        {
            subnodelist.Add(subnode);
        }
        if (getDeep && subnode.HasChildNodes)
        {
            getSubElement(subnode, tagname);
        }
    }
}

private bool disposed = false;
private Component component = new Component();

/// <summary>
/// Releases the resources held by this tag.
/// </summary>
public void Dispose()
{
    this.Dispose(true);
    GC.SuppressFinalize(this);
}

/// <summary>
/// Disposes the helper component once; later calls do nothing.
/// </summary>
/// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
private void Dispose(bool disposing)
{
    if (this.disposed)
    {
        return;
    }

    if (disposing)
    {
        this.component.Dispose();
    }

    this.disposed = true;
}

}
/// <summary>
/// Enumeration of HTML tag names. getSubElement turns a member into text
/// with ToString() to compare it against node names.
/// </summary>
public enum Tag
{
a,
abbr,
acronym,
address,
applet,
area,
b,
// Capitalized, unlike the other members (presumably because "base" is a C#
// keyword); ToString() yields "Base", which differs in case from the tag name.
Base,
basefont,
bdo,
big,
blockquote,
body,
br,
button,
caption,
center,
cite,
code,
col,
colgroup,
dd,
del,
dfn,
dir,
div,
dl,
dt,
em,
fieldset,
font,
form,
frame,
frameset,
h1,
h2,
h3,
h4,
h5,
h6,
head,
hr,
html,
i,
iframe,
img,
input,
ins,
isindex,
kbd,
label,
legend,
li,
link,
listing,
map,
menu,
meta,
noframes,
noscript,
// Capitalized, unlike the other members (presumably because "object" is a C#
// keyword); ToString() yields "Object", which differs in case from the tag name.
Object,
ol,
optgroup,
option,
p,
param,
plaintext,
pre,
q,
rb,
rbc,
rp,
rt,
rtc,
ruby,
s,
samp,
script,
select,
small,
span,
strike,
strong,
style,
sub,
sup,
table,
tbody,
td,
textarea,
tfoot,
th,
thead,
title,
tr,
tt,
u,
ul,
var,
xmp,
nextid
}
}

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved