將Html格式內容轉換為Csv內容,支持table(重點是rowspan和colspan的合並處理)、p、div元素;注意:不支持嵌套的table。
/// <summary>
/// Converts HTML markup into comma-separated text. Tables are emitted first, then
/// &lt;p&gt; elements, then &lt;div&gt; elements.
/// <para>
/// Each table is announced by a <c>#flag_table#,</c> line and expanded into a rectangular
/// grid in which a cell with <c>rowspan</c>/<c>colspan</c> keeps its text in the top-left
/// position and leaves the other covered positions empty. Each text section is announced
/// by a <c>#flag_text#,</c> line. Every value is followed by a comma and every line ends
/// with one additional comma.
/// </para>
/// <para>
/// Only &lt;td&gt; cells are read (&lt;th&gt; cells are ignored) and nested tables are not supported.
/// </para>
/// </summary>
/// <param name="hrml">The HTML markup to convert. Null or empty input yields an empty string.</param>
/// <returns>The CSV-style text built from the tables, paragraphs and divs found in the markup.</returns>
private string HtmlToCsv(string hrml)
{
    if (string.IsNullOrEmpty(hrml))
    {
        return string.Empty;
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(hrml);
    StringBuilder sbLines = new StringBuilder();

    HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
    if (tList != null)
    {
        foreach (HtmlAgilityPack.HtmlNode table in tList)
        {
            sbLines.AppendLine("#flag_table#,");

            // ".//tr" is evaluated relative to this table. The absolute form "//tr" would
            // return the rows of every table in the document for each table.
            HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                continue;
            }

            foreach (string[] cols in BuildTableGrid(rows))
            {
                foreach (string col in cols)
                {
                    sbLines.Append(col).Append(',');
                }
                sbLines.AppendLine(",");
            }

            // Empty the table so that <p>/<div> elements inside it are not emitted again below.
            table.RemoveAll();
        }
    }

    // Paragraphs are emptied after reading so an enclosing <div> does not repeat their text.
    AppendTextNodes(sbLines, doc.DocumentNode.SelectNodes("//p"), true);

    // Divs are left intact after reading.
    // NOTE(review): a nested <div> presumably contributes its text once per enclosing <div>,
    // because InnerText of the outer element includes its descendants - confirm this is acceptable.
    AppendTextNodes(sbLines, doc.DocumentNode.SelectNodes("//div"), false);

    return sbLines.ToString();
}

/// <summary>
/// Expands the rows of one table into a rectangular grid of cell texts, resolving
/// rowspan and colspan. The grid width is the sum of the colspans of the first row.
/// </summary>
/// <param name="rows">The &lt;tr&gt; nodes of the table; must contain at least one row.</param>
/// <returns>One string array per row; no element is null.</returns>
private static string[][] BuildTableGrid(HtmlAgilityPack.HtmlNodeCollection rows)
{
    int rowCount = rows.Count;

    int colCount = 0;
    foreach (HtmlAgilityPack.HtmlNode td in GetCellNodes(rows[0]))
    {
        colCount += ReadSpan(td, "colspan");
    }

    // null marks a free position; any non-null value (including "") marks an occupied one.
    string[][] grid = new string[rowCount][];
    for (int r = 0; r < rowCount; r++)
    {
        grid[r] = new string[colCount];
    }

    for (int r = 0; r < rowCount; r++)
    {
        int col = 0;
        foreach (HtmlAgilityPack.HtmlNode td in GetCellNodes(rows[r]))
        {
            // Skip positions already claimed by a rowspan coming from a previous row.
            while (col < colCount && grid[r][col] != null)
            {
                col++;
            }
            if (col >= colCount)
            {
                // The row holds more cells than the grid is wide; the surplus is dropped.
                break;
            }

            int colspan = ReadSpan(td, "colspan");
            int rowspan = ReadSpan(td, "rowspan");
            string text = CleanCellText(td.InnerText);
            if ((colspan > 1 || rowspan > 1) && text.Length == 0)
            {
                // Kept from the original implementation: an empty spanning cell is emitted as a single space.
                text = " ";
            }

            // Clamp the spanned area to the grid (written this way to avoid integer overflow on huge spans).
            int rowEnd = rowspan >= rowCount - r ? rowCount : r + rowspan;
            int colEnd = colspan >= colCount - col ? colCount : col + colspan;
            for (int ri = r; ri < rowEnd; ri++)
            {
                for (int ci = col; ci < colEnd; ci++)
                {
                    if (grid[ri][ci] == null)
                    {
                        grid[ri][ci] = string.Empty;
                    }
                }
            }

            grid[r][col] = text;
            col = colEnd;
        }
    }

    // Positions that no cell reached are emitted as empty values.
    for (int r = 0; r < rowCount; r++)
    {
        for (int c = 0; c < colCount; c++)
        {
            if (grid[r][c] == null)
            {
                grid[r][c] = string.Empty;
            }
        }
    }

    return grid;
}

/// <summary>
/// Returns the direct &lt;td&gt; children of a table row, in document order.
/// </summary>
private static List<HtmlAgilityPack.HtmlNode> GetCellNodes(HtmlAgilityPack.HtmlNode row)
{
    return row.ChildNodes
        .Where(m => string.Equals(m.OriginalName, "td", System.StringComparison.OrdinalIgnoreCase))
        .ToList();
}

/// <summary>
/// Reads a <c>rowspan</c>/<c>colspan</c> attribute. A missing, non-numeric or
/// non-positive value is treated as 1.
/// </summary>
private static int ReadSpan(HtmlAgilityPack.HtmlNode cell, string attributeName)
{
    HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[attributeName];
    int span;
    if (attr != null
        && int.TryParse(attr.Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out span)
        && span >= 1)
    {
        return span;
    }
    return 1;
}

/// <summary>
/// Normalizes element text for CSV output: removes spaces and line breaks, and replaces
/// ASCII commas so that they cannot be mistaken for column separators.
/// </summary>
private static string CleanCellText(string raw)
{
    // The original replaced "," with an identical "," (a no-op). A full-width comma (U+FF0C)
    // is used here on the assumption that this was the intended substitution - TODO confirm.
    return raw.Replace(" ", "").Replace(",", "\uFF0C").Replace("\r", "").Replace("\n", "").Trim();
}

/// <summary>
/// Appends a <c>#flag_text#,</c> section containing one line per node. Does nothing when
/// <paramref name="nodes"/> is null (no matching elements).
/// </summary>
/// <param name="sbLines">The output buffer.</param>
/// <param name="nodes">The elements whose text is emitted, or null.</param>
/// <param name="clearAfterRead">When true, each node is emptied after its text has been read.</param>
private void AppendTextNodes(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection nodes, bool clearAfterRead)
{
    if (nodes == null)
    {
        return;
    }

    sbLines.AppendLine("#flag_text#,");
    foreach (HtmlAgilityPack.HtmlNode node in nodes)
    {
        string text = GetTextByHtml(CleanCellText(node.InnerText));
        if (!string.IsNullOrWhiteSpace(text))
        {
            sbLines.Append(text).Append(',');
        }
        sbLines.AppendLine(",");

        if (clearAfterRead)
        {
            node.RemoveAll();
        }
    }
}
html:

csv:

url:http://www.cnblogs.com/dreamman/p/5343924.html