在編寫網絡爬蟲時,HttpWebRequest幾乎可以完成絕大多數網站的抓取。為了更好地使用這一技術,我將常用的幾個功能進行了封裝,以方便調用。這個類已經在多個項目中得到使用,主要解決了Cookies相關的一些問題;如果有其它方面的問題,歡迎提出來,我會進一步完善。
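目前HttpHelper包含了以下幾個方面:創建GET/POST請求、依據響應頭與HTML頭部自動識別網頁編碼、GZIP/DEFLATE解壓、解析Set-Cookie(含HttpOnly)、CookieContainer與字典之間的互相轉換,以及根據表單生成POST數據。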
代碼如下:
1 using System;
2 using System.Collections.Generic;
3 using System.Collections.Specialized;
4 using System.IO;
5 using System.IO.Compression;
6 using System.Linq;
7 using System.Net;
8 using System.Net.Security;
9 using System.Security.Cryptography.X509Certificates;
10 using System.Text;
11 using System.Text.RegularExpressions;
12 using System.Collections;
13 using HtmlAgilityPack;
14
15 namespace TNIdea.Common.Helper
16 {
17 public class HttpHelper
18 {
/// <summary>
/// Regular expression used to sniff a character-set name from page text. The named group
/// "Charset" captures the token following <c>charset=</c> after the word "meta", or the token
/// following <c>encoding=</c> after the word "xml"; an optional double quote around the token is skipped.
/// </summary>
19 public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";
20
/// <summary>
/// Downloads a page and returns its body decoded to a string.
/// </summary>
/// <remarks>
/// A non-blank <paramref name="postData"/> selects a POST request; otherwise a GET request is sent.
/// When <paramref name="encode"/> is null the encoding is resolved from the character set declared
/// by the response. If that is missing, unknown, "ISO-8859-1" or "zh-cn", the beginning of the body
/// (up to <c>&lt;/head&gt;</c> or 10000 bytes) is searched with <c>CharsetReg</c>; when nothing is
/// declared there either, GB2312 is tried and UTF-8 is the last resort.
/// </remarks>
/// <param name="url">Address to request.</param>
/// <param name="postData">Form body to POST; null or blank sends a GET instead.</param>
/// <param name="cookies">Cookie container sent with the request. When non-null, cookies found in the response are added to it.</param>
/// <param name="userAgent">User-Agent header; blank lets the request helpers pick their default.</param>
/// <param name="referer">Referer header.</param>
/// <param name="cookiesDomain">Domain applied to response cookies that declare none; blank uses the host of the response URI.</param>
/// <param name="encode">Encoding used to decode the body; null enables detection.</param>
/// <returns>The decoded body, or an empty string when the request or decoding fails.</returns>
public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
{
    // Number of leading body bytes inspected for a charset declaration.
    const int HeadSniffLimit = 10000;

    try
    {
        HttpWebResponse httpResponse = string.IsNullOrWhiteSpace(postData)
            ? CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer)
            : CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);

        // Everything that touches the response (body, headers, URI) happens before it is disposed.
        using (httpResponse)
        {
            string content;
            using (Stream body = OpenDecodedStream(httpResponse))
            {
                byte[] head = ReadHead(body, HeadSniffLimit);
                if (encode == null)
                {
                    encode = DetectEncoding(httpResponse.CharacterSet, head);
                }

                // Decode head + remainder in one pass so a multi-byte character that straddles
                // the sniffing boundary is not split between two decoders.
                using (var buffer = new MemoryStream())
                {
                    buffer.Write(head, 0, head.Length);
                    body.CopyTo(buffer);
                    content = encode.GetString(buffer.ToArray());
                }
            }

            MergeResponseCookies(httpResponse, cookies, cookiesDomain);
            return content;
        }
    }
    catch
    {
        // Best-effort contract: any failure (network, decompression, decoding) yields an empty
        // string rather than an exception or exception text mixed into page content.
        return string.Empty;
    }
}

// Wraps the response stream in the decompressor named by the Content-Encoding header, if any.
private static Stream OpenDecodedStream(HttpWebResponse response)
{
    Stream raw = response.GetResponseStream();
    switch ((response.ContentEncoding ?? string.Empty).ToUpperInvariant())
    {
        case "GZIP":
            return new GZipStream(raw, CompressionMode.Decompress);
        case "DEFLATE":
            return new DeflateStream(raw, CompressionMode.Decompress);
        default:
            return raw;
    }
}

// Reads bytes until "</head>" has just been read, maxBytes were read, or the stream ends.
private static byte[] ReadHead(Stream body, int maxBytes)
{
    const string Marker = "</head>";
    var head = new MemoryStream();
    var window = new StringBuilder(Marker.Length);

    while (head.Length < maxBytes)
    {
        // ReadByte returns -1 at end of stream; the value must stay an int until that is checked.
        int next = body.ReadByte();
        if (next < 0)
        {
            break;
        }

        head.WriteByte((byte)next);

        // Sliding window over the last Marker.Length bytes, widened 1:1 to chars.
        if (window.Length == Marker.Length)
        {
            window.Remove(0, 1);
        }
        window.Append((char)next);

        if (window.Length == Marker.Length
            && string.Equals(window.ToString(), Marker, StringComparison.OrdinalIgnoreCase))
        {
            break;
        }
    }

    return head.ToArray();
}

// Chooses the body encoding from the declared charset, then the sniffed head, then fallbacks; never returns null.
private static Encoding DetectEncoding(string declaredCharset, byte[] head)
{
    bool uninformative = string.IsNullOrWhiteSpace(declaredCharset)
        || string.Equals(declaredCharset, "ISO-8859-1", StringComparison.OrdinalIgnoreCase)
        || string.Equals(declaredCharset, "zh-cn", StringComparison.OrdinalIgnoreCase);

    if (!uninformative)
    {
        Encoding declared = TryGetEncoding(declaredCharset);
        if (declared != null)
        {
            return declared;
        }
    }

    // Bytes are widened 1:1 to chars, which is sufficient to find an ASCII charset declaration.
    string headText = new string(Array.ConvertAll(head, b => (char)b));
    Match match = Regex.Match(headText, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    if (match.Success)
    {
        return TryGetEncoding(match.Groups["Charset"].Value) ?? Encoding.UTF8;
    }

    // No declaration anywhere: keep the historical GB2312 default when the platform provides it.
    return TryGetEncoding("GB2312") ?? Encoding.UTF8;
}

// Resolves an encoding by name, returning null instead of throwing for blank or unknown names.
private static Encoding TryGetEncoding(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim().Trim('"', '\''));
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}

// Adds the cookies carried by the response to the caller's container (no-op when the caller passed none).
private static void MergeResponseCookies(HttpWebResponse response, CookieContainer cookies, string cookiesDomain)
{
    if (cookies == null)
    {
        return;
    }

    try
    {
        if (string.IsNullOrWhiteSpace(cookiesDomain))
        {
            cookiesDomain = response.ResponseUri.Host;
        }

        // SetCookie returns null when it cannot parse the Set-Cookie headers; fall back to the
        // cookies the framework already parsed.
        CookieCollection parsed = SetCookie(response, cookiesDomain);
        cookies.Add(parsed ?? response.Cookies);
    }
    catch (CookieException)
    {
        // A cookie the container rejects must not discard the page that was already downloaded.
    }
    catch (ArgumentException)
    {
        // Same reasoning: e.g. a cookie without a usable domain.
    }
}
142
143
/// <summary>
/// Sends an HTTP GET request and returns the response.
/// </summary>
/// <param name="url">Address to request; must be an http or https URL.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <param name="userAgent">User-Agent header; blank selects a built-in Chrome 34 string.</param>
/// <param name="cookies">Cookie container attached to the request; null attaches a new empty one.</param>
/// <param name="referer">Referer header.</param>
/// <returns>The response; the caller is responsible for closing it.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
/// <exception cref="NotSupportedException"><paramref name="url"/> does not produce an HTTP request.</exception>
public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
{
    const string DefaultUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36";

    HttpWebRequest request = CreateRequest(url, "GET", timeout, userAgent, DefaultUserAgent, cookies, referer);
    return request.GetResponse() as HttpWebResponse;
}

/// <summary>
/// Sends an HTTP POST request with a form-urlencoded body and returns the response.
/// </summary>
/// <param name="url">Address to request; must be an http or https URL.</param>
/// <param name="postData">Request body, written as UTF-8. A null or blank value sends no body.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <param name="userAgent">User-Agent header; blank selects a built-in Chrome 36 string.</param>
/// <param name="cookies">Cookie container attached to the request; null attaches a new empty one.</param>
/// <param name="referer">Referer header.</param>
/// <returns>The response; the caller is responsible for closing it.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
/// <exception cref="NotSupportedException"><paramref name="url"/> does not produce an HTTP request.</exception>
public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
{
    const string DefaultUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";

    HttpWebRequest request = CreateRequest(url, "POST", timeout, userAgent, DefaultUserAgent, cookies, referer);
    request.ContentType = "application/x-www-form-urlencoded";

    if (!string.IsNullOrWhiteSpace(postData))
    {
        byte[] data = Encoding.UTF8.GetBytes(postData);
        request.ContentLength = data.Length;
        using (Stream stream = request.GetRequestStream())
        {
            stream.Write(data, 0, data.Length);
        }
    }

    return request.GetResponse() as HttpWebResponse;
}

// Builds a request with the settings shared by the GET and POST helpers.
private static HttpWebRequest CreateRequest(string url, string method, int timeout, string userAgent, string defaultUserAgent, CookieContainer cookies, string referer)
{
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }

    if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
    {
        // NOTE(review): this assigns a process-wide callback and replaces any callback installed
        // elsewhere. CheckValidationResult accepts a certificate only when no policy errors were
        // reported, so self-signed certificates are still rejected.
        ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
    }

    // WebRequest.Create returns other request types for non-HTTP schemes (file:, ftp:).
    var request = WebRequest.Create(url) as HttpWebRequest;
    if (request == null)
    {
        throw new NotSupportedException("Only http and https URLs are supported: " + url);
    }

    request.Method = method;
    request.Referer = referer;
    request.UserAgent = string.IsNullOrWhiteSpace(userAgent) ? defaultUserAgent : userAgent;
    request.Timeout = timeout;
    request.KeepAlive = true;
    request.AllowAutoRedirect = true;
    request.CookieContainer = cookies ?? new CookieContainer();
    return request;
}
241
/// <summary>
/// Server-certificate validation callback: the certificate is accepted only when
/// no SSL policy errors were reported for it.
/// </summary>
/// <param name="sender">Object that raised the validation request (not inspected).</param>
/// <param name="certificate">Certificate presented by the server (not inspected).</param>
/// <param name="chain">Certificate chain (not inspected).</param>
/// <param name="errors">Policy errors found for the certificate.</param>
/// <returns>true when <paramref name="errors"/> is <see cref="SslPolicyErrors.None"/>; otherwise false.</returns>
private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
{
    return errors == SslPolicyErrors.None;
}
256
/// <summary>
/// Parses the Set-Cookie headers of a response into cookies, including HttpOnly and Secure flags.
/// </summary>
/// <param name="response">Response whose "Set-Cookie" header values are read.</param>
/// <param name="defaultDomain">Domain assigned to cookies that do not declare one.</param>
/// <returns>
/// The parsed cookies (possibly empty), or null when the response has no Set-Cookie header or the
/// headers cannot be read. This method does not throw; null tells the caller to fall back to
/// another cookie source.
/// </returns>
private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
{
    try
    {
        if (response == null)
        {
            return null;
        }

        string[] headerValues = response.Headers.GetValues("Set-Cookie");
        if (headerValues == null || headerValues.Length == 0)
        {
            return null;
        }

        var cookies = new CookieCollection();
        foreach (string header in RejoinSplitExpires(headerValues))
        {
            Cookie cookie = ParseSingleCookie(header, defaultDomain);
            if (cookie != null)
            {
                cookies.Add(cookie);
            }
        }
        return cookies;
    }
    catch (Exception)
    {
        // Preserves the original never-throws contract (e.g. headers of a disposed response).
        return null;
    }
}

// Re-joins header pieces that were split at the comma inside "expires=Wed, 21 Oct 2015 ...".
private static List<string> RejoinSplitExpires(IEnumerable<string> pieces)
{
    var merged = new List<string>();
    foreach (string piece in pieces)
    {
        if (piece == null)
        {
            continue;
        }

        int last = merged.Count - 1;

        // A piece that ends with "expires=<weekday>" was cut in the middle of its date, so the
        // next piece is the rest of that date (and of that cookie), not a new cookie.
        if (last >= 0 && Regex.IsMatch(merged[last], @"expires\s*=\s*[A-Za-z]{3,9}\s*$", RegexOptions.IgnoreCase))
        {
            merged[last] = merged[last] + ", " + piece.Trim();
        }
        else
        {
            merged.Add(piece);
        }
    }
    return merged;
}

// Parses one "name=value; attr=...; flag" header value; returns null when it has no usable name or is rejected by Cookie.
private static Cookie ParseSingleCookie(string header, string defaultDomain)
{
    string[] segments = header.Split(';');

    // The first segment is always the cookie's own name=value pair.
    string first = segments[0];
    int equals = first.IndexOf('=');
    if (equals <= 0)
    {
        return null;
    }

    string name = first.Substring(0, equals).Trim();
    if (name.Length == 0)
    {
        return null;
    }

    try
    {
        var cookie = new Cookie();
        cookie.Name = name;
        cookie.Value = first.Substring(equals + 1).Trim();

        for (int i = 1; i < segments.Length; i++)
        {
            string segment = segments[i].Trim();
            if (segment.Length == 0)
            {
                continue;
            }

            int separator = segment.IndexOf('=');
            string key = (separator < 0 ? segment : segment.Substring(0, separator)).Trim().ToLowerInvariant();
            string value = separator < 0 ? string.Empty : segment.Substring(separator + 1).Trim();

            switch (key)
            {
                case "path":
                    cookie.Path = value;
                    break;
                case "domain":
                    cookie.Domain = value;
                    break;
                case "expires":
                    // Culture-independent parse; an unreadable date leaves the cookie as a session cookie.
                    DateTime expires;
                    if (DateTime.TryParse(
                            value,
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.AssumeUniversal | System.Globalization.DateTimeStyles.AdjustToUniversal,
                            out expires))
                    {
                        cookie.Expires = expires;
                    }
                    break;
                case "httponly":
                    cookie.HttpOnly = true;
                    break;
                case "secure":
                    cookie.Secure = true;
                    break;
                default:
                    // Other attributes (max-age, samesite, version, ...) are ignored; they must
                    // never overwrite the cookie's name or value.
                    break;
            }
        }

        if (string.IsNullOrEmpty(cookie.Domain))
        {
            cookie.Domain = defaultDomain;
        }
        return cookie;
    }
    catch (CookieException)
    {
        // One malformed cookie is skipped instead of discarding every cookie in the response.
        return null;
    }
}
337
/// <summary>
/// Flattens a <see cref="CookieContainer"/> into one "name=value;name=value" string per entry of
/// the container's internal table.
/// </summary>
/// <remarks>
/// The container's non-public field "m_domainTable" and, for each of its values, the non-public
/// field "m_list" are read through reflection.
/// </remarks>
/// <param name="cookieContainer">Container to enumerate.</param>
/// <returns>A dictionary keyed by the internal table's keys, each mapped to its cookies joined with ';'.</returns>
public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
{
    const System.Reflection.BindingFlags PrivateInstanceField =
        System.Reflection.BindingFlags.NonPublic
        | System.Reflection.BindingFlags.GetField
        | System.Reflection.BindingFlags.Instance;

    var result = new Dictionary<string, string>();

    var domainTable = (Hashtable)cookieContainer.GetType().InvokeMember(
        "m_domainTable", PrivateInstanceField, null, cookieContainer, new object[] { });

    foreach (string tableKey in domainTable.Keys)
    {
        object entry = domainTable[tableKey];
        var collections = (SortedList)entry.GetType().InvokeMember(
            "m_list", PrivateInstanceField, null, entry, new object[] { });

        var joined = new StringBuilder();
        foreach (CookieCollection bucket in collections.Values)
        {
            foreach (Cookie cookie in bucket)
            {
                joined.Append(cookie.Name).Append("=").Append(cookie.Value).Append(";");
            }
        }

        // Every pair is written with a trailing ';', so the final separator is trimmed off.
        result.Add(tableKey, joined.ToString().TrimEnd(';'));
    }

    return result;
}
365
/// <summary>
/// Builds a <see cref="CookieContainer"/> from per-domain cookie strings.
/// </summary>
/// <param name="cookies">
/// Map from cookie domain to a "name=value;name=value" string. Whitespace around names and values
/// is ignored; segments without a name or without '=' are skipped.
/// </param>
/// <returns>A new container holding one cookie per valid pair, each with path "/" and the entry's key as domain.</returns>
/// <exception cref="ArgumentNullException"><paramref name="cookies"/> is null.</exception>
public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
{
    if (cookies == null)
    {
        throw new ArgumentNullException("cookies");
    }

    var container = new CookieContainer();

    foreach (KeyValuePair<string, string> entry in cookies)
    {
        if (string.IsNullOrEmpty(entry.Value))
        {
            continue;
        }

        foreach (string rawPair in entry.Value.Split(';'))
        {
            // Only the first '=' separates name from value; the value may itself contain '='.
            int equals = rawPair.IndexOf('=');
            if (equals <= 0)
            {
                continue;
            }

            // Browser-style strings use "; " between pairs, and Cookie rejects names containing spaces.
            string name = rawPair.Substring(0, equals).Trim();
            if (name.Length == 0)
            {
                continue;
            }

            string value = rawPair.Substring(equals + 1).Trim();
            container.Add(new Cookie(name, value, "/", entry.Key));
        }
    }

    return container;
}
398
/// <summary>
/// Builds a form-urlencoded POST body from the input elements of an HTML page.
/// </summary>
/// <param name="htmlContent">HTML source of the page that contains the form.</param>
/// <returns>
/// URL-encoded "name=value" pairs joined with '&amp;' for every input that has both a name and a
/// value attribute; an empty string when the page has no form or no such inputs.
/// </returns>
public static string BuildPostData(string htmlContent)
{
    if (string.IsNullOrWhiteSpace(htmlContent))
    {
        return string.Empty;
    }

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlContent);

    HtmlNode form = htmlDoc.DocumentNode.SelectSingleNode("//form");
    if (form == null)
    {
        return string.Empty;
    }

    // NOTE(review): "//input" is an absolute XPath, so inputs are collected from the whole
    // document rather than only from inside the first form. Left unchanged on purpose: some
    // HtmlAgilityPack versions reportedly parse <form> as an empty element, in which case a
    // relative ".//input" would find nothing — confirm against the referenced version before scoping.
    HtmlNodeCollection inputs = form.SelectNodes("//input");
    if (inputs == null)
    {
        return string.Empty;
    }

    var pairs = new List<string>();
    foreach (HtmlNode input in inputs)
    {
        HtmlAttribute name = input.Attributes["name"];
        HtmlAttribute value = input.Attributes["value"];
        if (name == null || value == null || string.IsNullOrEmpty(name.Value))
        {
            continue;
        }

        // Unencoded '+', '=', '&' (common in base64 fields such as __VIEWSTATE) would corrupt the body.
        pairs.Add(WebUtility.UrlEncode(name.Value) + "=" + WebUtility.UrlEncode(value.Value));
    }

    return string.Join("&", pairs);
}
416 }
417 }
部分網站需要登錄的問題,我已經著手通過另一個項目(imitate-login)來解決。目前還有許多網頁使用JavaScript或各種基於JS的框架來加載數據,如何模擬執行JavaScript,我暫時還沒找到比較優美的解決方案;如果大家有什麼好的方案,歡迎發給我,謝謝!
未經授權,拒絕任何全文及摘要轉載!