在博客園學習知識是很方便的,但若做成客戶端,自定義獲取數據,那就更好啦!
那麼需求有哪些呢,第一,我只查看推薦數大於2的文章;第二,我想要只查看C#或者Java的文章;第三,我想要查看推薦數大於2的新聞;第四,我還想搜索文章,並且只搜索推薦數大於2的文章。
先來預覽一下成品吧

其中列表裡左邊是推薦數,反正我是優先看推薦數多的,中間是標題,右邊是日期,至於其他信息,額,我其實不太關心,點擊一行後直接在瀏覽器打開。
額,大體先這樣吧,那麼實現這些功能需要什麼技能呢,首先我得準備一下通用類,大概需要web請求的幫助類、Gzip格式網頁的壓縮/解壓幫助類、html字符串解析的幫助類。
/// <summary>
/// Thin wrapper around <see cref="WebClient"/> that retries failed requests a few
/// times and returns <c>null</c> instead of throwing when a request ultimately fails.
/// </summary>
public class WebHelper
{
    /// <summary>Underlying client, exposed so callers can tweak headers, proxy, etc.</summary>
    public readonly WebClient Web = new WebClient();

    // Total number of attempts made for one request before giving up.
    private const int MaxAttempts = 3;

    /// <summary>Text encoding used by the underlying client for string requests.</summary>
    public Encoding Encoding
    {
        get { return Web.Encoding; }
        set { Web.Encoding = value; }
    }

    /// <summary>Creates a helper whose client uses UTF-8.</summary>
    public WebHelper()
    {
        Web.Encoding = Encoding.UTF8;
    }

    /// <summary>Creates a helper whose client uses the given encoding.</summary>
    /// <param name="encoding">Encoding assigned to the underlying client.</param>
    public WebHelper(Encoding encoding)
    {
        Web.Encoding = encoding;
    }

    /// <summary>
    /// Downloads the requested resource as a string.
    /// </summary>
    /// <param name="url">URL</param>
    /// <returns>
    /// The response text, or <c>null</c> when the failure is not worth retrying
    /// (404 / connect failure / protocol error) or all attempts failed.
    /// </returns>
    public string DownloadString(string url)
    {
        // The attempt counter is local, so a request that succeeds after a retry
        // no longer reduces the retry budget of the next request.
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return Web.DownloadString(url);
            }
            catch (WebException e)
            {
                if (IsPermanentFailure(e) || attempt >= MaxAttempts)
                {
                    return null;
                }
            }
        }
    }

    /// <summary>
    /// POSTs the given form-encoded string to the given address.
    /// </summary>
    /// <param name="address">Target address.</param>
    /// <param name="data">Form-encoded request body.</param>
    /// <returns>The response text, or <c>null</c> when all attempts failed.</returns>
    public string UploadString(string address, string data)
    {
        for (int attempt = 1; ; attempt++)
        {
            // Assign (not Add) so a retry cannot append a duplicate header value.
            Web.Headers["Content-Type"] = "application/x-www-form-urlencoded";
            try
            {
                return Web.UploadString(address, "POST", data);
            }
            catch (WebException)
            {
                // Only network-level failures are retried; argument errors and
                // other programming mistakes are allowed to surface.
                if (attempt >= MaxAttempts)
                {
                    return null;
                }
            }
        }
    }

    /// <summary>
    /// Downloads the requested resource, asking the server for a gzip-compressed response.
    /// </summary>
    /// <param name="url">URL</param>
    /// <param name="encoding">Encoding used to decode the (decompressed) page bytes.</param>
    /// <returns>
    /// The decoded page text, or <c>null</c> when the failure is not worth retrying
    /// (404 / connect failure / protocol error) or all attempts failed.
    /// </returns>
    public string DownloadGzipString(string url, Encoding encoding)
    {
        try
        {
            for (int attempt = 1; ; attempt++)
            {
                // Assign (not Add) so a retry cannot produce "gzip,gzip".
                Web.Headers["Accept-Encoding"] = "gzip";
                try
                {
                    byte[] payload = Web.DownloadData(url);
                    // A server may ignore Accept-Encoding and answer uncompressed;
                    // only decompress when the body really is a gzip stream.
                    if (LooksLikeGzip(payload))
                    {
                        payload = ZipHelper.GzipDecompress(payload);
                    }
                    return encoding.GetString(payload);
                }
                catch (WebException e)
                {
                    if (IsPermanentFailure(e) || attempt >= MaxAttempts)
                    {
                        return null;
                    }
                }
            }
        }
        finally
        {
            // Do not leak the header into later plain-text requests.
            Web.Headers.Remove("Accept-Encoding");
        }
    }

    // True for failures the original code treated as not worth retrying.
    private static bool IsPermanentFailure(WebException e)
    {
        return e.Message.Contains("404")
               || e.Status == WebExceptionStatus.ConnectFailure
               || e.Status == WebExceptionStatus.ProtocolError;
    }

    // True when the buffer starts with the gzip magic number (0x1F 0x8B).
    private static bool LooksLikeGzip(byte[] payload)
    {
        return payload != null && payload.Length >= 2 && payload[0] == 0x1F && payload[1] == 0x8B;
    }
}
這裡有三個方法,其中的DownloadString和UploadString和.NET Framework的WebClient的同名方法用法一樣,另外多了一個DownloadGzipString方法,這個方法用於get一個用Gzip壓縮的頁面。之所以重複寫DownloadString和UploadString是因為我懶:有時候請求網頁出現異常並不是該網頁不能請求,多請求幾次就能獲取,所以這裡自動嘗試3次請求,3次請求過後依然失敗則返回null。當然還有一種情況是需要用代理的,考慮到需要用代理的地方不多,並且代理的IP端口一般需要花錢來買,這裡就不貼用代理來請求頁面的代碼了。之前買過兩天耍過代理,我那時候的實現思路就是加一個ProxyPool代理池類,代理池從代理網站獲取當前可用的代理,一般是一次獲取十幾個,然後放入代理池,請求需要代理的網站時就去代理池獲取代理,WebClient.Proxy = new WebProxy(host, port);加了這個再去請求頁面就可以了。當然代理不一定可靠,所以當失敗後不要灰心,再用其他代理試試,總有一個成功的。當需要多線程請求網頁時,就new多個WebHelper類,他們都會共用一個ProxyPool代理池的。
/// <summary>
/// Helpers for compressing and decompressing byte buffers in gzip format.
/// </summary>
public class ZipHelper
{
    /// <summary>
    /// Compresses a buffer with gzip.
    /// </summary>
    /// <param name="cbytes">Raw data to compress.</param>
    /// <returns>The gzip-compressed bytes.</returns>
    public static byte[] GzipCompress(byte[] cbytes)
    {
        using (var target = new MemoryStream())
        {
            using (var packer = new GZipStream(target, CompressionMode.Compress))
            {
                // Everything written to the gzip stream lands compressed in 'target'.
                packer.Write(cbytes, 0, cbytes.Length);
            }
            // The gzip stream must be closed before reading so the trailer is flushed.
            return target.ToArray();
        }
    }

    /// <summary>
    /// Decompresses a gzip buffer.
    /// </summary>
    /// <param name="cbytes">Gzip-compressed data.</param>
    /// <returns>The decompressed bytes.</returns>
    public static byte[] GzipDecompress(byte[] cbytes)
    {
        using (var unpacked = new MemoryStream())
        using (var packed = new MemoryStream(cbytes))
        using (var unpacker = new GZipStream(packed, CompressionMode.Decompress))
        {
            // Reading from the gzip stream yields decompressed bytes; drain it fully.
            unpacker.CopyTo(unpacked);
            return unpacked.ToArray();
        }
    }
}

/// <summary>
/// Delimiter-based string extraction helpers used to pull values out of HTML text.
/// </summary>
public class StringHelper
{
    // Matches a \uXXXX escape with exactly four hex digits (the prefix is matched case-insensitively).
    private static readonly Regex UnicodeEscapeRegex = new Regex(
        @"\\u([0-9a-f]{4})",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // Matches any HTML tag, opening or closing.
    private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled);

    /// <summary>
    /// Collects every value found between <paramref name="startStr"/> and <paramref name="endStr"/>.
    /// </summary>
    /// <param name="str">Text to scan; <c>null</c> or empty yields an empty list.</param>
    /// <param name="startStr">Start delimiter.</param>
    /// <param name="endStr">End delimiter.</param>
    /// <param name="remove">When true the delimiters are stripped from each value.</param>
    /// <returns>The extracted values in order of appearance.</returns>
    public static List<string> GetList(string str, string startStr, string endStr, bool remove = true)
    {
        var lst = new List<string>();
        int cursor = 0;
        while (true)
        {
            int previous = cursor;
            string v = GetVal(str, startStr, endStr, remove, ref cursor);
            // Stop at "not found", and also when the cursor did not advance
            // (both delimiters empty), which would otherwise loop forever.
            if (cursor == -1 || cursor <= previous)
            {
                break;
            }
            lst.Add(v);
        }
        return lst;
    }

    /// <summary>
    /// Returns the first value between <paramref name="startStr"/> and <paramref name="endStr"/>
    /// at or after <paramref name="startIndex"/>, or an empty string when there is none.
    /// </summary>
    public static string GetVal(string str, string startStr, string endStr, bool remove = true, int startIndex = 0)
    {
        return GetVal(str, startStr, endStr, remove, ref startIndex);
    }

    // Core delimiter search. On success startIndex is moved just past the end
    // delimiter; on failure it is set to -1 and an empty string is returned.
    private static string GetVal(string str, string startStr, string endStr, bool remove, ref int startIndex)
    {
        if (string.IsNullOrEmpty(str) || startIndex < 0 || startIndex > str.Length)
        {
            startIndex = -1;
            return string.Empty;
        }
        // Ordinal for both searches: the delimiters are markup, not linguistic text,
        // and a culture-sensitive match could disagree with startStr.Length.
        int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
        if (istart == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        int iend = str.IndexOf(endStr, istart + startStr.Length, StringComparison.Ordinal);
        if (iend == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        startIndex = iend + endStr.Length;
        if (remove)
        {
            istart += startStr.Length;
            return str.Substring(istart, iend - istart);
        }
        return str.Substring(istart, startIndex - istart);
    }

    /// <summary>
    /// Collects every fixed-length value that follows an occurrence of <paramref name="startStr"/>.
    /// </summary>
    /// <param name="str">Text to scan; <c>null</c> or empty yields an empty list.</param>
    /// <param name="startStr">Start delimiter.</param>
    /// <param name="needLength">Number of characters to take after the start delimiter.</param>
    /// <param name="remove">When true the start delimiter is stripped from each value.</param>
    /// <returns>The extracted values in order of appearance.</returns>
    public static List<string> GetList(string str, string startStr, int needLength, bool remove = true)
    {
        var lst = new List<string>();
        int cursor = 0;
        while (true)
        {
            int previous = cursor;
            string v = GetVal(str, startStr, needLength, remove, ref cursor);
            // Same termination rule as the delimiter overload: stop when nothing
            // was found or when the cursor could not advance.
            if (cursor == -1 || cursor <= previous)
            {
                break;
            }
            lst.Add(v);
        }
        return lst;
    }

    /// <summary>
    /// Returns <paramref name="needLength"/> characters following the first occurrence of
    /// <paramref name="startStr"/> at or after <paramref name="startIndex"/>, or an empty string.
    /// </summary>
    public static string GetVal(string str, string startStr, int needLength, bool remove = true, int startIndex = 0)
    {
        return GetVal(str, startStr, needLength, remove, ref startIndex);
    }

    /// <summary>
    /// Fixed-length variant that also reports where the search ended: on success
    /// <paramref name="startIndex"/> is moved just past the extracted value, on failure it is set to -1.
    /// </summary>
    public static string GetVal(string str, string startStr, int needLength, bool remove, ref int startIndex)
    {
        if (string.IsNullOrEmpty(str) || startIndex < 0 || startIndex > str.Length)
        {
            startIndex = -1;
            return string.Empty;
        }
        int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
        if (istart == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        startIndex = istart + startStr.Length + needLength;
        if (startIndex > str.Length)
        {
            // Not enough characters left after the delimiter.
            startIndex = -1;
            return string.Empty;
        }
        return remove
            ? str.Substring(istart + startStr.Length, needLength)
            : str.Substring(istart, startStr.Length + needLength);
    }

    /// <summary>
    /// Returns every double-quoted href value in the text.
    /// </summary>
    /// <param name="str">Text to scan.</param>
    /// <returns>The href values in order of appearance.</returns>
    public static List<string> GetUrls(string str)
    {
        return GetList(str, "href=\"", "\"");
    }

    /// <summary>
    /// Returns the first double-quoted href value in the text, or an empty string.
    /// </summary>
    /// <param name="str">Text to scan.</param>
    /// <returns>The first href value.</returns>
    public static string GetUrl(string str)
    {
        return GetVal(str, "href=\"", "\"");
    }

    /// <summary>
    /// Decodes the <c>\uXXXX</c> escapes found in the text and returns the decoded
    /// characters concatenated. Text outside the escapes is not copied to the result.
    /// </summary>
    /// <param name="str">Text containing <c>\uXXXX</c> escapes; <c>null</c> yields an empty string.</param>
    /// <returns>The decoded characters.</returns>
    public static string ToGB2312(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }
        var decoded = new StringBuilder();
        foreach (Match m in UnicodeEscapeRegex.Matches(str))
        {
            // Appending the UTF-16 code unit directly keeps surrogate pairs intact,
            // which decoding each escape separately as two bytes did not.
            int codeUnit = int.Parse(m.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
            decoded.Append((char) codeUnit);
        }
        return decoded.ToString();
    }

    /// <summary>
    /// Removes every HTML tag from the text, leaving the text between tags.
    /// </summary>
    /// <param name="html">HTML fragment.</param>
    /// <returns>The text with all tags removed.</returns>
    public static string RemoveHTMLTags(string html)
    {
        // "<[^>]+>" already covers closing tags, so one pattern is enough.
        return HtmlTagRegex.Replace(html, "");
    }
}
View Code
這裡主要包含了GetList、RemoveHTMLTags和GetVal方法,爬蟲解析數據就靠他們了,具體的使用方法下面會有講解。
到這裡通用類大體就介紹完了,現在開始實地施工。
/// <summary>
/// Loads one page of blog posts and appends those with more than 2 recommendations to the list.
/// </summary>
/// <param name="pageIndex">Page number appended to the request's PageIndex form field.</param>
/// <returns><c>false</c> when the request returned nothing or the page had no posts; otherwise <c>true</c>.</returns>
private bool AddPost(int pageIndex)
{
    var url = "https://www.cnblogs.com/mvc/AggSite/PostList.aspx";
    var html = _web.UploadString(url, GetUrl() + pageIndex);
    // The request helper can hand back null when the request fails; treat that as "no more data".
    if (string.IsNullOrEmpty(html))
    {
        return false;
    }
    var posts = StringHelper.GetList(html, "\"post_item", "\"article_comment");
    if (posts.Count == 0)
    {
        return false;
    }
    foreach (var item in posts)
    {
        var n = StringHelper.GetVal(item, "\"diggnum", "/span>");
        // Skip items whose recommendation count is missing or not a number
        // instead of letting a FormatException abort the whole page.
        int diggnum;
        if (!int.TryParse(StringHelper.GetVal(n, ">", "<"), out diggnum) || diggnum < 3)
        {
            continue;
        }
        var t = StringHelper.GetVal(item, "\"titlelnk", "/a>");
        var title = StringHelper.GetVal(t, ">", "<");
        // Take the 16 characters that follow the "published at" marker.
        var time = StringHelper.GetVal(item, "發布於 ", 16);
        // _urls and lstPost are filled in lockstep so a list row index maps to its link.
        _urls.Add(StringHelper.GetUrl(t));
        lstPost.Items.Add($"{diggnum} {title} {time}");
    }
    return true;
}
/// <summary>
/// Loads one page of search results for the text in the search box and appends them to the list.
/// </summary>
/// <param name="pageIndex">Result page number.</param>
/// <returns><c>false</c> when the request returned nothing or the page had no results; otherwise <c>true</c>.</returns>
private bool AddSearchPost(int pageIndex)
{
    // Escape the keywords so characters such as '&', '#', '+' or spaces
    // cannot break or alter the query string.
    var keywords = System.Uri.EscapeDataString(txtSearch.Text.Trim());
    var url = $"http://zzk.cnblogs.com/s/blogpost?Keywords={keywords}&pageindex={pageIndex}";
    var html = _web.DownloadGzipString(url, Encoding.UTF8);
    // The request helper can hand back null when the request fails; treat that as "no more data".
    if (string.IsNullOrEmpty(html))
    {
        return false;
    }
    var posts = StringHelper.GetList(html, "\"searchItem", "\"searchItemInfo-comments");
    if (posts.Count == 0)
    {
        return false;
    }
    foreach (var item in posts)
    {
        // NOTE(review): the accompanying article says search should only list posts with more
        // than 2 recommendations, but no such filter is applied here — confirm the intent.
        var diggnum = StringHelper.GetVal(item, ">推薦(", ")");
        var n = StringHelper.GetVal(item, "searchItemTitle\">", "</h3>");
        // Titles may contain nested markup (presumably keyword highlighting), so strip tags.
        var title = StringHelper.RemoveHTMLTags(StringHelper.GetVal(n, "\">", "</a>"));
        var date = StringHelper.GetVal(item, "searchItemInfo-publishDate\">", "</span>");
        // _urls and lstPost are filled in lockstep so a list row index maps to its link.
        _urls.Add(StringHelper.GetUrl(n));
        lstPost.Items.Add($"{diggnum} {title} {date}");
    }
    return true;
}
/// <summary>
/// Loads one page of news items and appends those with more than 2 recommendations to the list.
/// </summary>
/// <param name="pageIndex">Page number appended to the request's PageIndex form field.</param>
/// <returns><c>false</c> when the request returned nothing or the page had no items; otherwise <c>true</c>.</returns>
private bool AddNews(int pageIndex)
{
    var url = "https://www.cnblogs.com/mvc/AggSite/NewsList.aspx";
    // NOTE(review): ItemListActionName appears twice in the form body; it is kept as-is because
    // the server's handling of the duplicate is not visible here — confirm and drop one copy.
    var html = _web.UploadString(url, "CategoryId=-1&CategoryType=News&ItemListActionName=NewsList&ItemListActionName=NewsList&PageIndex=" + pageIndex);
    // The request helper can hand back null when the request fails; treat that as "no more data".
    if (string.IsNullOrEmpty(html))
    {
        return false;
    }
    var posts = StringHelper.GetList(html, "\"post_item", "\"article_comment");
    if (posts.Count == 0)
    {
        return false;
    }
    foreach (var item in posts)
    {
        var n = StringHelper.GetVal(item, "\"diggnum", "/span>");
        // Skip items whose recommendation count is missing or not a number
        // instead of letting a FormatException abort the whole page.
        int diggnum;
        if (!int.TryParse(StringHelper.GetVal(n, ">", "<"), out diggnum) || diggnum < 3)
        {
            continue;
        }
        var t = StringHelper.GetVal(item, "\"titlelnk", "/a>");
        var title = StringHelper.GetVal(t, ">", "<");
        // Take the 16 characters that follow the "published at" marker.
        var time = StringHelper.GetVal(item, "發布於 ", 16);
        var link = StringHelper.GetUrl(t);
        // Links without a scheme get "https:" prepended. Test the prefix rather than
        // "contains", so a link that merely has "http" somewhere in its path is still fixed up.
        if (!link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            link = "https:" + link;
        }
        // _urls and lstPost are filled in lockstep so a list row index maps to its link.
        _urls.Add(link);
        lstPost.Items.Add($"{diggnum} {title} {time}");
    }
    return true;
}
授人以魚不如授人以漁,這些是怎麼回事呢

在博客園首頁按下F12,點擊下一頁,看看那些請求,瞄一瞄,就知道PostList.aspx是數據關鍵,裡面的參數中CategoryId是分類ID,CategoryType是分類種類,從下面的代碼可以看到有SiteHome、TopSiteCategory和SiteCategory三個值:不選分類(首頁)時是SiteHome,當點擊母分類時,這個值就是TopSiteCategory,當點擊子分類時,這個值就是SiteCategory,PageIndex當前頁這個眾所周知啦,ParentCategoryId是父分類的ID,只有點擊子分類時需要把父分類的ID賦值到這個字段。說了這麼多,這個還只是獲取文章的接口,另外兩個查詢文章的和獲取新聞的也大同小異啦,大家自己研究。另外貼出的代碼裡有個GetUrl方法,這個就是為了賦值這些參數的,也貼出來吧

/// <summary>
/// Builds the form body for a post-list request from the two category combo boxes.
/// The result ends with "PageIndex=" so the caller can append the page number.
/// </summary>
/// <returns>The form-encoded request body without the page number.</returns>
private string GetUrl()
{
    // Parent category ids, indexed by cbbCate.SelectedIndex.
    string[] parentIds =
    {
        "108698", "2", "108701", "108703", "108704",
        "108705", "108709", "108712", "108724", "4"
    };

    // Sub-category ids: first index is cbbCate.SelectedIndex, second is cbbType.SelectedIndex.
    string[][] childIds =
    {
        new[]
        {
            "18156", "108699", "108700", "108760", "108716", "108717", "108718", "108719",
            "108720", "108728", "108729", "108730", "108738", "108739", "108758"
        },
        new[]
        {
            "106876", "106880", "106882", "106877", "108696", "106894", "108735",
            "108746", "108748", "108751", "108752", "108753", "108742", "108754"
        },
        new[] { "106892", "108702", "106884", "108750" },
        new[] { "106883", "106893", "108731", "108737" },
        new[] { "78111", "50349", "106878", "108732", "108734", "108747", "108749", "3" },
        new[] { "108706", "108707", "108736", "108708", "106886" },
        new[] { "108710", "106891", "106889" },
        new[] { "108713", "108714", "108715", "108743", "108756", "106881" },
        new[] { "108721", "108725", "108726", "108755", "108757" },
        new[] { "807", "106879", "33909", "106885", "106895", "108759" }
    };

    // Values used when no known parent category is selected.
    string categoryId = "808";
    string categoryType = "SiteHome";
    string parentCategoryId = "0";

    int cateIndex = cbbCate.SelectedIndex;
    if (cateIndex >= 0 && cateIndex < parentIds.Length)
    {
        int typeIndex = cbbType.SelectedIndex;
        string[] children = childIds[cateIndex];
        if (typeIndex >= 0 && typeIndex < children.Length)
        {
            // A known sub-category: send its id together with the parent's id.
            categoryId = children[typeIndex];
            categoryType = "SiteCategory";
            parentCategoryId = parentIds[cateIndex];
        }
        else
        {
            // No (or unknown) sub-category: query the parent category itself.
            categoryId = parentIds[cateIndex];
            categoryType = "TopSiteCategory";
        }
    }

    return $"CategoryId={categoryId}&CategoryType={categoryType}&ParentCategoryId={parentCategoryId}&ItemListActionName=PostList&PageIndex=";
}
View Code
功能大體介紹完了,末了還有個小驚喜,就是提示框,怎麼在Winform中彈出提示框,過段時間自動消失呢,像這樣

其實這個不難,弄個定時器就好啦

但需要注意的是,怎麼才能彈出提示在最頂層呢,不然看不到呢,其實把TopMost屬性設為True就好了,另外ShowIcon、ShowInTaskbar、MaximizeBox和MinimizeBox也要設為false,StartPosition設為CenterScreen,這樣才專業。
由於剛弄成,難免會有疏忽八哥,大家看到後要幫忙指正,附上代碼博客園精華客戶端。