C#采集CSDN单个博客所有文章
原理:
通过HtmlAgilityPack解析html源码得到所需的数据。
1、首先通过 http://blog.csdn.net/gdjlc 页面底部的“xx条数据 共xx页”,获取总页数;
2、获取每一页的所有文章URL。每一页的URL形如:http://blog.csdn.net/gdjlc/article/list/当前页索引 ,页索引从1循环到总页数,即可得到全部文章的URL;
3、获取单个文章的内容。
using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml.Linq;
using HtmlAgilityPack;

namespace Demo
{
    /// <summary>
    /// Form that downloads every article of one CSDN blog: it reads the page count
    /// from the blog's pager, gathers the article URLs from each list page, then
    /// fetches each article and appends it to a text file named after the blog user.
    /// </summary>
    public partial class FrmCSDN : Form
    {
        const string BLOGUSER = "gdjlc"; // Blog user name
        const string BLOGURL = "http://blog.csdn.net";
        const string PAGECOUNTPATH = "//div[@id='papelist']/span[1]"; // XPath of the pager text (total page count)
        const string ARTICLEURLPATH = "//span[@class='link_title']/a[1]"; // XPath of the article links on a list page
        const string ARTICLETITLEPATH = "//div[@class='article_title']/h3/span/a"; // XPath of the article title
        const string POSTDATEPATH = "//span[@class='link_postdate']"; // XPath of the article creation date
        const string ARTICLECONTENTPATH = "//div[@id='article_content']"; // XPath of the article body

        // URLs of all articles. Filled by the URL-gathering task and only read afterwards.
        readonly List<string> articleUrlList = new List<string>();

        // Serializes appends to the shared output file.
        private readonly object moniter = new object();

        readonly Stopwatch stopwatch = new Stopwatch();

        // Number of article tasks that have started; updated with Interlocked.
        int cnt = 0;

        public FrmCSDN()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads the total number of list pages from the pager text of the given page.
        /// </summary>
        /// <param name="pageCountUrl">URL of the page that contains the pager.</param>
        /// <returns>
        /// The page count, or 0 when the page cannot be downloaded or the pager text
        /// is missing or not in the expected "...共N页" form.
        /// </returns>
        private int GetPageCount(string pageCountUrl)
        {
            HtmlNode rootNode = GetHtmlNodeByUrl(pageCountUrl, Encoding.UTF8);
            if (rootNode == null)
            {
                return 0;
            }

            // Expected form: "177条数据 共12页".
            // NOTE(review): a blog whose list has no pager at all yields 0 here; confirm
            // whether such a page should be treated as a single page instead.
            string pageCountText = GetNodeInnerText(rootNode, PAGECOUNTPATH);
            if (string.IsNullOrEmpty(pageCountText))
            {
                return 0;
            }

            int markerIndex = pageCountText.LastIndexOf("共", StringComparison.Ordinal);
            int lastIndex = pageCountText.LastIndexOf("页", StringComparison.Ordinal);
            int firstIndex = markerIndex + 1;
            if (markerIndex < 0 || lastIndex < firstIndex)
            {
                return 0;
            }

            string digits = pageCountText.Substring(firstIndex, lastIndex - firstIndex).Trim();
            int pageCount;
            if (!int.TryParse(digits, NumberStyles.Integer, CultureInfo.InvariantCulture, out pageCount) || pageCount < 0)
            {
                return 0;
            }

            return pageCount;
        }

        /// <summary>
        /// "Start collecting" button handler. Gathers all article URLs on a background
        /// task, then starts one task per article and reports the elapsed time when
        /// all of them have finished.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the handler is not re-entrant; clicking again while a run is
        /// still in progress makes both runs share <c>articleUrlList</c> and <c>cnt</c>.
        /// </remarks>
        private void btnCollect_Click(object sender, EventArgs e)
        {
            stopwatch.Restart();

            Task.Factory.StartNew(() =>
            {
                cnt = 0;
                // Start from an empty list so a second run does not duplicate articles.
                articleUrlList.Clear();

                int pageCount = GetPageCount(BLOGURL + "/" + BLOGUSER);

                // Gather the article URLs of every list page.
                for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
                {
                    string pageIndexUrl = BLOGURL + "/" + BLOGUSER + "/article/list/" + pageIndex.ToString();
                    HtmlNode rootNode = GetHtmlNodeByUrl(pageIndexUrl, Encoding.UTF8);
                    if (rootNode == null)
                    {
                        continue;
                    }

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection articleUrlNodes = rootNode.SelectNodes(ARTICLEURLPATH);
                    if (articleUrlNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode articleUrlNode in articleUrlNodes)
                    {
                        HtmlAttribute href = articleUrlNode.Attributes["href"];
                        if (href == null || string.IsNullOrEmpty(href.Value))
                        {
                            continue;
                        }

                        articleUrlList.Add(BLOGURL + href.Value);
                    }
                }
            }).ContinueWith((x) =>
            {
                // Reading Exception marks a failure of the gathering task as observed;
                // whatever URLs were gathered before the failure are still processed.
                if (x.Exception != null)
                {
                    Debug.WriteLine("Gathering article URLs failed: " + x.Exception);
                }

                Task[] tasks = new Task[articleUrlList.Count];
                if (tasks.Length == 0)
                {
                    // ContinueWhenAll rejects an empty task array, so finish directly.
                    TaskEnded(tasks);
                    return;
                }

                for (int i = 0; i < tasks.Length; i++)
                {
                    tasks[i] = new Task(CollectArticle, articleUrlList[i]);
                    tasks[i].Start();
                }

                Task.Factory.ContinueWhenAll(tasks, TaskEnded, TaskContinuationOptions.None);
            });
        }

        /// <summary>
        /// Collects a single article: downloads it, extracts title, post date and
        /// content, and appends them to the output file.
        /// </summary>
        /// <param name="state">The article URL, passed as the task state (a string).</param>
        void CollectArticle(object state)
        {
            // Use the value returned by Increment so each task reports its own number
            // instead of whatever the shared counter holds by the time it is read.
            int sequence = Interlocked.Increment(ref cnt);
            SetStatuText(string.Format("总共{0}篇文章, 正在采集中第{1}篇.", articleUrlList.Count, sequence));

            string articleUrl = (string)state;
            string articleHtml = GetHtmlSource(articleUrl, Encoding.UTF8);
            if (string.IsNullOrEmpty(articleHtml))
            {
                return;
            }

            HtmlNode rootNode = GetHtmlNodeByHtml(articleHtml);
            string articleTitle = GetNodeInnerText(rootNode, ARTICLETITLEPATH);
            string postDate = GetNodeInnerText(rootNode, POSTDATEPATH);
            string articleContent = GetNodeInnerText(rootNode, ARTICLECONTENTPATH);

            // Result handling: save to a database or elsewhere; here it is appended to a text file.
            string blogFile = BLOGUSER + ".txt";

            // Only the file append needs mutual exclusion; downloading and parsing
            // above run outside the lock so the article tasks can overlap.
            lock (moniter)
            {
                using (StreamWriter sw = new StreamWriter(blogFile, true))
                {
                    sw.WriteLine(articleUrl);
                    sw.WriteLine(articleTitle);
                    sw.WriteLine(postDate);
                    sw.WriteLine(articleContent);
                }
            }
        }

        /// <summary>
        /// Called when all article tasks have finished; shows the elapsed time.
        /// </summary>
        /// <param name="task">The finished article tasks (may be empty).</param>
        private void TaskEnded(Task[] task)
        {
            stopwatch.Stop();

            if (task != null)
            {
                foreach (Task finished in task)
                {
                    // Reading Exception marks a failed task's exception as observed.
                    if (finished != null && finished.Exception != null)
                    {
                        Debug.WriteLine("Collecting an article failed: " + finished.Exception);
                    }
                }
            }

            TimeSpan elapsed = stopwatch.Elapsed;
            // TotalMinutes does not wrap around after 59 minutes the way Minutes does.
            SetStatuText("采集结束,耗时 " + (int)elapsed.TotalMinutes + "分" + elapsed.Seconds + "秒");
        }

        /// <summary>
        /// Downloads a page and returns the root node of its parsed HTML.
        /// </summary>
        /// <param name="url">Page URL.</param>
        /// <param name="encoding">Encoding used to decode the response.</param>
        /// <returns>The document root node, or null when nothing could be downloaded.</returns>
        private HtmlNode GetHtmlNodeByUrl(string url, Encoding encoding)
        {
            string html = GetHtmlSource(url, encoding);
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            return GetHtmlNodeByHtml(html);
        }

        /// <summary>
        /// Parses HTML source text and returns the root node of the document.
        /// </summary>
        /// <param name="htmlSource">HTML source text.</param>
        private HtmlNode GetHtmlNodeByHtml(string htmlSource)
        {
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(htmlSource);
            return document.DocumentNode;
        }

        /// <summary>
        /// Downloads the source text of a web page.
        /// </summary>
        /// <param name="url">Page URL.</param>
        /// <param name="encoding">Encoding used to decode the response.</param>
        /// <returns>The page source, or an empty string when the download fails.</returns>
        private string GetHtmlSource(string url, Encoding encoding)
        {
            try
            {
                WebRequest request = WebRequest.Create(url);
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best effort: callers treat an empty result as "skip this page",
                // but leave a trace so failures can be diagnosed.
                Debug.WriteLine("Downloading '" + url + "' failed: " + ex.Message);
                return "";
            }
        }

        /// <summary>
        /// Returns the inner text of the first node matching an XPath expression.
        /// </summary>
        /// <param name="srcNode">Node to search from.</param>
        /// <param name="path">XPath expression.</param>
        /// <returns>The inner text, or null when <paramref name="srcNode"/> is null or nothing matches.</returns>
        private string GetNodeInnerText(HtmlNode srcNode, string path)
        {
            if (srcNode == null)
            {
                return null;
            }

            HtmlNode match = srcNode.SelectSingleNode(path);
            return match == null ? null : match.InnerText;
        }

        /// <summary>
        /// Shows a status message in the status label, marshalling to the UI thread when needed.
        /// </summary>
        private void SetStatuText(string s)
        {
            this.SafeCall(() =>
            {
                lblStatusInfo.Text = s;
            });
        }
    }

    /// <summary>
    /// Extension helpers for Windows Forms controls.
    /// </summary>
    public static class Extenstions
    {
        /// <summary>
        /// Runs <paramref name="callback"/> on the thread that owns <paramref name="ctrl"/>:
        /// through <c>Invoke</c> when called from another thread, directly otherwise.
        /// Does nothing when the control has already been disposed.
        /// </summary>
        /// <param name="ctrl">Control whose owning thread must run the callback.</param>
        /// <param name="callback">Action to run.</param>
        public static void SafeCall(this Control ctrl, Action callback)
        {
            if (ctrl == null)
            {
                throw new ArgumentNullException("ctrl");
            }
            if (callback == null)
            {
                throw new ArgumentNullException("callback");
            }

            // A background task may still report status after the form was closed.
            if (ctrl.IsDisposed)
            {
                return;
            }

            if (!ctrl.InvokeRequired)
            {
                callback();
                return;
            }

            try
            {
                ctrl.Invoke(callback);
            }
            catch (InvalidOperationException)
            {
                // Swallow only the race with the control being closed (this also covers
                // ObjectDisposedException); anything thrown while the control is still
                // alive is a genuine error and is rethrown.
                if (!ctrl.IsDisposed && ctrl.IsHandleCreated)
                {
                    throw;
                }
            }
        }
    }
}