C#采集CSDN单个博客所有文章

2013-09-13
C#采集CSDN单个博客所有文章原理：通过HtmlAgilityPack解析html源码得到所需的数据。1、首先通过http://blog.
C#采集CSDN单个博客所有文章
原理：
通过HtmlAgilityPack解析html源码得到所需的数据。
1、首先通过http://blog.csdn.net/gdjlc 底部的“xx条数据共xx页”，获取总页数；
2、获取每一页的所有文章URL，每一页的URL如下所示： http://blog.csdn.net/gdjlc/article/list/当前页索引,从1一直循环到总页数即可得.
3、获取单个文章的内容。
using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml.Linq;
using HtmlAgilityPack;

namespace Demo
{
    /// <summary>
    /// Form that collects every article of a single CSDN blog: it reads the page count
    /// from the blog's front page, walks each list page to gather article URLs, then
    /// downloads each article and appends it to a text file named after the blog user.
    /// </summary>
    public partial class FrmCSDN : Form
    {
        const string BLOGUSER = "gdjlc"; // blog user name
        const string BLOGURL = "http://blog.csdn.net";
        const string PAGECOUNTPATH = "//div[@id='papelist']/span[1]"; // XPath of the total-page-count text
        const string ARTICLEURLPATH = "//span[@class='link_title']/a[1]"; // XPath of an article link on a list page
        const string ARTICLETITLEPATH = "//div[@class='article_title']/h3/span/a"; // XPath of the article title
        const string POSTDATEPATH = "//span[@class='link_postdate']"; // XPath of the article creation date
        const string ARTICLECONTENTPATH = "//div[@id='article_content']"; // XPath of the article body

        List<string> articleUrlList = new List<string>(); // URLs of all articles found in the current run
        private object moniter = new object(); // guards appends to the shared output file
        Stopwatch stopwatch = new Stopwatch();
        int cnt = 0; // number of articles whose collection has started

        public FrmCSDN()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads the total number of list pages from the blog's front page.
        /// </summary>
        /// <param name="pageCountUrl">URL of the page that shows the pager text.</param>
        /// <returns>
        /// The page count, or 0 when the page cannot be downloaded or the pager text
        /// is missing or not in the expected form.
        /// </returns>
        private int GetPageCount(string pageCountUrl)
        {
            HtmlNode rootNode = GetHtmlNodeByUrl(pageCountUrl, Encoding.UTF8);
            if (rootNode == null)
                return 0;

            // The pager text looks like "177条数据 共12页"; the number sits between "共" and "页".
            string pageCountText = GetNodeInnerText(rootNode, PAGECOUNTPATH);
            if (string.IsNullOrEmpty(pageCountText))
                return 0;

            int firstIndex = pageCountText.LastIndexOf("共", StringComparison.Ordinal) + 1;
            int lastIndex = pageCountText.LastIndexOf("页", StringComparison.Ordinal);
            if (lastIndex < firstIndex)
                return 0;

            int pageCount;
            string digits = pageCountText.Substring(firstIndex, lastIndex - firstIndex).Trim();
            return int.TryParse(digits, out pageCount) ? pageCount : 0;
        }

        /// <summary>
        /// Click handler of the "start collecting" button. Gathers the article URLs on a
        /// background task, then starts one task per article and reports the elapsed time
        /// once all of them have finished.
        /// </summary>
        private void btnCollect_Click(object sender, EventArgs e)
        {
            stopwatch.Restart();
            Task.Factory.StartNew(() =>
            {
                cnt = 0;
                // Start from an empty list so a second click does not re-collect old URLs.
                articleUrlList.Clear();

                int pageCount = GetPageCount(BLOGURL + "/" + BLOGUSER);
                if (pageCount == 0)
                    return;

                // Collect the URL of every article, one list page at a time.
                for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
                {
                    string pageIndexUrl = BLOGURL + "/" + BLOGUSER + "/article/list/" + pageIndex.ToString();
                    HtmlNode rootNode = GetHtmlNodeByUrl(pageIndexUrl, Encoding.UTF8);
                    if (rootNode == null)
                        continue;

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection articleUrlNodes = rootNode.SelectNodes(ARTICLEURLPATH);
                    if (articleUrlNodes == null)
                        continue;

                    foreach (HtmlNode articleUrlNode in articleUrlNodes)
                    {
                        string href = articleUrlNode.GetAttributeValue("href", "");
                        if (href.Length == 0)
                            continue;
                        articleUrlList.Add(BLOGURL + href);
                    }
                }
            }).ContinueWith((x) =>
            {
                // Observe a failure of the listing task so it is not left unobserved.
                if (x.IsFaulted)
                    Debug.WriteLine("Collecting article URLs failed: " + x.Exception);

                Task[] tasks = new Task[articleUrlList.Count];
                for (int i = 0; i < articleUrlList.Count; i++)
                {
                    tasks[i] = new Task(CollectArticle, articleUrlList[i]);
                    tasks[i].Start();
                }

                // ContinueWhenAll rejects an empty task array, so report completion directly.
                if (tasks.Length == 0)
                {
                    TaskEnded(tasks);
                    return;
                }

                TaskFactory taskFactory = new TaskFactory();
                taskFactory.ContinueWhenAll(tasks, TaskEnded, TaskContinuationOptions.None);
            });
        }

        /// <summary>
        /// Collects a single article: downloads it, extracts title, date and content,
        /// and appends them to the output file.
        /// </summary>
        /// <param name="state">The article URL, passed as a <see cref="string"/>.</param>
        void CollectArticle(object state)
        {
            // Use the value returned by the increment so each task reports its own index.
            int current = Interlocked.Increment(ref cnt);
            SetStatuText(string.Format("总共{0}篇文章, 正在采集中第{1}篇.", articleUrlList.Count, current));

            string articleUrl = (string)state;
            string articleHtml = GetHtmlSource(articleUrl, Encoding.UTF8);
            if (string.IsNullOrEmpty(articleHtml))
                return;

            HtmlNode rootNode = GetHtmlNodeByHtml(articleHtml);
            string articleTitle = GetNodeInnerText(rootNode, ARTICLETITLEPATH);
            string postDate = GetNodeInnerText(rootNode, POSTDATEPATH);
            string articleContent = GetNodeInnerText(rootNode, ARTICLECONTENTPATH);

            // Result handling: save to a database or elsewhere as needed. Here the article
            // is appended to a text file. Only the file append is serialized, so downloads
            // and parsing of different articles can overlap.
            string blogFile = BLOGUSER + ".txt";
            lock (moniter)
            {
                using (StreamWriter sw = new StreamWriter(blogFile, true))
                {
                    sw.WriteLine(articleUrl);
                    sw.WriteLine(articleTitle);
                    sw.WriteLine(postDate);
                    sw.WriteLine(articleContent);
                }
            }
        }

        /// <summary>
        /// Continuation run after all article tasks have completed; shows the elapsed time.
        /// </summary>
        /// <param name="task">The completed article tasks (not inspected).</param>
        private void TaskEnded(Task[] task)
        {
            TimeSpan elapsed = stopwatch.Elapsed;
            // Total minutes rather than the minutes component, so runs over an hour are not truncated.
            SetStatuText("采集结束，耗时 " + (int)elapsed.TotalMinutes + "分" + elapsed.Seconds + "秒");
        }

        /// <summary>
        /// Downloads a page and returns the root node of its parsed HTML.
        /// </summary>
        /// <param name="url">Page URL.</param>
        /// <param name="encoding">Encoding used to decode the response.</param>
        /// <returns>The document root node, or null when the download yields no content.</returns>
        private HtmlNode GetHtmlNodeByUrl(string url, Encoding encoding)
        {
            string html = GetHtmlSource(url, encoding);
            if (string.IsNullOrEmpty(html))
                return null;
            return GetHtmlNodeByHtml(html);
        }

        /// <summary>
        /// Parses HTML source text and returns the root node of the document.
        /// </summary>
        /// <param name="htmlSource">HTML source text.</param>
        private HtmlNode GetHtmlNodeByHtml(string htmlSource)
        {
            // Fully qualified: System.Windows.Forms also declares an HtmlDocument type.
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(htmlSource);
            return document.DocumentNode;
        }

        /// <summary>
        /// Downloads the source text of a web page.
        /// </summary>
        /// <param name="url">Page URL.</param>
        /// <param name="encoding">Encoding used to decode the response.</param>
        /// <returns>The page source, or an empty string when the request fails.</returns>
        private string GetHtmlSource(string url, Encoding encoding)
        {
            try
            {
                WebRequest request = WebRequest.Create(url);
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best effort: callers treat an empty result as "skip this page",
                // but record the failure instead of discarding it silently.
                Debug.WriteLine("Failed to download " + url + ": " + ex.Message);
                return "";
            }
        }

        /// <summary>
        /// Returns the inner text of the first node matching an XPath expression.
        /// </summary>
        /// <param name="srcNode">Node the XPath is evaluated against.</param>
        /// <param name="path">XPath expression.</param>
        /// <returns>The node's inner text, or null when no node matches.</returns>
        private string GetNodeInnerText(HtmlNode srcNode, string path)
        {
            HtmlNode temp = srcNode.SelectSingleNode(path);
            if (temp == null)
                return null;
            return temp.InnerText;
        }

        /// <summary>
        /// Sets the status label text, marshalling to the control's thread when required.
        /// </summary>
        /// <param name="s">Text to display.</param>
        private void SetStatuText(string s)
        {
            this.SafeCall(() =>
            {
                lblStatusInfo.Text = s;
            });
        }
    }

    /// <summary>
    /// Extension helpers for Windows Forms controls.
    /// </summary>
    public static class Extenstions
    {
        /// <summary>
        /// Runs <paramref name="callback"/> through <see cref="Control.Invoke(Delegate)"/>
        /// when <see cref="Control.InvokeRequired"/> is true, otherwise calls it directly.
        /// </summary>
        /// <param name="ctrl">Control whose thread the callback should run on.</param>
        /// <param name="callback">Action to execute.</param>
        public static void SafeCall(this Control ctrl, Action callback)
        {
            if (ctrl.InvokeRequired)
                ctrl.Invoke(callback);
            else
                callback();
        }
    }
}
热点排行