用 WebBrowser 控件翻页抓取
页面使用 JS 翻页(通过 __doPostBack 回发),我想抓取每一页的内容,但下面的代码只能抓到第一页的数据。
/// <summary>
/// Initialises the designer-generated controls and immediately points
/// <c>webBrowser1</c> at the site to be scraped.
/// </summary>
public Form1()
{
    InitializeComponent();
    webBrowser1.Navigate("http://www.xxxx.cc/");
}
// DocumentCompleted handler for webBrowser1.
// For i = 1..7 it invokes the page's __doPostBack script with the target "AspNetPager1"
// and the page number i, then tries to read the pager text out of the body HTML.
//
// NOTE(review): all seven InvokeScript calls and their IsBusy/StatusText checks run back-to-back
// inside this single handler invocation, without returning to the message loop in between.
// Presumably the postback has not loaded the next page by the time the body is read, which
// would explain why only the first page is ever captured — confirm. Issuing one postback per
// DocumentCompleted event (tracking the next page number in a field) would avoid this.
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
for (int i = 1; i < 8; i++)
{
// Arguments for __doPostBack: event target and event argument (the page number).
string[] args = { "AspNetPager1", i.ToString() };
webBrowser1.Document.InvokeScript("__doPostBack", args);
// NOTE(review): "完成" looks like the localized "Done" status-bar text; comparing against it
// ties the check to the browser's UI language — confirm this is intended.
if (webBrowser1.IsBusy == false && webBrowser1.StatusText == "完成")
{
string body = webBrowser1.Document.Body.OuterHtml;
// Pager summary pattern: group 1 is the total page count, group 2 the current page number.
string exp = " 共([\\s\\S]*?)页,当前为第([\\s\\S]*?)页,每页10条";
Match match = Regex.Match(body, exp, RegexOptions.IgnoreCase);
// Current page number as text; it is assigned here but never used afterwards.
string num = match.Groups[2].Value.ToString();
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Windows.Forms;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Form that waits for the first document to finish loading in <c>webBrowser1</c>,
    /// collects the numbered links found inside the element whose class is
    /// <c>page_nav</c>, and then navigates to them one at a time, starting the next
    /// navigation each time a document completes.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Initialises the designer-generated controls and subscribes the handler
        /// that processes the first loaded document.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            this.webBrowser1.DocumentCompleted += First_DocumentCompleted;
        }

        /// <summary>One pager link that still has to be visited.</summary>
        public class session
        {
            /// <summary>Page number parsed from the link's text.</summary>
            public int 页号;

            /// <summary>Address taken from the link's <c>href</c> attribute.</summary>
            public Uri uri;
        }

        // Links that have not been visited yet; Go() consumes them from the end of the list.
        // Starts out empty so Go() is safe to call even before any links were collected.
        private List<session> Pages = new List<session>();

        /// <summary>
        /// Parses <paramref name="s"/> as an integer.
        /// </summary>
        /// <param name="s">Text to parse; may be null.</param>
        /// <returns>The parsed value, or -1 when the text is not a valid integer.</returns>
        private int TryInt(string s)
        {
            int x;
            if (!int.TryParse(s, out x))
            {
                return -1;
            }
            return x;
        }

        /// <summary>
        /// Builds an absolute <see cref="Uri"/> from <paramref name="s"/> without throwing.
        /// </summary>
        /// <param name="s">Candidate address; may be null or empty.</param>
        /// <returns>The parsed URI, or null when the text is not a valid absolute URI.</returns>
        private static Uri TryUri(string s)
        {
            Uri result;
            if (!Uri.TryCreate(s, UriKind.Absolute, out result))
            {
                return null;
            }
            return result;
        }

        /// <summary>
        /// Handles completion of the first document: gathers the pager links and
        /// starts visiting them.
        /// </summary>
        void First_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            if (this.webBrowser1.ReadyState != WebBrowserReadyState.Complete)
            {
                return;
            }

            // This handler is only meant for the first document; later completions
            // are handled by Flip_DocumentCompleted.
            this.webBrowser1.DocumentCompleted -= First_DocumentCompleted;

            // FirstOrDefault instead of First: a page without a pager element must not
            // throw from inside the event handler.
            var nav_panel = (from x in this.webBrowser1.Document.All.OfType<HtmlElement>()
                             where x.GetAttribute("className") == "page_nav"
                             select x).FirstOrDefault();
            if (nav_panel == null)
            {
                // Nothing to page through.
                return;
            }

            // Keep only anchors whose text is a positive number and whose href is a
            // usable absolute address; anything else is skipped rather than throwing.
            Pages = (from li in nav_panel.GetElementsByTagName("LI").OfType<HtmlElement>()
                     from a in li.GetElementsByTagName("A").OfType<HtmlElement>()
                     let num = TryInt(a.InnerText)
                     where num > 0
                     let target = TryUri(a.GetAttribute("href"))
                     where target != null
                     select new session
                     {
                         页号 = num,
                         uri = target
                     }).ToList();

            this.webBrowser1.DocumentCompleted += Flip_DocumentCompleted;
            Go();
        }

        /// <summary>
        /// Handles completion of each subsequently loaded page by moving on to the next link.
        /// </summary>
        void Flip_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            if (this.webBrowser1.ReadyState == WebBrowserReadyState.Complete)
            {
                Go();
            }
        }

        /// <summary>
        /// Removes the last pending link from <c>Pages</c>, navigates to it and shows
        /// the progress in the window title. Does nothing when no links are left.
        /// </summary>
        private void Go()
        {
            var index = Pages.Count - 1;
            if (index < 0)
            {
                return;
            }

            var page = Pages[index];
            Pages.RemoveAt(index);
            this.webBrowser1.Navigate(page.uri);
            this.Text = string.Format("正在加载第 {0} 页:{1}", page.页号, page.uri.ToString());
        }
    }
}