数据采集 嵌套div 正则匹配
匹配区域:<div id="hot">
html数据区域,该区域有子div嵌套
</div>
需求:提取 id=hot 的 div 中所有超链接的地址,以及每个超链接的中文说明文字
提取网址:http://news.qq.com/
获取html函数:
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP GET and returns the text
/// captured by group 1 of the first match of <paramref name="regStr"/> in the page body.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="regStr">Regular expression pattern; capture group 1 is the value returned.</param>
/// <returns>
/// The value of capture group 1 of the first match, or an empty string when the pattern
/// does not match the downloaded text.
/// </returns>
public static string GetContent(string url, string regStr)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET";

    string value;
    // Dispose the response and its streams so the underlying connection is released
    // even when reading or decompressing fails.
    using (HttpWebResponse wsp = (HttpWebResponse)req.GetResponse())
    using (Stream raw = wsp.GetResponseStream())
    {
        Stream st = raw;

        // The header may be missing; compare without culture-sensitive lowercasing.
        string contentEncoding = wsp.ContentEncoding;
        if (!string.IsNullOrEmpty(contentEncoding)
            && contentEncoding.IndexOf("gzip", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            st = new GZipStream(raw, CompressionMode.Decompress);
        }

        // NOTE(review): Encoding.Default is the machine's ANSI code page, so decoding only
        // works when it happens to match the page's charset - confirm for the target site.
        using (StreamReader sr = new StreamReader(st, Encoding.Default))
        {
            value = sr.ReadToEnd();
        }
    }

    // Only the first match is needed; report "no match" as an empty string instead of
    // failing with an index-out-of-range error.
    Match first = Regex.Match(value, regStr);
    return first.Success ? first.Groups[1].Value : string.Empty;
}
[解决办法]
嵌套的div很难用正则可靠地匹配,用HtmlAgilityPack解析后再用XPath选取会比较方便:
// Parse the fetched markup and write out the address and text of every hyperlink
// inside <div id="hot">, at any nesting depth below it.
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml("采集到的string");
HtmlNodeCollection anchors = htmlDoc.DocumentNode.SelectNodes(@"//div[@id='hot']//a");
// SelectNodes returns null (not an empty collection) when nothing matches the XPath.
if (anchors != null)
{
    foreach (HtmlNode anchor in anchors)
    {
        // An anchor without an href has no such attribute; skip it rather than dereference null.
        HtmlAttribute href = anchor.Attributes["href"];
        if (href == null)
        {
            continue;
        }

        // NOTE(review): the values come from a scraped page and are written unencoded -
        // confirm whether they should be HTML-encoded before being sent to the browser.
        Response.Write(href.Value + "<br/>");
        Response.Write(anchor.InnerText + "<br/><br/>");
    }
}