首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > .NET > C# >

应用C#抓取网页内容并分析获取数据

2012-10-31 
使用C#抓取网页内容并分析获取数据:private void button5_Click(object sender, EventArgs e) { string html ……

使用C#抓取网页内容并分析获取数据

/// <summary>
/// Click handler: downloads the first page of the demand listing, extracts the
/// link to every demand detail page, then downloads each detail page and shows
/// its "product_card" fragment in a message box.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button5_Click(object sender, EventArgs e)
{
    WebHeaderCollection header = new WebHeaderCollection();
    header.Set("Pragma", "no-cache");

    string html = getHtml("http://www.biomart.cn/info/infoDemand.htm?pge=1", header);

    // getHtml strips tabs and line breaks, so '.' can span the whole list region
    // without RegexOptions.Singleline.
    Regex listRegex = new Regex("<!-- 列表 -->(?<1>.*)<!-- /列表 -->");
    html = listRegex.Match(html).Groups[1].Value;

    // Quotes inside the pattern must be escaped in a regular string literal.
    Regex linkRegex = new Regex("href=\"(?<1>http://www\\.biomart\\.cn/infodemand/\\w+\\.htm)\"");
    MatchCollection ms = linkRegex.Matches(html);

    // NOTE(review): hard-coded cookie values, presumably captured from a browser
    // session; they are likely to expire - confirm before relying on them.
    header.Set(HttpRequestHeader.Cookie, "__utma=124945049.1686326021.1305093063.1305164868.1305187067.3; __utmz=124945049.1305093063.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); JSESSIONID=9D7F7F4B5D73F453DA54B40A53D5E7C8; __utmc=124945049; __utmb=124945049.2.10.1305187067");

    // Built once: the pattern does not change between iterations.
    // NOTE(review): getHtml removes \t, \r and \n, so the \s+ parts can only match
    // remaining whitespace such as spaces - verify against the real page markup.
    Regex cardRegex = new Regex("<div class=\"product_card\">(?<1>.*)\\s+</p>\\s+</div>");

    foreach (Match m in ms)
    {
        string detailUrl = m.Groups[1].Value;
        MessageBox.Show(detailUrl);

        String content = getHtml(detailUrl, header);
        MessageBox.Show(cardRegex.Match(content).Groups[1].Value);
    }
}

/// <summary>
/// Performs a request for <paramref name="url"/> with the supplied headers and
/// returns the response body with all tab, carriage-return and line-feed
/// characters removed.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="header">
/// Request headers to send; when <c>null</c> the request's default headers are kept.
/// </param>
/// <returns>The response text, decoded as UTF-8, without \t, \r or \n characters.</returns>
private String getHtml(String url, WebHeaderCollection header)
{
    // Use the caller's URL; the original always requested the hard-coded listing page.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Timeout = 30000;
    if (header != null)
    {
        request.Headers = header;
    }

    // Dispose the response, stream and reader so the connection is released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("UTF-8")))
    {
        String content = reader.ReadToEnd();
        return Regex.Replace(content, "\\t|\\r|\\n", "");
    }
}

热点排行