首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > .NET > C# >

winform中怎么提取指定网页中的类似<a href="news.asp?id=3"的超连接

2012-01-06 
winform中如何提取指定网页中的类似<a href="news.asp?id=3"的超连接?如标题。CSDN上都是如下的答案,可获得的超连接……

winform中如何提取指定网页中的类似<a href="news.asp?id=3"的超连接
如标题
CSDN上都是如下的答案,但可获得的超连接都是以http://开头的;而网页中 <a href="news.asp?id=3">内容</a> 里的 news.asp?id=3 该如何获得?又如何检测它是否可以正常打开?
using   System;  
using   System.Xml;  
using   System.Text;  
using   System.Net;  
using   System.IO;  
using   System.Collections;  
using   System.Text.RegularExpressions;  

/// <summary>
/// Console tool: downloads a web page, extracts the absolute http/https URLs
/// found in its HTML, and writes them (de-duplicated and sorted) to HyperLinks.xml.
/// </summary>
public class App
{
    // Matches absolute http:// or https:// URLs with an optional path/query part.
    private static readonly Regex UrlRegex =
        new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // Matches one of the recognised top-level domains immediately followed by '/'.
    private static readonly Regex DomainRegex =
        new Regex(@"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    /// <summary>
    /// Entry point: prompts for a page address, fetches the page, extracts its
    /// links and writes them to HyperLinks.xml in the current directory.
    /// </summary>
    public static void Main()
    {
        string strCode;
        ArrayList alLinks;

        Console.Write("请输入一个网页地址: ");
        string strURL = Console.ReadLine();

        // ReadLine returns null when standard input is closed; an empty line is useless too.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址。");
            return;
        }
        strURL = strURL.Trim();

        // StartsWith is safe for inputs shorter than the prefix (Substring(0,7) was not),
        // and an explicit https:// scheme must not get a second scheme prepended.
        if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            && !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            strURL = "http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯... ");
        strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯... ");
        alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯... ");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute URL of the page to fetch.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // The request must be configured BEFORE GetResponse() sends it.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // NOTE: the body is always decoded as GB2312, whatever charset the server declares.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute URLs contained in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan.</param>
    /// <returns>A sorted list of the unique URL strings that were found.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();

        foreach (Match match in UrlRegex.Matches(htmlCode))
        {
            string strNew = match.Value;

            // Skip URLs that were already collected.
            if (!al.Contains(strNew))
            {
                al.Add(strNew);
            }
        }

        al.Sort();

        return al;
    }

    /// <summary>
    /// Writes the extracted URLs to HyperLinks.xml, one element per URL, where the
    /// element name is the URL's domain suffix as returned by GetDomain.
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from (written as a comment).</param>
    /// <param name="alHyperLinks">List of URL strings to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);

        try
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自 " + strURL + "的超链接 ");

            // Two nested HyperLinks elements; the inner one carries the timestamp.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                string body = str;
                writer.WriteElementString(title, null, body);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();

            writer.Flush();
        }
        finally
        {
            // Release the file handle even if one of the writes above throws.
            writer.Close();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL ("com", "net", "cn", "org" or "gov").
    /// </summary>
    /// <param name="strURL">URL to inspect.</param>
    /// <returns>
    /// The matched suffix without the dot and slash, or "other" when none matches.
    /// The suffix is only recognised when it is directly followed by '/', so a URL
    /// that ends right after the host name yields "other".
    /// </returns>
    static string GetDomain(string strURL)
    {
        string retVal = DomainRegex.Match(strURL).Value;

        // Strip the leading '.' and the trailing '/' from the matched suffix.
        retVal = Regex.Replace(retVal, @"\.|/$", "");

        if (retVal == "")
        {
            retVal = "other";
        }

        return retVal;
    }
}


[解决办法]
正则表达式分组捕捉
href="(?<Url>.*?)\"\s+[> ]
[解决办法]
如果用webBrowser控件,非常简单就能获得。

// Handles the WebBrowser control's DocumentCompleted event: walks every <A>
// element of the loaded document and picks out those that carry an href value.
// (The closing brace of this handler is elided in the original post.)
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Query the document once instead of once for the check and once for the loop.
    HtmlElementCollection anchors = webBrowser1.Document.GetElementsByTagName("A");
    if (anchors != null)
    {
        foreach (HtmlElement obj in anchors)
        {
            // GetAttribute returns a string, so it cannot be compared with -1;
            // a missing or empty href is simply skipped.
            string href = obj.GetAttribute("href");
            if (!string.IsNullOrEmpty(href))
            {
            }
        }
    }

......



[解决办法]
" <a*?href=(\ "(? <href> [^\ "]*)\ "| '(? <href> [^ ']*) '|(? <href> *))[^> ]*?> (? <title> *?) </a> "

热点排行