倾家荡产--求解获取网页内容;
我想通过程序从这个网页中获取内容:(诸如:姓名,电话,地址......)
http://kevdb.infospace.com/_1_2TICU1D026IBWZ9__intldb/wp/results/kevdb?KCFG=UK&otmpl=/wp/results.htm&qsubcat=1&KSN=intl-uk&KS=_0_n_0_0_Hrnd_1_3s_0_19qiua_149sPF&MinKS=_7BuK_0&MaxKS=_7DwR_0&CurKS=_7BuK_14i&QN=john&QF=&QC=&QS=&QP=&QST=&QHN=&QTAPPEND=qn,qf&QFM=N&QK=5&QO=uk&QD=&DM=&qi=35
[解决办法]
使用 System.Net.WebClient 获取网页数据,
再用正则表达式分析数据,提取指定内容。
[解决办法]
#region 解析企业信息
/// <summary>
/// Extracts company details from three pieces of page text by running one
/// regular expression per field and taking capture group 1 of each match.
/// </summary>
/// <remarks>
/// Every extracted value except the e-mail and website values is trimmed.
/// Province and city are not extracted; both slots receive a fixed placeholder string.
/// When a pattern does not match, Regex.Match yields a failed match whose group
/// value is the empty string, so that slot ends up empty rather than null.
/// NOTE(review): the verbatim literals below are written with a space between the
/// at-sign and the opening quote and have spaces around quotes and tags. That is not
/// valid C# and looks like whitespace mangling introduced when the code was pasted;
/// restore the literals from the original source before relying on this method.
/// </remarks>
/// <param name="strIndex">Text searched for the company introduction and the main-products entry.</param>
/// <param name="strProfile">Text searched for the company name, company type, registered capital, general manager, main customers, founding date, annual turnover and main sale products.</param>
/// <param name="strContact">Text searched for the address, postcode, telephone, fax, contact person, e-mail and website.</param>
/// <returns>A 19-element array holding the values in the slot order listed at the top of the method body.</returns>
private string[] AnalysisHtml(string strIndex,string strProfile,string strContact)
{
string[] companyInfo = new string[19];
// Result slots, in order: company name, company type, province, region, company introduction, main products, registered capital, general manager, main customers, founding date,
// annual turnover, main sale products, company address, postcode, telephone, fax, contact person, e-mail, website
// Sample markup for the company name: <b> 公司名称: </b> 林燕企业有限公司 </font> </td>
string companyName = Regex.Match(strProfile, @ " <b> 公司名称: </b> (.*) </font> </td> ").Groups[1].Value;
companyName = companyName.Trim();
//string companyName = Regex.Match(strProfile, @ " <td\s*width=\ " "575\ " "\s*height=\ " "23\ " "\s*valign=\ " "top\ " "\s*class=\ " "t10h18\ " "> (.*) </td> ").Groups[1].Value;
// Sample markup for the company type: <b> 企业性质: </b> 贸易商 </font> </td>
string company_type = Regex.Match(strProfile, @ " <b> 企业性质: </b> (.*)\s* </font> </td> ").Groups[1].Value;
company_type = company_type.Trim();
// No pattern is applied for province or city; both slots get this fixed placeholder.
string province = " ";
string city = " ";
// The company introduction and the main-products entry are the only fields read from strIndex.
string companyIntro = Regex.Match(strIndex, @ " <td\s*width=\ " "575\ " "\s*height=\ " "23\ " "\s*valign=\ " "top\ " "\s*class=\ " "t10h18\ " "> (.*)\s* </td> ").Groups[1].Value;
companyIntro = companyIntro.Trim();
string main_product = Regex.Match(strIndex, @ " <b> 主要产品: </b> </span> <br> (.*)\s* </td> ").Groups[1].Value;
main_product = main_product.Trim();
// The remaining profile fields all come from strProfile.
string registMoney = Regex.Match(strProfile, @ " <b> 注册资金: </b> (.*)\s* </font> </td> ").Groups[1].Value;
registMoney = registMoney.Trim();
string manager = Regex.Match(strProfile, @ " <b> 总\s*经\s*理: </b> (.*)\s* </font> </td> ").Groups[1].Value;
manager = manager.Trim();
string main_customer = Regex.Match(strProfile, @ " <b> 主要客户: </b> (.*)\s* </font> </td> ").Groups[1].Value;
main_customer = main_customer.Trim();
string registTime = Regex.Match(strProfile, @ " <b> 成立时间: </b> (.*)\s* </font> </td> ").Groups[1].Value;
registTime = registTime.Trim();
string turnOver = Regex.Match(strProfile, @ " <b> 年营业额: </b> (.*)\s* </font> </td> ").Groups[1].Value;
turnOver = turnOver.Trim();
string sale_product = Regex.Match(strProfile, @ " <b> 主营产品: </b> <br> \s* (.*)\s* </font> </td> ").Groups[1].Value;
sale_product = sale_product.Trim();
// Contact fields all come from strContact.
string address = Regex.Match(strContact, @ "公司地址: </td> \s* <td\s*height=\ " "23\ " "\s*width=\ " "82%\ " "> (.*) </td> ").Groups[1].Value;
address = address.Trim();
string postcode = Regex.Match(strContact, @ "邮政编码: </td> \s* <td\s*height=\ " "23\ " "> (.*) </td> ").Groups[1].Value;
postcode = postcode.Trim();
string telephone = Regex.Match(strContact, @ "联系电话: </td> \s* <td\s*height=\ " "23\ " "> (.*)\s* </td> ").Groups[1].Value;
telephone = telephone.Trim();
string fax = Regex.Match(strContact, @ "联系传真: </td> \s* <td\s*height=\ " "23\ " "> (.*)\s* </td> ").Groups[1].Value;
fax = fax.Trim();
string contact_person = Regex.Match(strContact, @ "联\s*系\s*人: </td> \s* <td\s*height=\ " "23\ " "> (.*)\ </td> ").Groups[1].Value;
contact_person = contact_person.Trim();
// Unlike the fields above, the e-mail value is stored without a Trim() call.
string email = Regex.Match(strContact, @ "电子信箱:\s* </td> \s* <td\s*height=\ " "23\ " "> <a\s*href=\ " "mailto:(.*)\ " "> ").Groups[1].Value;
// Open question left by the author: why does the e-mail value end with \r?
// NOTE(review): possibly because "." also matches \r and the value is never trimmed — confirm.
//string email = Regex.Match(strCode, @ " </div> \s*(\S* <img\ssrc=\ " "[.]*/images/at[.]gif\ " "> .*)\s* </td> ").Groups[1].Value;
//email = Regex.Replace(email, " <img src=\ "../images/at.gif\ "> ", "@ ");
// The website value is likewise stored untrimmed.
string website = Regex.Match(strContact, @ "网站地址:\s* </td> \s* <td\s*height=\ " "23\ " "> <a\s*href=\ " "(.*)\ " "\s*target=\ " "_blank\ " "> ").Groups[1].Value;
// Copy the values into the result array in the documented slot order.
companyInfo[0] = companyName;
companyInfo[1] = company_type;
companyInfo[2] = province;
companyInfo[3] = city;
companyInfo[4] = companyIntro;
companyInfo[5] = main_product;
companyInfo[6] = registMoney;
companyInfo[7] = manager;
companyInfo[8] = main_customer;
companyInfo[9] = registTime;
companyInfo[10] = turnOver;
companyInfo[11] = sale_product;
companyInfo[12] = address;
companyInfo[13] = postcode;
companyInfo[14] = telephone;
companyInfo[15] = fax;
companyInfo[16] = contact_person;
companyInfo[17] = email;
companyInfo[18] = website;
return companyInfo;
}
#endregion
可参考这个
[解决办法]
#region Fetch page source
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as GB2312.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <returns>
/// The decoded response body. If anything fails while requesting or reading the page,
/// a message box is shown and the initial placeholder string is returned instead.
/// </returns>
private string GetHtmlSource(string url)
{
    // Unchanged from the original: this placeholder is what callers receive on failure.
    string html = " ";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // The response, its stream and the reader all hold resources (including the
        // underlying connection), so dispose them even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        // The encoding name must not carry a trailing space, otherwise the lookup can fail.
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("GB2312")))
        {
            html = reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: every failure is reported to the user the same way
        // and the method still returns normally.
        MessageBox.Show( "无法连接到远程服务器,请检查您的网络是否正常! ", "提示 ");
    }
    return html;
}
#endregion
[解决办法]
先获取源代码,然后再解析
[解决办法]
http://www.cnblogs.com/skyiv/archive/2005/10/01/GetIP.html
这里有“获取网页的HTML内容”的代码。
[解决办法]
真快,接分
[解决办法]
帮顶吧!
[解决办法]
试试下面的代码:
// Alternative request URL (disabled):
//string src = GetHtmlSource( "http://www.whitepages.com/5116/search/Replay?search_id=40051390946113532971 ");
string src = GetHtmlSource( "http://www.whitepages.com/5116/search/FindPerson?firstname=&name=wa&name_begins_with=1&city_zip=+01002&state_id=All+US&x=53&y=16 ");

// One match per <div class="description"> entry; the named groups capture the link text of
// the heading and the contents of the three following paragraphs.
// The literal was repaired: the stray spaces after '@', inside "(?<name>" and around the
// tags made the pattern invalid, and the group keys below no longer carry trailing spaces.
MatchCollection mc = Regex.Matches(src, @"<div\s+class=""description"">\s*<h2><a[^>]*>(?<name>[\s\S]*?)</a>\s*</h2>\s*<p>(?<adress>[\s\S]*?)</p>\s*<p>(?<code>[\s\S]*?)</p>\s*<p[^>]*>(?<phone>[\s\S]*?)</p>", RegexOptions.IgnoreCase);

// Build the output once instead of reassigning the control's Text four times per match.
System.Text.StringBuilder found = new System.Text.StringBuilder();
foreach (Match m in mc)
{
    found.Append(m.Groups["name"].Value).Append("\n ");
    found.Append(m.Groups["adress"].Value).Append("\n ");
    found.Append(m.Groups["code"].Value).Append("\n ");
    found.Append(m.Groups["phone"].Value).Append("\n\n ");
}
richTextBox2.Text += found.ToString();
/// <summary>
/// Downloads the page at <paramref name="url"/> using a fixed browser-like User-Agent
/// header and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <returns>The full response body as a string.</returns>
/// <remarks>Exceptions raised while requesting or reading the page propagate to the caller.</remarks>
private string GetHtmlSource(string url)
{
    System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1) Web-Sniffer/1.0.24 ";

    // Dispose the response, stream and reader even if ReadToEnd throws; the original
    // never closed the response and skipped both Close() calls on failure.
    using (System.Net.WebResponse response = request.GetResponse())
    using (System.IO.Stream resStream = response.GetResponseStream())
    using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, System.Text.Encoding.UTF8))
    {
        return sr.ReadToEnd();
    }
}
[解决办法]
-_-!!! 有没有那么严重...