倾家荡产-求解获取网页内容

2011-12-20

倾家荡产--求解获取网页内容我想通过程序从这个网页中获取内容:(诸如:姓名,电话,地址......)http://kevdb

倾家荡产--求解获取网页内容;
我想通过程序从这个网页中获取内容:(诸如:姓名,电话,地址......)

http://kevdb.infospace.com/_1_2TICU1D026IBWZ9__intldb/wp/results/kevdb?KCFG=UK&otmpl=/wp/results.htm&qsubcat=1&KSN=intl-uk&KS=_0_n_0_0_Hrnd_1_3s_0_19qiua_149sPF&MinKS=_7BuK_0&MaxKS=_7DwR_0&CurKS=_7BuK_14i&QN=john&QF=&QC=&QS=&QP=&QST=&QHN=&QTAPPEND=qn,qf&QFM=N&QK=5&QO=uk&QD=&DM=&qi=35

[解决办法]
先用 System.Net.WebClient 获取网页数据，再用正则表达式分析数据，提取指定内容。
[解决办法]
#region 解析企业信息
/// <summary>
/// Extracts company information from three HTML strings using regular expressions
/// and returns the extracted fields as a fixed-order, 19-element string array.
/// </summary>
/// <param name="strIndex">HTML text searched for the company introduction and main products.</param>
/// <param name="strProfile">HTML text searched for name, type, registered capital, manager, customers, founding date, turnover and products sold.</param>
/// <param name="strContact">HTML text searched for address, postcode, telephone, fax, contact person, e-mail and website.</param>
/// <returns>
/// A 19-element array; each slot holds capture group 1 of the corresponding match.
/// A pattern that does not match yields an empty string for its slot
/// (Groups[1].Value of a failed match), so no slot is ever null.
/// </returns>
private string[] AnalysisHtml(string strIndex,string strProfile,string strContact)
{
string[] companyInfo = new string[19];
// Slot order: company name, company type, province, city, company introduction, main products, registered capital, general manager, main customers, founding date,
// annual turnover, products sold, company address, postcode, telephone, fax, contact person, e-mail, website

// NOTE(review): the whitespace inside the pattern literals below (e.g. around quotes and
// angle brackets) looks like it was altered when this snippet was copied from a web page —
// verify every pattern against the real markup before relying on it.

// Company name; sample markup being matched: 公司名称： 林燕企业有限公司 </td>
string companyName = Regex.Match(strProfile, @ " 公司名称： (.*) </td> ").Groups[1].Value;
companyName = companyName.Trim();
//string companyName = Regex.Match(strProfile, @ " <td\s*width=\ " "575\ " "\s*height=\ " "23\ " "\s*valign=\ " "top\ " "\s*class=\ " "t10h18\ " "> (.*) </td> ").Groups[1].Value;

// Company type; sample markup being matched: 企业性质： 贸易商 </td>
string company_type = Regex.Match(strProfile, @ " 企业性质： (.*)\s* </td> ").Groups[1].Value;
company_type = company_type.Trim();

// Province and city are never parsed from the HTML; they keep these literal values.
string province = " ";
string city = " ";

// Company introduction: content of the table cell with the listed width/height/valign/class attributes.
string companyIntro = Regex.Match(strIndex, @ " <td\s*width=\ " "575\ " "\s*height=\ " "23\ " "\s*valign=\ " "top\ " "\s*class=\ " "t10h18\ " "> (.*)\s* </td> ").Groups[1].Value;
companyIntro = companyIntro.Trim();

// Main products (read from strIndex, unlike most profile fields).
string main_product = Regex.Match(strIndex, @ " 主要产品： (.*)\s* </td> ").Groups[1].Value;
main_product = main_product.Trim();

// Registered capital.
string registMoney = Regex.Match(strProfile, @ " 注册资金： (.*)\s* </td> ").Groups[1].Value;
registMoney = registMoney.Trim();

// General manager; the label may contain whitespace between its characters.
string manager = Regex.Match(strProfile, @ " 总\s*经\s*理： (.*)\s* </td> ").Groups[1].Value;
manager = manager.Trim();

// Main customers.
string main_customer = Regex.Match(strProfile, @ " 主要客户： (.*)\s* </td> ").Groups[1].Value;
main_customer = main_customer.Trim();

// Founding date.
string registTime = Regex.Match(strProfile, @ " 成立时间： (.*)\s* </td> ").Groups[1].Value;
registTime = registTime.Trim();

// Annual turnover.
string turnOver = Regex.Match(strProfile, @ " 年营业额： (.*)\s* </td> ").Groups[1].Value;

turnOver = turnOver.Trim();

// Products sold.
string sale_product = Regex.Match(strProfile, @ " 主营产品： \s*    (.*)\s* </td> ").Groups[1].Value;
sale_product = sale_product.Trim();

// Contact fields: each pattern matches the label cell followed by the value cell.
string address = Regex.Match(strContact, @ "公司地址： </td> \s* <td\s*height=\ " "23\ " "\s*width=\ " "82%\ " "> (.*) </td> ").Groups[1].Value;
address = address.Trim();

string postcode = Regex.Match(strContact, @ "邮政编码： </td> \s* <td\s*height=\ " "23\ " "> (.*) </td> ").Groups[1].Value;
postcode = postcode.Trim();

string telephone = Regex.Match(strContact, @ "联系电话： </td> \s* <td\s*height=\ " "23\ " "> (.*)\s* </td> ").Groups[1].Value;
telephone = telephone.Trim();

string fax = Regex.Match(strContact, @ "联系传真： </td> \s* <td\s*height=\ " "23\ " "> (.*)\s* </td> ").Groups[1].Value;
fax = fax.Trim();

string contact_person = Regex.Match(strContact, @ "联\s*系\s*人： </td> \s* <td\s*height=\ " "23\ " "> (.*)\ </td> ").Groups[1].Value;
contact_person = contact_person.Trim();

// E-mail address taken from the mailto: link; note this value is NOT trimmed.
string email = Regex.Match(strContact, @ "电子信箱：\s* </td> \s* <td\s*height=\ " "23\ " "> <a\s*href=\ " "mailto:(.*)\ " "> ").Groups[1].Value;

// Open question from the author: why does the e-mail value end with \r?
// NOTE(review): presumably because '.' matches '\r' (only '\n' is excluded by default) — confirm.
//string email = Regex.Match(strCode, @ " </div> \s*(\S* <img\ssrc=\ " "[.]*/images/at[.]gif\ " "> .*)\s* </td> ").Groups[1].Value;
//email = Regex.Replace(email, " <img src=\ "../images/at.gif\ "> ", "@ ");

// Website URL taken from the link's href; note this value is NOT trimmed.
string website = Regex.Match(strContact, @ "网站地址：\s* </td> \s* <td\s*height=\ " "23\ " "> <a\s*href=\ " "(.*)\ " "\s*target=\ " "_blank\ " "> ").Groups[1].Value;

// Pack the fields into the result array in the documented slot order.
companyInfo[0] = companyName;
companyInfo[1] = company_type;
companyInfo[2] = province;
companyInfo[3] = city;
companyInfo[4] = companyIntro;
companyInfo[5] = main_product;
companyInfo[6] = registMoney;
companyInfo[7] = manager;
companyInfo[8] = main_customer;
companyInfo[9] = registTime;
companyInfo[10] = turnOver;
companyInfo[11] = sale_product;
companyInfo[12] = address;
companyInfo[13] = postcode;
companyInfo[14] = telephone;
companyInfo[15] = fax;
companyInfo[16] = contact_person;
companyInfo[17] = email;
companyInfo[18] = website;

return companyInfo;

}
#endregion

可参考这个
[解决办法]
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as GB2312.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <returns>
/// The decoded response body. If any step fails, a message box is shown and the
/// initial placeholder value is returned instead of throwing.
/// </returns>
private string GetHtmlSource(string url)
{
    // NOTE(review): the failure value is a single space rather than an empty string;
    // it is kept unchanged so existing callers see the same result.
    string html = " ";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // Dispose response, stream and reader deterministically; an undisposed
        // HttpWebResponse keeps its connection checked out of the connection pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        // The encoding name must not carry surrounding whitespace, otherwise the
        // lookup throws and every download ends up in the catch block below.
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("GB2312")))
        {
            html = reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberately broad best-effort handling: any failure is reported to the
        // user and the placeholder value is returned.
        MessageBox.Show("无法连接到远程服务器，请检查您的网络是否正常！ ", "提示 ");
    }
    return html;
}
#endregion
[解决办法]
先获取源代码，然后再解析
[解决办法]

http://www.cnblogs.com/skyiv/archive/2005/10/01/GetIP.html

这里有“获取网页的HTML内容”的代码。

[解决办法]
真快，接分
[解决办法]
帮顶吧！
[解决办法]
try

// Usage example: download a search-result page and print the fields captured for each entry.
// Alternative URL, kept for reference:
//string src = GetHtmlSource( "http://www.whitepages.com/5116/search/Replay?search_id=40051390946113532971 ");
// Fetch the HTML of the hard-coded search URL.
string src = GetHtmlSource( "http://www.whitepages.com/5116/search/FindPerson?firstname=&name=wa&name_begins_with=1&city_zip=+01002&state_id=All+US&x=53&y=16 ");
// Find all result entries, case-insensitively; the named groups name, adress, code and phone
// capture the individual fields of one entry.
// NOTE(review): the whitespace inside this pattern looks like it was altered when the snippet
// was copied from a web page — verify the pattern against the real markup.
MatchCollection mc = Regex.Matches(src, @ " <div\s+class= " "description " "> \s* <h2> <a[^> ]*> (? <name> [\s\S]*?) </a> \s* </h2> \s* (? <adress> [\s\S]*?) \s* (? <code> [\s\S]*?) \s* <p[^> ]*> (? <phone> [\s\S]*?) ", RegexOptions.IgnoreCase);
foreach (Match m in mc)
{
// Append each captured field to the text box, one field per append, with the separator literal shown.
richTextBox2.Text += m.Groups[ "name "].Value + "\n ";
richTextBox2.Text += m.Groups[ "adress "].Value + "\n ";
richTextBox2.Text += m.Groups[ "code "].Value + "\n ";
richTextBox2.Text += m.Groups[ "phone "].Value + "\n\n ";
}

/// <summary>
/// Downloads the page at <paramref name="url"/> with a browser-like User-Agent header
/// and returns its body decoded as UTF-8.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <returns>The full response body as a string.</returns>
/// <remarks>
/// Exceptions raised while creating the request or reading the response are not caught
/// here and propagate to the caller.
/// </remarks>
private string GetHtmlSource(string url)
{
    System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1) Web-Sniffer/1.0.24 ";

    // The response itself must be disposed (not only its stream) so the underlying
    // connection is released; using blocks also cover the case where reading throws,
    // and dispose the reader before the stream it wraps.
    using (System.Net.WebResponse response = request.GetResponse())
    using (System.IO.Stream resStream = response.GetResponseStream())
    using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, System.Text.Encoding.UTF8))
    {
        return sr.ReadToEnd();
    }
}
[解决办法]
-_-!!! 有没有那么严重...

热点排行

C#

倾家荡产-求解获取网页内容