获取网页的html在做一个抓取网页信息的小软件,对方网页使用url重写,不能获得真实的url,我该怎么获取他的ht
获取网页的html 在做一个抓取网页信息的小软件,对方网页使用url重写,不能获得真实的url,我该怎么获取他的html呢?源码是
C# code<a name="EntrezSystem2.PEntrez.Pubmed.Pubmed_ResultsPanel.Entrez_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="2" id="EntrezSystem2.PEntrez.Pubmed.Pubmed_ResultsPanel.Entrez_Pager.Page">Next ></a> ,
如何获取next的html,最好有代码,给个思路也行
[解决办法] 对 DOM 有了解吗?它就是用来访问和操作 HTML 内容的工具。
[解决办法] 也可以看看这个,是我之前写的代码:获取 html 的所有数据,并遍历到一个 TreeView 上。可以自己学习一下。
/// <summary>
/// Reads WriteLines.html from the application's start-up directory, runs its
/// contents through <see cref="Convert"/> and shows the result in a message box.
/// </summary>
/// <param name="sender">Control that raised the click.</param>
/// <param name="e">Unused event data.</param>
private void button1_Click(object sender, EventArgs e)
{
    // Path.Combine inserts the directory separator that the original string
    // concatenation left out.
    string htmlPath = Path.Combine(Application.StartupPath, "WriteLines.html");

    // Convert parses its argument as HTML text, so hand it the file's contents
    // rather than the file's path. A missing file is treated as empty input.
    string html = File.Exists(htmlPath) ? File.ReadAllText(htmlPath) : string.Empty;

    string textResult = Convert(html);
    MessageBox.Show(textResult);
}
/// <summary>
/// Feeds HTML markup through an SgmlReader (DocType "HTML") into an
/// <see cref="XmlDocument"/> and returns the indented serialization of the
/// content inside the document's root element.
/// </summary>
/// <param name="html">Markup to convert; may be null or blank.</param>
/// <returns>
/// The serialized children of the root element, or an empty string when the
/// input is null/blank or the loaded document has no root element.
/// </returns>
public static string Convert(string html)
{
    // Check for null before trimming: the original called html.Trim() first
    // and threw NullReferenceException when handed null.
    if (html == null || html.Trim().Length == 0)
    {
        return string.Empty;
    }

    using (SgmlReader reader = new SgmlReader())
    using (StringWriter stringWriter = new StringWriter())
    using (XmlTextWriter writer = new XmlTextWriter(stringWriter))
    {
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(html);
        reader.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;

        XmlDocument doc = new XmlDocument();
        doc.Load(reader);

        // NOTE(review): hard-coded dump of the parsed document; kept because
        // callers may rely on the file, but it fails without write access to
        // the root of C: - consider removing or making the path configurable.
        doc.Save("c:\\txt.xml");

        if (doc.DocumentElement == null)
        {
            return string.Empty;
        }

        // WriteContentTo emits the children of the root element only.
        doc.DocumentElement.WriteContentTo(writer);
        writer.Flush();
        return stringWriter.ToString();
    }
}
/// <summary>
/// Tells the axWebBrowser1 control to navigate to the address typed into textBox1.
/// </summary>
/// <param name="sender">Control that raised the click.</param>
/// <param name="e">Unused event data.</param>
private void button2_Click(object sender, EventArgs e)
{
    // Navigate takes its remaining arguments by reference as boxed objects;
    // the same empty-string box is passed for the last three.
    object noFlags = 0;
    object blank = "";
    string address = textBox1.Text;

    axWebBrowser1.Navigate(address, ref noFlags, ref blank, ref blank, ref blank);
}
/// <summary>
/// When the browser control finishes loading, lists the outer HTML of every
/// link in the document in listBox1 and appends the same lines to
/// WriteLines.html in the application's start-up directory.
/// </summary>
/// <param name="sender">The browser control.</param>
/// <param name="e">DocumentComplete event data (unused).</param>
private void axWebBrowser1_DocumentComplete(object sender, AxSHDocVw.DWebBrowserEvents2_DocumentCompleteEvent e)
{
    // The control may be showing something that is not an HTML document;
    // there is nothing to list in that case.
    IHTMLDocument2 htmlDocument = axWebBrowser1.Document as IHTMLDocument2;
    if (htmlDocument == null)
    {
        return;
    }

    listBox1.Items.Clear();

    // The original also called uspath.Remove(0) and discarded the result;
    // strings are immutable, so that statement did nothing and is dropped.
    string outputPath = Path.Combine(Application.StartupPath, "WriteLines.html");

    // Append mode is kept from the original. The using block closes the writer
    // once, after the loop - the original closed it inside the loop, so the
    // second WriteLine threw ObjectDisposedException.
    using (StreamWriter sw = new StreamWriter(outputPath, true))
    {
        // Iterate as IHTMLElement: the links collection can contain elements
        // other than anchors (e.g. <area>), which the original cast to
        // HTMLAnchorElementClass would reject with InvalidCastException.
        foreach (IHTMLElement link in htmlDocument.links)
        {
            string markup = link.outerHTML;
            listBox1.Items.Add(markup);
            sw.WriteLine(markup);
        }
    }
}
/// <summary>
/// Reads WriteLines.html from the application's start-up directory line by
/// line, passes each line to <see cref="Convert"/> and then shows every line
/// in its own message box.
/// </summary>
/// <param name="sender">Control that raised the click.</param>
/// <param name="e">Unused event data.</param>
private void button3_Click(object sender, EventArgs e)
{
    string dumpPath = Path.Combine(Application.StartupPath, "WriteLines.html");
    ArrayList lines = new ArrayList();

    // using guarantees the reader is closed even if reading or Convert throws.
    using (StreamReader reader = new StreamReader(dumpPath))
    {
        string current;
        // Test for end-of-file before using the line: the original passed the
        // terminating null to Convert as well.
        while ((current = reader.ReadLine()) != null)
        {
            // NOTE(review): the return value was never used in the original
            // either; the call is kept only for whatever side effects Convert
            // has - confirm whether it is still wanted.
            Convert(current);
            lines.Add(current);
        }
    }

    foreach (string text in lines)
    {
        MessageBox.Show(text);
    }
}
}
[解决办法] HtmlAgilityPack 这个玩意 也可以。
[解决办法] Fiddler2工具可以获取真实的url
[解决办法] 探讨 引用: 也看这个 我之前代码,获得html的所有数据 遍历到一个treeview上。自己学习下 private void button1_Click(object sender, EventArgs e) { string test = Application.StartupPath + "WriteLines.html"; string textResult =……
[解决办法] 我自己写的提取网页中URL地址代码
/// <summary>
/// Downloads the page whose address is entered in t1, shows the raw HTML in t2
/// and adds every URL returned by GetHyperLinks to ListBox1.
/// </summary>
/// <param name="sender">Control that raised the click.</param>
/// <param name="e">Unused event data.</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // Treat blank or whitespace-only input as "no address". The original
    // compared against a single space, so an empty box slipped through.
    string url = t1.Text.Trim();
    if (url.Length == 0)
    {
        Response.Write( " <script> alert( '请入网址 ') </script> ");
        return;
    }

    try
    {
        // NOTE(review): the address comes straight from the user and is fetched
        // by the server - consider restricting the allowed hosts/schemes.
        // Direct casts (not "as") so a non-HTTP address fails with a clear
        // InvalidCastException instead of a later NullReferenceException.
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

        string html;
        // Dispose the response and reader so the connection is released.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        using (StreamReader sr = new StreamReader(res.GetResponseStream(), System.Text.Encoding.Default))
        {
            // NOTE(review): Encoding.Default ignores the charset declared by
            // the page; kept as in the original.
            html = sr.ReadToEnd();
        }

        t2.Text = html;

        // List every hyperlink found in the page.
        ArrayList allinks = GetHyperLinks(html);
        foreach (object link in allinks)
        {
            ListBox1.Items.Add(link.ToString());
        }
    }
    catch (Exception ee)
    {
        // Encode before echoing: the message can contain the user-supplied URL.
        Response.Write(Server.HtmlEncode(ee.Message));
    }
}
/// <summary>
/// Extracts the distinct http/https URLs that appear in a piece of HTML.
/// </summary>
/// <param name="htmlCode">HTML text to scan; may be null or empty.</param>
/// <returns>
/// A sorted <see cref="ArrayList"/> of the unique URL strings found; empty when
/// the input is null/empty or contains no URL.
/// </returns>
static ArrayList GetHyperLinks(string htmlCode)
{
    ArrayList al = new ArrayList();
    if (string.IsNullOrEmpty(htmlCode))
    {
        return al;
    }

    // The original literal was written as @ "... " - not a valid verbatim
    // string, and its trailing blank forced every match to be followed by a
    // space. The scheme now also accepts https.
    // Note that the path character class deliberately includes a space, as in
    // the original pattern.
    const string strRegex = @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

    foreach (Match match in Regex.Matches(htmlCode, strRegex, RegexOptions.IgnoreCase))
    {
        string url = match.Value;

        // Skip URLs that were already collected.
        if (!al.Contains(url))
        {
            al.Add(url);
        }
    }

    al.Sort();
    return al;
}
}
}
[解决办法] 研究一下HttpRequest类
[解决办法] 在做一个抓取网页信息的小软件,对方网页使用url重写,不能获得真实的url
我觉得真实的url是获取不到的,对访问者而言是透明的。