Jsoup解析网页、文件
Jsoup网站:http://jsoup.org/
所有的使用方法都可以从api获得,api地址:http://jsoup.org/apidocs/
html的结构,可以参考wiki:http://en.wikipedia.org/wiki/HTML_element
----------------------Jsoup连接---------------------
连接url:
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Minimal Jsoup example: requests a page over HTTP with browser-like request
 * headers and prints the text of the document's body element.
 */
public class JsoupTest {

    /**
     * Fetches the hard-coded URL and prints the body text to standard output.
     * An {@link IOException} raised while fetching is reported via its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Document doc = null;
        // The URL literal must not carry surrounding whitespace.
        String url = "http://slashdot.org/";
        try {
            doc = Jsoup.connect(url)
                    .header("User-Agent",
                            "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1")
                    // "text/html" is the media type for HTML pages; the previous value
                    // "text ml" was not a valid media type.
                    .header("Accept", "text/html,application/xhtml+xml")
                    .header("Accept-Language", "zh-cn,zh;q=0.5")
                    .header("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7")
                    .get();
            Element body = doc.body();
            System.out.println(body.text());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
// Shortest form: connect to url and fetch the document without setting any request headers explicitly.
doc = Jsoup.connect(url).get();
// Parse a local file instead of a remote page: url is used here as the path given to File,
// "UTF-8" is passed as the charset name, and baseUrl is left as an empty string.
String baseUrl = "";File input = new File(url);Document doc = Jsoup.parse(input, "UTF-8", baseUrl);
<div id="bodycol"><div id="jobheadertop"> </div><div id="jobheader"><img border="0" src="./102708474_files/pixel.gif" alt="Jsoup解析网页、资料" id="companyLogo" onerror="removeLogo()"><p id="companyNameHeader" style="display: block; ">DiSalvo LLC recruiting</p> <div id="subicons"><img src="./102708474_files/pixel(1).gif" height="1" width="1" alt="Jsoup解析网页、资料" style="margin:0px"></div><div style="clear:both;height:1px"> </div><div id="jobheaderbottom"> </div></div><div id="jobwrappertop2"> </div><div id="jobwrapper"> <div id="jobsummary"> <div id="jobsummary_content"> <h2>Job Summary</h2> <dl> <dt>Company</dt> <dd><span name="code">
private String seletorJobSum = "div#jobsummary";
Elements elements = element.select(seletorJobSum);
if(elements.size() == 0){return null;}
Element section = elements.first();
<dt>Salary</dt><dd><span name="code">
private String selectorSalary = "dt:contains(Salary) + dd";
Elements salaries = section.select(selectorSalary);
// Selector for a div that immediately follows an h2 whose text contains "Job Information".
seletor = "h2:contains(Job Information) + div";