HttpClient依据jsoup解析网页

2012-08-27

HttpClient根据jsoup解析网页：package jsoup; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; ……

HttpClient根据jsoup解析网页

package jsoup;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * 利用HttpClient获取html代码，然后使用jsoup对html代码进行解析 * @author Administrator * */public class JustTest {public static void main(String[] args) {String html = getHtmlByUrl("http://www.iteye.com/");if (html != null && !"".equals(html)) {Document doc = Jsoup.parse(html);Elements linksElements = doc.select("div#page>div#content>div#main>div.left>div#recommend>ul>li>a");// 以上代码的意思是 找id为“page”的div里面 id为“content”的div里面 id为“main”的div里面// class为“left”的div里面 id为“recommend”的div里面ul里面li里面a标签for (Element ele : linksElements) {String href = ele.attr("href");String title = ele.text();System.out.println(href + "," + title);}}}/** * 根据URL获得所有的html信息 *  * @param url * @return */public static String getHtmlByUrl(String url) {String html = null;HttpClient httpClient = new DefaultHttpClient();// 创建httpClient对象HttpGet httpget = new HttpGet(url);// 以get方式请求该URLtry {HttpResponse responce = httpClient.execute(httpget);// 得到responce对象int resStatu = responce.getStatusLine().getStatusCode();// 返回码if (resStatu == HttpStatus.SC_OK) {// 200正常 其他就不对// 获得相应实体HttpEntity entity = responce.getEntity();if (entity != null) {html = EntityUtils.toString(entity);// 获得html源代码System.out.println(html);}}} catch (Exception e) {System.out.println("访问【" + url + "】出现异常!");e.printStackTrace();} finally {httpClient.getConnectionManager().shutdown();}return html;}}

热点排行

JavaScript

HttpClient依据jsoup解析网页