首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 网站开发 > JavaScript >

2011.06.29——— Jsoup HttpClient 抓取网络上的图片

2012-11-06 
2011.06.29——— Jsoup HttpClient 抓取网络上的图片2011.06.29——— Jsoup HttpClient 抓取网络上的图片参考:

2011.06.29——— Jsoup HttpClient 抓取网络上的图片
2011.06.29——— Jsoup HttpClient 抓取网络上的图片

参考:http://www.iteye.com/topic/1106648
http://www.ibm.com/developerworks/cn/java/j-lo-jsouphtml/index.html?ca=drs-

jsoup 官方网站:http://jsoup.org

需要的主要jar包
httpclient-4.0.1.jar jsoup-1.5.2.jar

主要代码 如下

Exmaple3.java

package com.th.spider.test;import java.io.BufferedOutputStream;import java.io.FileOutputStream;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.params.CoreConnectionPNames;import org.apache.http.util.EntityUtils;import org.jsoup.Connection;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Exmaple3 {private static final Log log = LogFactory.getLog(Exmaple3.class);/** * 抓取图片存放目录 */private static final String PIC_DIR = "/home/li/pic";/** * 链接超时 */private static final int TIME_OUT = 5000;static void go3(String url) throws Exception {        Connection conn= Jsoup.connect(url);        Document doc = conn.get();        Elements links = doc.select("div.piclist img[src]");        for(int i=0;i<links.size();i++){            Element element = links.get(i);            final String imgUrl = element.attr("src");            log.info(imgUrl);            Thread.sleep(500);            new Thread(new Runnable() {                public void run() {                    try {                        save(imgUrl);                    } catch (Exception e) {                        // TODO Auto-generated catch block                        e.printStackTrace();                    }                }            }).start();        }    }static void go2(String url) throws Exception {    Connection conn= Jsoup.connect(url);    Document doc = conn.get();    Elements links = doc.select("div.cc a[href]");    for(int i=0;i<links.size();i++){        Element element = links.get(i);        final String dirUrl = "http://www.3lian.com"+element.attr("href");        log.info(dirUrl);        Thread.sleep(500);            new 
Thread(new Runnable() {                public void run() {                    try {                        Connection conn= Jsoup.connect(dirUrl);                        Document doc = conn.get();                        Elements images = doc.select("div.mb_jjnr img[src]");                        for(int j=0;j<images.size();j++){                            Element img = images.get(j);                            String imgUrl = img.attr("src");                            log.info(imgUrl);                            save(imgUrl);                        }                    } catch (Exception e) {                        e.printStackTrace();                    }                }            }).start();    }    }/** * 处理帖子URL * @param url * @throws Exception */static void go(String url) throws Exception {// JSOP创建链接Connection conn = Jsoup.connect(url);// 请求返回整个文档对象Document doc = conn.post();// 选择所有class=zoom 的img标签对象Elements imgs = doc.select("img[class=zoom]");// 循环每个img标签for (int i = 0; i < imgs.size(); i++) {Element img = imgs.get(i);// 取得图片的下载地址String picURL = doc.baseUri() + img.attr("file");log.info(picURL);// 保存图片save(picURL);}}//<img src="static/image/common/none.gif" file="data/attachment/forum/201105/08/174412nz3jq4z90s33s2t0.jpg" width="770" onclick="zoom(this, this.src)" id="aimg_180565" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="2011.06.29——— Jsoup HttpClient 抓取网络下的图片" title="img_src_29620.jpg" />//doc.select("img[class=zoom]")/** * 保存图片 * @param url * @param i * @throws Exception */static void save(String url) throws Exception {String fileName = url.substring(url.lastIndexOf("/"));String filePath = PIC_DIR + "/" + fileName;BufferedOutputStream out = null;byte[] bit = getByte(url);if (bit.length > 0) {try {out = new BufferedOutputStream(new FileOutputStream(filePath));out.write(bit);out.flush();log.info("Create File success! 
[" + filePath + "]");} finally {if (out != null)out.close();}}}/** * 获取图片字节流 * @param uri * @return * @throws Exception */static byte[] getByte(String uri) throws Exception {HttpClient client = new DefaultHttpClient();client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT);HttpGet get = new HttpGet(uri);get.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT);try {HttpResponse resonse = client.execute(get);if (resonse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {HttpEntity entity = resonse.getEntity();if (entity != null) {return EntityUtils.toByteArray(entity);}}} catch (Exception e) {e.printStackTrace();} finally {client.getConnectionManager().shutdown();}return new byte[0];}public static void main(String[] args) throws Exception {// 开始抓取图片    go2("http://www.3lian.com/gif/more/03/0301.html");//go3("http://www.ivsky.com/tupian/nvxing_gouwu_qingjing_v6969/");}}





热点排行