首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 网站开发 > JavaScript >

jsoup httpclient 爬取网页并下载google图标

2012-07-04 
jsoup httpclient 爬取网页并下载google图标。jsoup下载地址:http://www.jsoup.org ;httpclient下载地址:http://hc.apache.org/downloads.cgi

jsoup httpclient 爬取网页并下载google图标

jsoup下载地址:http://www.jsoup.org

httpclient下载地址:http://hc.apache.org/downloads.cgi

其他jar包见附件

package jsoup;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.util.HashMap;import java.util.Map;import org.apache.commons.io.FileUtils;import org.apache.commons.io.IOUtils;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.params.HttpProtocolParams;import org.apache.http.util.EntityUtils;import com.google.api.translate.Language;import com.google.api.translate.Translate;/** * google logo 下载程序 */public abstract class Crawler {/** * 使用google 翻译api *  * @param en * @return */public String translateEnToCinese(String en) {Translate.setHttpReferrer("http://www.xxx.com");try {return Translate.execute(en, Language.ENGLISH, Language.CHINESE);} catch (Exception e) {e.printStackTrace();}return "";}/** * 获取一个Map *  * @return */public Map<String, Object> getMap() {return new HashMap<String, Object>(0);}/** * 下载文件 *  * @param url *            文件http地址 * @param dir *            目标文件 * @throws IOException */public void downloadFile(String url, String dir) throws Exception {DefaultHttpClient httpClient = new DefaultHttpClient();HttpProtocolParams.setUserAgent(httpClient.getParams(),"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");HttpGet httpGet = new HttpGet();httpGet.setURI(new java.net.URI(url));InputStream input = null;FileOutputStream output = null;try {HttpResponse response = httpClient.execute(httpGet);HttpEntity entity = response.getEntity();input = entity.getContent();File file = new File(dir);output = FileUtils.openOutputStream(file);IOUtils.copy(input, output);} catch (Exception e){e.printStackTrace();} finally {IOUtils.closeQuietly(output);IOUtils.closeQuietly(input);}}/** * 处理GET请求,返回整个页面 *  * @param url *            访问地址 * @param params *            编码参数 * @return * @throws Exception 
*/public synchronized String doGet(String url, String... params)throws Exception {DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例HttpProtocolParams.setUserAgent(httpClient.getParams(),"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");String charset = "UTF-8";if (null != params && params.length >= 1) {charset = params[0];}HttpGet httpGet = new HttpGet(); // 创建get方法实例String content = "";httpGet.setURI(new java.net.URI(url));try {HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码if (resStatu == HttpStatus.SC_OK) { // 200正常HttpEntity entity = response.getEntity(); // 获得相应的实体if (entity != null) {// 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1content = EntityUtils.toString(entity, charset);}}} catch (Exception e) {System.out.println("访问【" + url + "】出现异常!");e.printStackTrace();} finally {// 关闭资源httpGet.abort();httpClient.getConnectionManager().shutdown();}return content;}}

?

?

package jsoup;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.Date;import java.util.List;import java.util.Map;import org.apache.commons.io.FileUtils;import org.apache.commons.lang.StringUtils;import org.json.JSONArray;import org.json.JSONObject;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * google logo 下载程序 */public class GoogleLogoCrawler extends Crawler {private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p"; private static final String LOGO_URL = "http://www.logocollect.com/google/";private static final String[] YEARS = new String[] { //"1998", "1999", "2000",//"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012" };private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y"; private static final String DIR_PATH = "D:\\googlelogos\";public void doStart() {JSONArray array = new JSONArray();for (String year : YEARS) {String ind = INDEX.replaceAll("%y", year);int pageCount = getPageCount(ind);for (int i = 1; i < pageCount+1; i++) {String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");String path = year + "_" + i;start(url, array, DIR_PATH + path + "\", path);}}try {FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");} catch (IOException e) {e.printStackTrace();}System.out.println(array);}public int getPageCount(String url) {int pageCount = 1;try {org.jsoup.nodes.Document doc = Jsoup.connect(url).get();String els = doc.html().toString();int start = els.indexOf("总页数") + 4;String temp = els.substring(start);int end = temp.indexOf(",");pageCount = Integer.parseInt(els.substring(start,start+end));System.out.println(pageCount);} catch (IOException e) {e.printStackTrace();}return pageCount;}public void start(String url, JSONArray array, String dir, String path) {try {String content = 
super.doGet(url);Document doc = Jsoup.parse(content);Elements dds = doc.select(".img img");List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);for (int i = 0; i < dds.size(); i++) {Element img = dds.get(i);String src = img.select("img").first().attr("src");String title = img.select("img").first().attr("title");Map<String, Object> map = super.getMap();map.put("url", LOGO_URL + src);map.put("title", title);list.add(map);}JSONArray tempJsonArray = new JSONArray();for (Map<String, Object> map : list) {JSONObject jsonObject = new JSONObject();String proxy = StringUtils.substringAfterLast(map.get("url").toString(), ".");long date = new Date().getTime();String name = date + "." + proxy;jsonObject.put("url", map.get("url").toString());jsonObject.put("dir", name);jsonObject.put("title", map.get("title").toString());// 翻译//String dateZh = super.translateEnToCinese(map.get("date")//.toString());//String titleZh = super.translateEnToCinese(map.get("title")//.toString());//json.put("title_zh_cn", dateZh + " - " + titleZh);// 下载图片super.downloadFile(map.get("url").toString(), dir + name);tempJsonArray.put(jsonObject);}array.put(new JSONObject().put(path, tempJsonArray));} catch (Exception e) {e.printStackTrace();}}public static void main(String[] args) throws Exception {new GoogleLogoCrawler().doStart();}}
?

?

热点排行