首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 其他教程 > 互联网 >

一个简易的网页爬虫,可用于下载在线API文档

2012-12-24 
一个简易的网页爬虫,可用于下载在线API文档。 package wkx; import java.io.File; import java.io.FileNotFound…

一个简易的网页爬虫,可用于下载在线API文档

package wkx;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * A minimal recursive crawler that mirrors a tree of {@code .html} pages to a
 * local directory.
 *
 * <p>Starting from one page, every double-quoted fragment of the page source
 * that ends in {@code .html}, does not start with {@code http} and contains no
 * space is treated as a relative link. The link is resolved against the
 * current page's directory and fetched in turn. Each URL is fetched at most
 * once per JVM run.
 *
 * <p>The crawl is recursive and single-threaded; the visited set is a plain
 * {@link HashSet} with no synchronisation.
 */
public class Main {

    /** Full URLs that have already been requested, to avoid fetching a page twice. */
    private static final Set<String> filenames = new HashSet<String>();

    /** Same expression the link filter has always used, compiled once instead of per fragment. */
    private static final Pattern HTML_LINK = Pattern.compile(".*\\.html$");

    /**
     * Fetches the body of {@code url} as a string.
     *
     * @param url absolute URL of the page to fetch
     * @return the response body when the server answers {@code 200 OK};
     *         {@code null} on any other status or on an I/O failure
     */
    public static String getSource(String url) {
        String response = null;
        HttpClient client = new HttpClient();
        HttpMethod method = null;
        try {
            // Pages are only read, so issue a GET; a POST is commonly rejected
            // by servers that host static documentation.
            method = new GetMethod(url);
            client.executeMethod(method);
            if (method.getStatusCode() == HttpStatus.SC_OK) {
                response = method.getResponseBodyAsString();
            }
        } catch (IOException e) {
            // Report which URL failed and why instead of a bare message.
            System.out.println("Get Source Error! " + url + " (" + e + ")");
        } finally {
            if (method != null) {
                method.releaseConnection();
            }
        }
        return response;
    }

    /**
     * Downloads {@code url + "/" + cur}, stores it as {@code cur} inside
     * {@code froot}, then recurses into every relative {@code .html} link
     * found in the page.
     *
     * <p>Links whose {@code ..} segments would climb above the URL's host or
     * above the top of the local path are skipped.
     *
     * @param url   URL of the directory containing the page (no trailing slash)
     * @param froot local directory that mirrors {@code url}
     * @param cur   file name of the page within that directory
     */
    public static void create(String url, String froot, String cur) {
        String curUrl = url + "/" + cur;
        // Set.add returns false when the URL was already visited.
        if (!filenames.add(curUrl)) {
            return;
        }
        String cont = getSource(curUrl);
        if (cont == null) {
            return;
        }

        save(new File(froot), cur, cont);

        // Attribute values are delimited by double quotes, so splitting on the
        // quote yields every href/src value (plus unrelated text that the
        // filter below discards).
        String[] files = cont.split("\"");
        for (String file : files) {
            if (!HTML_LINK.matcher(file).matches()
                    || file.startsWith("http")
                    || file.contains(" ")) {
                continue;
            }

            // Strings are immutable: the normalised value must be kept.
            String link = file.replaceAll("/+", "/");
            int queryStart = link.lastIndexOf('?');
            if (queryStart != -1) {
                link = link.substring(0, queryStart);
            }

            String turl = url;
            File tdir = new File(froot);
            boolean resolvable = true;
            int index;
            while ((index = link.indexOf('/')) != -1) {
                String segment = link.substring(0, index);
                link = link.substring(index + 1);
                if (segment.equals("..")) {
                    turl = parentUrl(turl);
                    tdir = tdir.getParentFile();
                    if (turl == null || tdir == null) {
                        // The link escapes the crawl root; ignore it.
                        resolvable = false;
                        break;
                    }
                } else if (segment.length() > 0 && !segment.equals(".")) {
                    turl = turl + "/" + segment;
                    tdir = new File(tdir, segment);
                }
                // "." and empty segments leave the current directory unchanged.
            }

            if (resolvable && link.length() > 0) {
                create(turl, tdir.getPath(), link);
            }
        }
    }

    /**
     * Writes {@code content} to the file {@code name} inside {@code dir},
     * creating the directory first if necessary. Failures are reported and
     * otherwise ignored so that the crawl can continue.
     */
    private static void save(File dir, String name, String content) {
        if (!dir.exists()) {
            dir.mkdirs();
        }
        FileOutputStream fos = null;
        try {
            // FileOutputStream creates the file itself; no createNewFile() needed.
            fos = new FileOutputStream(new File(dir, name));
            // NOTE(review): getBytes() uses the platform default charset, which
            // may differ from the charset the page was decoded with.
            fos.write(content.getBytes());
            fos.flush();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns {@code url} with its last path segment removed, or {@code null}
     * when there is no path segment left to remove (so the {@code scheme://host}
     * part is never truncated).
     */
    private static String parentUrl(String url) {
        int schemeEnd = url.indexOf("://");
        int pathStart = schemeEnd == -1 ? 0 : schemeEnd + 3;
        int lastSlash = url.lastIndexOf('/');
        if (lastSlash < pathStart) {
            return null;
        }
        return url.substring(0, lastSlash);
    }

    /** Mirrors {@code http://localhost/index.html} and everything it links to. */
    public static void main(String[] args) {
        String url = "http://localhost";
        String froot = "C:\\Users\\Jack_Wong\\Desktop\\api";
        create(url, froot, "index.html");
    }
}

热点排行