一个简单的网页抓取例子
package net;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.MalformedURLException;import java.net.URL;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;public class WebContent {/** * 读取一个网页全部内容 * * @param htmlurl * @return String 网页内容 * @throws IOException */public String getOneHtml(final String htmlurl) throws IOException {URL url;String temp;final StringBuffer htmlContent = new StringBuffer();try {url = new URL(htmlurl);System.out.println(url.getProtocol());final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));// 读取网页全部内容while ((temp = in.readLine()) != null) {htmlContent.append(temp);}in.close();} catch (final MalformedURLException me) {System.out.println("你输入的URL格式有问题!请仔细输入");me.getMessage();throw me;} catch (final IOException e) {e.printStackTrace();throw e;}return htmlContent.toString();}/** * * @param s * @return 获得网页标题 */public String getTitle(final String s) {String regex = "<title>.*?</title>";;String title = "";final List<String> list = new ArrayList<String>();final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);final Matcher ma = pa.matcher(s);while (ma.find()) {list.add(ma.group());}for (int i = 0; i < list.size(); i++) {title = title + list.get(i);}return outTag(title);}/** * * @param s * @return 获得链接 */public List<String> getLink(final String s) {String regex;final List<String> list = new ArrayList<String>();regex = "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>";final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);final Matcher ma = pa.matcher(s);while (ma.find()) {list.add(ma.group());}return list;}/** * * @param s * @return 获得脚本代码 */public List<String> getScript(final String s) {String regex;final List<String> list = new ArrayList<String>();regex = "<script.*?</script>";final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);final Matcher ma = pa.matcher(s);while (ma.find()) {list.add(ma.group());}return list;}/** * * @param s * @return 获得CSS */public List<String> getCSS(final String s) {String regex;final List<String> list = new ArrayList<String>();regex = "<style.*?</style>";final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);final Matcher ma = pa.matcher(s);while (ma.find()) {list.add(ma.group());}return list;}/** * * @param s * @return 去掉标记 */public String outTag(final String s) {return s.replaceAll("<.*?>", "");}/** * * @param s * @return */public HashMap<String, List<String>> getFromUrls(final String url) {final HashMap<String, List<String>> result = new HashMap<String, List<String>>();String content = "";System.out.println("\n------------------开始读取网页(" + url+ ")--------------------");try {content = getOneHtml(url);} catch (final Exception e) {e.getMessage();return null;}System.out.println("------------------读取网页(" + url+ ")结束--------------------\n");System.out.println("------------------分析网页(" + url+ ")结果如下--------------------\n");List<String> title = new ArrayList<String>();title.add(getTitle(content));result.put("title", title);result.put("css", getCSS(content));result.put("script", getScript(content));result.put("link", getLink(content));return result;}/** * @param args */public static void main(final String args[]) {String url = "";final List<String> list = new ArrayList<String>();System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行: \n");final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));try {while (!(url = br.readLine()).equals("go")) // 如果输入不是go那么一直读取{list.add(url);}} catch (final Exception e) {e.getMessage();}final WebContent wc = new WebContent();HashMap<String, List<String>> hashMap = new HashMap<String, List<String>>();for (int i = 0; i < list.size(); i++) {if (wc.getFromUrls(list.get(i)) != null) {hashMap = wc.getFromUrls(list.get(i));}for (Iterator<String> iter = hashMap.keySet().iterator(); iter.hasNext();) {String key = iter.next();List<String> list2 = hashMap.get(key);System.out.println("--" + key + "内容如下:");for (int j = 0; j < list2.size(); j++) {System.out.println(list2.get(j));}}}}}