利用HtmlParser来提取网页内容

2012-12-28

利用HtmlParser来提取网页内容。本文参考：李腾飞CMS实战。使用正则表达式来分析网页的内容比较麻烦，因为html标签不区分大小写……

利用HtmlParser来提取网页内容

本文参考：李腾飞CMS实战

使用正则表达式来分析网页的内容比较麻烦，因为html标签不区分大小写，而且有的标签没有对应的结束标签。

HtmlParser也有下载网页的功能，不过它并不是专门用来做这件事的，所以这里不使用它的这个功能，而是用HttpClient来下载网页。

具体内容请看下面代码及注释：

public class PageParserTest extends TestCase {private String localFile="d:/temp.html";//使用httpChient来获取一个本地网页public void testFetch01() {try {// HttpClient主要负责执行请求，可以把它看做是一个浏览器HttpClient httpclient = new DefaultHttpClient();// 利用HTTP GET向服务器发起请求HttpGet get = new HttpGet("http://www.ibm.com/developerworks/cn/java/j-javaroundtable/index.html");// 获得服务器响应的的所有信息HttpResponse response = httpclient.execute(get);// 获得服务器响应回来的消息体（不包括HTTP HEAD）HttpEntity entity = response.getEntity();if (entity != null) {// 获得响应的字符集编码信息// 即获取HTTP HEAD的：Content-Type:text/html;charset=UTF-8中的字符集信息String charset = EntityUtils.getContentCharSet(entity);InputStream is = entity.getContent();IOUtils.copy(is, new FileOutputStream(localFile));}// 释放所有的链接资源，一般在所有的请求处理完成之后，才需要释放httpclient.getConnectionManager().shutdown();} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}}//使用HttpParser来提取网页中的图片链接地址public void testParse01() throws Exception{//把文件的内容读出来String html= IOUtils.toString(new FileInputStream(localFile),"UTF-8");//创建一个html解释器Parser parser=new Parser();parser.setInputHTML(html);//提取所有<img>标签的地址NodeList imageTags=parser.parse(new NodeClassFilter(ImageTag.class));for(int i=0;i<imageTags.size();i++){ImageTag it=(ImageTag) imageTags.elementAt(i);String imageUrl=it.getImageURL();System.out.println(imageUrl);}}//提取具有某种特征的标签public void testParse02() throws Exception{//把文件的内容读出来String html= IOUtils.toString(new FileInputStream(localFile),"UTF-8");//创建一个html解释器Parser parser=new Parser();parser.setInputHTML(html);//提取name="title"的meta标签NodeList metaTags=parser.parse(new NodeFilter(){@Overridepublic boolean accept(Node node) {if(node instanceof MetaTag){MetaTag mt=(MetaTag) node;if(mt.getMetaTagName()!=null && mt.getMetaTagName().equals("title")){return true;}}return false;}});for(int i=0; i<metaTags.size();i++){MetaTag mt=(MetaTag) metaTags.elementAt(i);System.out.println("文章的标题是："+mt.getMetaContent());}}//提取文章的简介和关键字public void testParse03() throws 
Exception{//把文件的内容读出来String html= IOUtils.toString(new FileInputStream(localFile),"UTF-8");MetaTag metaTag=ParseUtils.parseTag(html, MetaTag.class, "name", "Abstract");System.out.println("文章的简介是："+metaTag.getMetaContent());metaTag=ParseUtils.parseTag(html, MetaTag.class, "name", "Keywords");System.out.println("文章的关键字是："+metaTag.getMetaContent());}//提取文章的作者信息public void testParse04() throws Exception{//把文件的内容读出来String html= IOUtils.toString(new FileInputStream(localFile),"UTF-8");List<Div> authors=ParseUtils.parseTags(html, Div.class, "class", "author");for (Div div : authors) {System.out.println(ParseUtils.parseTag(div.getStringText(), LinkTag.class).getStringText());}}//提取文章的内容public void testParse05() throws Exception{//把文件的内容读出来String html= IOUtils.toString(new FileInputStream(localFile),"UTF-8");String content=StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_BEGIN -->", "<!-- CMA");System.out.println(content);}//把文章内容中的图片下载到本地@Testpublic void testParse06() throws Exception{HttpClient httpClient=new DefaultHttpClient();//把文件的内容读出来String html= IOUtils.toString(new FileInputStream(localFile),"UTF-8");String content=StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_BEGIN -->", "<!-- CMA");//提取内容中的图片信息Parser parser=new Parser();parser.setInputHTML(content);//提取所有<img>标签的地址NodeList imageTags=parser.parse(new NodeClassFilter(ImageTag.class));for(int i=0;i<imageTags.size();i++){ImageTag it=(ImageTag) imageTags.elementAt(i);String imageUrl=it.getImageURL();String imageName=FilenameUtils.getName(imageUrl);System.out.println(imageUrl);String url="http://www.ibm.com/developerworks/cn/java/j-javaroundtable/"+imageUrl;byte[] image=HttpUtils.getImage(httpClient, url);//存储到本地的某个磁盘IOUtils.write(image, new FileOutputStream("d:/temp/"+imageName));}httpClient.getConnectionManager().shutdown();}}

ParseUtils类：

public class ParseUtils {/** * 提取具有某个属性值的标签列表 * @param html 被提取的html文本 * @param tagType 标签的类型 * @param attributeName 某个属性的名称 * @param attributeValue 属性应取的值 * @return */public static <T extends TagNode> List<T> parseTags(String html,final Class<T> tagType,final String attributeName,final String attributeValue){try {//创建一个html解释器Parser parser=new Parser();parser.setInputHTML(html);NodeList tagList = parser.parse(new NodeFilter(){@Overridepublic boolean accept(Node node) {if(node.getClass()==tagType){T tn=(T) node;String attrValue=tn.getAttribute(attributeName);if(attrValue!=null && attrValue.equals(attributeValue)){return true;}}return false;}});List<T> tags=new ArrayList<T>();for(int i=0; i<tagList.size();i++){T t=(T) tagList.elementAt(i);tags.add(t);}return tags;} catch (ParserException e) {// TODO Auto-generated catch blocke.printStackTrace();}return null;}/** * 提取具有某个属性值的标签 * @param html 被提取的html文本 * @param tagType 标签的类型 * @param attributeName 某个属性的名称 * @param attributeValue 属性应取的值 * @return */public static <T extends TagNode> T parseTag(String html,final Class<T> tagType,final String attributeName,final String attributeValue){List<T> tags=parseTags(html, tagType, attributeName, attributeValue);if(tags!=null&&tags.size()>0){return tags.get(0);}return null;}/** * 提取具有某个属性值的标签 * @param html 被提取的html文本 * @param tagType 标签的类型 * @return */public static <T extends TagNode> T parseTag(String html,final Class<T> tagType){return parseTag(html, tagType,null,null);}/** * 提取具有某个属性值的标签列表 * @param html 被提取的html文本 * @param tagType 标签的类型 * @return */public static <T extends TagNode> List<T>  parseTags(String html,final Class<T> tagType){return parseTags(html, tagType,null,null);}}

HttpUtils类：

public class HttpUtils {public static String getHtml(HttpClient httpClient,String Url){try {// 利用HTTP GET向服务器发起请求HttpGet get = new HttpGet(Url);// 获得服务器响应的的所有信息HttpResponse response = httpClient.execute(get);// 获得服务器响应回来的消息体（不包括HTTP HEAD）HttpEntity entity = response.getEntity();if (entity != null) {// 获得响应的字符集编码信息// 即获取HTTP HEAD的：Content-Type:text/html;charset=UTF-8中的字符集信息String charset = EntityUtils.getContentCharSet(entity);InputStream is = entity.getContent();return IOUtils.toString(is, charset);}} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return null;}public static byte[] getImage(HttpClient httpClient,String Url){try {// 利用HTTP GET向服务器发起请求HttpGet get = new HttpGet(Url);// 获得服务器响应的的所有信息HttpResponse response = httpClient.execute(get);// 获得服务器响应回来的消息体（不包括HTTP HEAD）HttpEntity entity = response.getEntity();if (entity != null) {// 获得响应的字符集编码信息// 即获取HTTP HEAD的：Content-Type:text/html;charset=UTF-8中的字符集信息String charset = EntityUtils.getContentCharSet(entity);InputStream is = entity.getContent();return IOUtils.toByteArray(is);}} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return null;}}

热点排行

CSS

利用HtmlParser来提取网页内容