首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 计算机考试 > 等级考试 > 复习指导 >

使用Java将网页保存为MHT格式(1)

2009-01-05 
mht格式网页

    package com.tag;
  import java.io.BufferedInputStream;
  import java.io.BufferedOutputStream;
  import java.io.BufferedReader;
  import java.io.ByteArrayInputStream;
  import java.io.DataOutputStream;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.FileWriter;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.OutputStream;
  import java.io.Reader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.*;
  import org.htmlparser.Parser;
  import org.htmlparser.Tag;
  import org.htmlparser.filters.TagNameFilter;
  import org.htmlparser.lexer.Lexer;
  import org.htmlparser.lexer.Page;
  import org.htmlparser.util.DefaultParserFeedback;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
  import toptrack.tools.JQuery;
  import javax.activation.DataHandler;
  import javax.activation.DataSource;
  import javax.activation.MimetypesFileTypeMap;
  import javax.mail.Message;
  import javax.mail.MessagingException;
  import javax.mail.Multipart;
  import javax.mail.Session;
  import javax.mail.internet.InternetAddress;
  import javax.mail.internet.MimeBodyPart;
  import javax.mail.internet.MimeMessage;
  import javax.mail.internet.MimeMultipart;
  import javax.mail.internet.MimePartDataSource;
  /**
  * mht文件解析类
  * @author examda
  */
  public class Html2MHTCompiler {
  /** Address of the web page; parsed from the {@code strUrl} constructor argument. */
  private URL strWeb = null;
  /** Text content of the web page; rewritten in place by {@code compile()}. */
  private String strText = null;
  /** Local file name. NOTE(review): presumably the target .mht file - confirm against createMhtArchive. */
  private String strFileName = null;
  /** Encoding of the web page; handed to the HTML parser in {@code compile()}. */
  private String strEncoding = null;
  // Additional information for the MHT format.
  // NOTE(review): none of the fields below is read in the visible part of the class;
  // presumably they fill in the mail-style headers of the archive - confirm.
  private String from = "dongle2001@126.com";
  private String to;
  private String subject = "mht compile";
  private String cc;
  private String bcc;
  private String smtp = "localhost";
  /**
   * Demo entry point: obtains the text of one hard-coded page through
   * {@code JQuery.getHtmlText} and runs {@code compile()} on it, passing
   * {@code test.mht} as the local file name. Does nothing further when
   * {@code JQuery.getHtmlText} returns {@code null}.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    final String pageUrl = "http://www.mtime.com/my/tropicofcancer/blog/843555/";
    final String charset = "utf-8";
    final String html = JQuery.getHtmlText(pageUrl, charset, null);
    if (html != null) {
      Html2MHTCompiler compiler = new Html2MHTCompiler(html, pageUrl, charset, "test.mht");
      compiler.compile();
    }
  }
  /**
   * Creates a compiler for a page whose text has already been obtained.
   *
   * <p>If {@code strUrl} is not a well-formed URL, the stack trace is printed
   * and none of the fields is assigned.
   *
   * @param strText page text content
   * @param strUrl address of the page
   * @param strEncoding page encoding
   * @param strFileName local file name
   */
  public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
    try {
      this.strWeb = new URL(strUrl);
      // Only reached when the URL parsed; on failure every field keeps its initial value.
      this.strText = strText;
      this.strEncoding = strEncoding;
      this.strFileName = strFileName;
    } catch (MalformedURLException malformedUrl) {
      malformedUrl.printStackTrace();
    }
  }
  /**
  *方法说明:执行下载操作
  *输入参数:
  *返回类型:
  */

    public boolean compile() {
  if (strWeb == null || strText == null || strFileName == null || strEncoding == null)
  return false;
  HashMap urlMap = new HashMap();
  NodeList nodes = new NodeList();
  try {
  Parser parser = createParser(strText);
  parser.setEncoding(strEncoding);
  nodes = parser.parse(null);
  } catch (ParserException e) {
  // TODO Auto-generated catch block
  e.printStackTrace();
  }
  extractAllScriptNodes(nodes);
  ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
  ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);
  for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {
  Map.Entry entry = (Map.Entry) iter.next();
  String key = (String)entry.getKey();
  String val = (String)entry.getValue();
  strText = JHtmlClear.replace(strText, val, key);
  }
  try {
  createMhtArchive(strText, urlScriptList, urlImageList);
  } catch (Exception e) {
  // TODO Auto-generated catch block
  e.printStackTrace();
  return false;
  }
  return true;
  }
  /**
   * Builds an HTML parser over the given page text.
   *
   * @param inputHTML page text content
   * @return a {@code Parser} whose {@code Lexer} reads a {@code Page} wrapping
   *     {@code inputHTML}, configured with {@code DefaultParserFeedback.QUIET}
   */
  private Parser createParser(String inputHTML) {
    Page page = new Page(inputHTML);
    Lexer lexer = new Lexer(page);
    DefaultParserFeedback quietFeedback = new DefaultParserFeedback(DefaultParserFeedback.QUIET);
    return new Parser(lexer, quietFeedback);
  }
  /**
  *方法说明:抽取基础URL地址
  *输入参数:nodes 网页标签集合
  *返回类型:
  */

热点排行