首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 计算机考试 > 等级考试 > 复习指导 >

使用java将网页保存为mht格式(1)(2)

2009-01-05 
mht格式网页

  /**
   * Looks for a BASE tag among the given nodes and, when its href attribute
   * holds a parseable URL, stores that URL in the strWeb field.
   *
   * @param nodes tags of the parsed page to search
   */
  private void extractAllScriptNodes(NodeList nodes) {
    NodeList baseTags =
        nodes.extractAllNodesThatMatch(new TagNameFilter("BASE"), true);
    if (baseTags == null || baseTags.size() <= 0) {
      return;
    }

    // Only the first BASE tag found is consulted.
    Tag baseTag = (Tag) baseTags.elementAt(0);
    String baseHref = baseTag.getAttribute("href");
    if (baseHref == null || baseHref.length() <= 0) {
      return;
    }

    try {
      strWeb = new URL(baseHref);
    } catch (MalformedURLException ex) {
      // An unparseable href leaves strWeb unchanged; only the trace is printed.
      ex.printStackTrace();
    }
  }
  /**
   * Collects the external script and stylesheet links referenced by a page and
   * rewrites each of those links, in place, to its absolute form.
   *
   * <p>Every "script" tag with a non-empty src, and every "link" tag that is
   * recognised as a stylesheet and has a non-empty href, is resolved through
   * makeAbsoluteURL against strWeb. A link that cannot be resolved is left
   * exactly as it appears in the page and is not reported.
   *
   * @param nodes  tags of the parsed page
   * @param urlMap URLs collected so far, keyed by absolute URL with the link as
   *               written in the page as value; newly found URLs are added
   * @return one two-element list per newly found URL, holding the link as
   *         written in the page followed by its absolute URL
   */
  private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {
    ArrayList urlList = new ArrayList();

    // External script files.
    NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
    for (int i = 0; i < filtered.size(); i++) {
      Tag tag = (Tag) filtered.elementAt(i);
      String src = tag.getAttribute("src");
      if (src == null || src.length() == 0) {
        continue;
      }
      String absoluteURL = makeAbsoluteURL(strWeb, src);
      if (absoluteURL == null) {
        // Unresolvable link: keep the original attribute instead of replacing it with null.
        continue;
      }
      if (!urlMap.containsKey(absoluteURL)) {
        urlMap.put(absoluteURL, src);
        ArrayList urlInfo = new ArrayList();
        urlInfo.add(src);
        urlInfo.add(absoluteURL);
        urlList.add(urlInfo);
      }
      tag.setAttribute("src", absoluteURL);
    }

    // External stylesheets.
    filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
    for (int i = 0; i < filtered.size(); i++) {
      Tag tag = (Tag) filtered.elementAt(i);
      String type = tag.getAttribute("type");
      String rel = tag.getAttribute("rel");
      String href = tag.getAttribute("href");

      // Either attribute may mark the link as CSS; both are matched ignoring case.
      boolean isCssFile =
          (rel != null
              && rel.toLowerCase(java.util.Locale.ENGLISH).indexOf("stylesheet") != -1)
          || (type != null
              && type.toLowerCase(java.util.Locale.ENGLISH).indexOf("text/css") != -1);
      if (!isCssFile || href == null || href.length() == 0) {
        continue;
      }

      String absoluteURL = makeAbsoluteURL(strWeb, href);
      if (absoluteURL == null) {
        // Unresolvable link: keep the original attribute instead of replacing it with null.
        continue;
      }
      if (!urlMap.containsKey(absoluteURL)) {
        urlMap.put(absoluteURL, href);
        ArrayList urlInfo = new ArrayList();
        urlInfo.add(href);
        urlInfo.add(absoluteURL);
        urlList.add(urlInfo);
      }
      tag.setAttribute("href", absoluteURL);
    }
    return urlList;
  }
  /**
  *方法说明:抽取网页包含的图像链接
  *输入参数:nodes 网页标签集合; urlMap 已存在的url集合
  *返回类型:图像链接集合
  */

    private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {
  ArrayList urlList = new ArrayList();
  NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
  for (int i = 0; i < filtered.size(); i++) {
  Tag tag = (Tag) filtered.elementAt(i);
  String src = tag.getAttribute("src");
  // Handle external css file’s url
  if (src != null && src.length() > 0) {
  String innerURL = src;
  String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
  if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
  urlMap.put(absoluteURL, innerURL);
  ArrayList urlInfo = new ArrayList();
  urlInfo.add(innerURL);
  urlInfo.add(absoluteURL);
  urlList.add(urlInfo);
  }
  tag.setAttribute("src", absoluteURL);
  }
  }
  return urlList;
  }
  /**
   * Converts a link found in a page into an absolute URL string.
   *
   * <p>The query string (everything from the first '?') is dropped first.
   * A link that already starts with http:// or https:// (any letter case) is
   * returned as it is. Any other link is resolved against strWeb, and the
   * result is then passed through JHtmlClear.replace for "../" and "./" with
   * an empty replacement (presumably removing those sequences; confirm
   * against JHtmlClear). The returned value is also printed to standard output.
   *
   * @param strWeb   address of the page, used as base for relative links
   * @param innerURL link as written in the page; may be null
   * @return the absolute URL, or null when innerURL is null or cannot be
   *         resolved into a URL
   */
  public static String makeAbsoluteURL(URL strWeb, String innerURL) {
    // Guard first: the original dereferenced innerURL before its null check.
    if (innerURL == null) {
      return null;
    }

    // Drop the query string.
    String link = innerURL;
    int queryStart = link.indexOf('?');
    if (queryStart != -1) {
      link = link.substring(0, queryStart);
    }

    // Require the full scheme prefix so that relative paths that merely begin
    // with "http" (for example "httpdocs/a.js") are still resolved.
    boolean alreadyAbsolute = link.regionMatches(true, 0, "http://", 0, 7)
        || link.regionMatches(true, 0, "https://", 0, 8);
    if (alreadyAbsolute) {
      System.out.println(link);
      return link;
    }

    URL linkUri;
    try {
      linkUri = new URL(strWeb, link);
    } catch (MalformedURLException e) {
      e.printStackTrace();
      return null;
    }

    String absURL = linkUri.toString();
    absURL = JHtmlClear.replace(absURL, "../", "");
    absURL = JHtmlClear.replace(absURL, "./", "");
    System.out.println(absURL);
    return absURL;
  }
  /**
  *方法说明:创建mht文件
  *输入参数:content 网页文本内容; urlScriptList 脚本链接集合; urlImageList 图片链接集合
  *返回类型:
  */
  private void createMhtArchive(String content, ArrayList urlScriptList, ArrayList urlImageList) throws Exception {
  //Instantiate a Multipart object
  MimeMultipart mp = new MimeMultipart("related");
  Properties props = new Properties();
  props.put("mail.smtp.host", smtp);
  Session session = Session.getDefaultInstance(props, null);
  MimeMessage msg = new MimeMessage(session);
  // set mailer
  msg.setHeader("X-Mailer", "Code Manager .SWT");
  // set from
  if (from != null) {
  msg.setFrom(new InternetAddress(from));
  }
  // set subject
  if (subject != null) {
  msg.setSubject(subject);
  }
  // to
  if (to != null) {
  InternetAddress[] toAddresses = getInetAddresses(to);
  msg.setRecipients(Message.RecipientType.TO, toAddresses);
  }
  // cc
  if (cc != null) {
  InternetAddress[] ccAddresses = getInetAddresses(cc);
  msg.setRecipients(Message.RecipientType.CC, ccAddresses);
  }
  // bcc
  if (bcc != null) {
  InternetAddress[] bccAddresses = getInetAddresses(bcc);
  msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
  }

 

3COME考试频道为您精心整理,希望对您有所帮助,更多信息在http://www.reader8.net/exam/

热点排行