HTML工具类
package ssh.util;import java.util.regex.Matcher;import java.util.regex.Pattern;/** * HTML工具 * @author gary * */public class HTMLUtil {//>public static final String GT = ">";//<public static final String LT = "<";//"public static final String QUOT = """;//&public static final String AMP = "&";//空格public static final String SPACE = " ";//?public static final String COPYRIGHT = "©";//?public static final String REG = "®";//?public static final String TM = "™";//¥public static final String RMB = "¥";/** * 删除script标签 * @param str * @return */public static String delScriptTag(String str){String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>";Pattern p_script = Pattern.compile(regEx_script,Pattern.CASE_INSENSITIVE);Matcher m_script = p_script.matcher(str); str = m_script.replaceAll(""); return str.trim();}/** * 删除style标签 * @param str * @return */public static String delStyleTag(String str){String regEx_style="<style[^>]*?>[\\s\\S]*?<\\/style>"; Pattern p_style = Pattern.compile(regEx_style,Pattern.CASE_INSENSITIVE); Matcher m_style = p_style.matcher(str); str = m_style.replaceAll(""); return str;}/** * 删除HTML标签 * @param str * @return */public static String delHTMLTag(String str){String regEx_html = "<[^>]+>"; Pattern p_html = Pattern.compile(regEx_html,Pattern.CASE_INSENSITIVE); Matcher m_html = p_html.matcher(str); str = m_html.replaceAll(""); return str;}/** * 删除所有标签 * @param str * @return */public static String delAllTag(String str){ //删script str = delScriptTag(str); //删style str = delStyleTag(str); //删HTML str = delHTMLTag(str); return str; }/** * 清除标签,恢复HTML转义字符 * @param str * @return */public static String clean(String str){str = delAllTag(str);str = str.replaceAll(SPACE, " ");str = str.replaceAll(GT, ">");str = str.replaceAll(LT, "<");str = str.replaceAll(QUOT, """);str = str.replaceAll(AMP, "&");str = str.replaceAll(COPYRIGHT, "?");str = str.replaceAll(REG,"?");str = str.replaceAll(TM,"?");str = str.replaceAll(RMB,"¥");return str;}/** * 过滤指定标签 * @param str * 
@param tag * 指定标签 * @return String */public static String fiterHtmlTag(String str, String tag) {String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>";Pattern pattern = Pattern.compile(regxp);Matcher matcher = pattern.matcher(str);StringBuffer sb = new StringBuffer();boolean result1 = matcher.find();while (result1) {matcher.appendReplacement(sb, "");result1 = matcher.find();}matcher.appendTail(sb);return sb.toString();}/** * 替换指定的标签 * @param str * @param beforeTag * 要替换的标签 * @param tagAttrib * 要替换的标签属性值 * @param startTag * 新标签开始标记 * @param endTag * 新标签结束标记 * @return String * example:替换img标签的src属性值为[img]属性值[/img] */public static String replaceHtmlTag(String str, String beforeTag,String tagAttrib, String startTag, String endTag) {String regxpForTag = "<\\s*" + beforeTag + "\\s+([^>]*)\\s*>";String regxpForTagAttrib = tagAttrib + "="([^"]+)"";Pattern patternForTag = Pattern.compile(regxpForTag);Pattern patternForAttrib = Pattern.compile(regxpForTagAttrib);Matcher matcherForTag = patternForTag.matcher(str);StringBuffer sb = new StringBuffer();boolean result = matcherForTag.find();while (result) {StringBuffer sbreplace = new StringBuffer();Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag.group(1));if (matcherForAttrib.find()) {matcherForAttrib.appendReplacement(sbreplace, startTag+ matcherForAttrib.group(1) + endTag);}matcherForTag.appendReplacement(sb, sbreplace.toString());result = matcherForTag.find();}matcherForTag.appendTail(sb);return sb.toString();}public static void main(String[] args) {System.out.println(clean(URLUtil.url2Str("http://www.baidu.com")));}} 1 楼 JE帐号 2011-05-13 Pattern 好像是线程安全的.所以可以考虑把 各个Pattern提升为静态属性.