lexer html解析一个js过滤的改进
问题描述:使用 htmlparser 的 Lexer 解析器解析页面时,发现类似如下的页面会有问题(<script> 中的 JavaScript 含有形如标签的 "<"、">" 文本时,会被误当作 HTML 标签解析):
?
?
import java.net.URLConnection;import org.htmlparser.Node;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.nodes.TagNode;import org.htmlparser.util.ParserException;import org.slf4j.Logger;import org.slf4j.LoggerFactory;/** * @author edwardpro * */public class LexerFixed extends Lexer {private static final Logger logger = LoggerFactory.getLogger(LexerFixed.class);/** * */private static final long serialVersionUID = 8425806017089419815L;//script标签标记,如果发现当前在script里就掠过所有的< >private int script=0;/** * */public LexerFixed() {super();}/** * @param page */public LexerFixed(Page page) {super(page);}/** * @param text */public LexerFixed(String text) {super(text);}/** * @param connection * @throws ParserException */public LexerFixed(URLConnection connection) throws ParserException {super(connection);}@Overridepublic Node nextNode(boolean quotesmart) throws ParserException {Node ret = super.nextNode(quotesmart);checkTag(ret);return (ret);}/** * checkTag用于修改tagNode的方法当有入参数时都会进行一次参数修正另外對內容進行一下escape操作並且會進行判斷是否存在已經escape的蹟象 * * @param node */private void checkTag(Node node) {if (node != null && node instanceof TagNode&& !((TagNode) node).isEmptyXmlTag()) {String tagName = ((TagNode) node).getTagName();if("SCRIPT".equalsIgnoreCase(tagName)){if (!((TagNode) node).isEndTag() ) {this.script=1;} else{this.script=0;}}}}@Overrideprotected Node parseString(int start, boolean quotesmart)throws ParserException {boolean done;char ch;char quote;done = false;quote = 0;while (!done) {ch = mPage.getCharacter(mCursor);if (Page.EOF == ch)done = true;else if (0x1b == ch) // escape{ch = mPage.getCharacter(mCursor);if (Page.EOF == ch)done = true;else if ('$' == ch) {ch = mPage.getCharacter(mCursor);if (Page.EOF == ch)done = true;// JIS X 0208-1978 and JIS X 0208-1983else if ('@' == ch || 'B' == ch)scanJIS(mCursor);/* * // JIS X 0212-1990 else if ('(' == ch) { ch = * mPage.getCharacter (mCursor); if (Page.EOF == ch) done = * true; else if ('D' == ch) scanJIS 
(mCursor); else { * mPage.ungetCharacter (mCursor); mPage.ungetCharacter * (mCursor); mPage.ungetCharacter (mCursor); } } */else {mPage.ungetCharacter(mCursor);mPage.ungetCharacter(mCursor);}} elsemPage.ungetCharacter(mCursor);} else if (quotesmart && (0 == quote)&& (('\'' == ch) || ('"' == ch)))quote = ch; // enter quoted state// patch from Gernot Fricke to handle escaped closing quoteelse if (quotesmart && (0 != quote) && ('\\' == ch)) {ch = mPage.getCharacter(mCursor); // try to consume escapeif ((Page.EOF != ch) && ('\\' != ch) // escaped backslash&& (ch != quote)) // escaped quote character// ( reflects ["] or ['] whichever opened the quotation)mPage.ungetCharacter(mCursor); // unconsume char if char not// an escape} else if (quotesmart && (ch == quote))quote = 0; // exit quoted stateelse if (quotesmart && (0 == quote) && (ch == '/')) {// handle multiline and double slash comments (with a quote)// in script like:// I can't handle single quotations.ch = mPage.getCharacter(mCursor);if (Page.EOF == ch)done = true;else if ('/' == ch) {doch = mPage.getCharacter(mCursor);while ((Page.EOF != ch) && ('\n' != ch));} else if ('*' == ch) {do {doch = mPage.getCharacter(mCursor);while ((Page.EOF != ch) && ('*' != ch));ch = mPage.getCharacter(mCursor);if (ch == '*')mPage.ungetCharacter(mCursor);} while ((Page.EOF != ch) && ('/' != ch));} elsemPage.ungetCharacter(mCursor);} else if ((0 == quote) && ('<' == ch)) {ch = mPage.getCharacter(mCursor);if (Page.EOF == ch)done = true;// the order of these tests might be optimized for speed:else if ('/' == ch|| (Character.isLetter(ch) && this.script==0)|| '!' == ch || '%' == ch || '?' == ch) {done = true;mPage.ungetCharacter(mCursor);mPage.ungetCharacter(mCursor);} else {// it's not a tag, so keep going, but check for quotesmPage.ungetCharacter(mCursor);}}}return (makeString(start, mCursor.getPosition()));}}?