Java 正则表达式的使用及代码重构(优雅代码实现)案例
Java 正则表达式的使用及代码重构(优雅代码实现)案例
实现功能:将 "the first item is a hamer,with a cost of $132.99." 分解成单个的单词、数字和标点。
/**
 * The following program tokenizes a string into its
 * textual components: words, numbers,
 * or punctuation. Although it is a simple example, it illustrates
 * the basic techniques used to tokenize any type of input.
 */
?
?
import java.util.regex.Matcher;
import java.util.regex.Pattern;
?
/**
 * Splits a piece of text into words, numbers and punctuation marks by
 * repeatedly applying anchored regular expressions to a single
 * {@link Matcher}.
 *
 * <p>Every pattern starts with {@code \G}, which anchors the match at the
 * end of the previous match, so the matcher walks through the input from
 * left to right without skipping any character.
 *
 * @author chenhao
 */
public final class SimpleTextTokenizer {

    /**
     * End of input directly after the previous match
     * ({@code \G} = end of previous match, {@code \z} = end of input).
     */
    private static final Pattern END = Pattern.compile("\\G\\z");

    /** One or more word characters: letters, digits or underscore. */
    private static final Pattern WORD = Pattern.compile("\\G\\w+");

    /** A single punctuation character. */
    private static final Pattern PUNCT = Pattern.compile("\\G\\p{Punct}");

    /** A run of whitespace characters between tokens. */
    private static final Pattern SPACE = Pattern.compile("\\G\\s+");

    /**
     * Digits, optionally followed by a period and more digits.
     * Note that a period directly after the digits is consumed even when
     * no further digits follow it.
     */
    private static final Pattern NUMBER = Pattern.compile("\\G\\d+\\.?\\d*");

    /** Utility class; not meant to be instantiated. */
    private SimpleTextTokenizer() { }

    /**
     * Returns the next token retrieved from the given matcher.
     *
     * <p>Leading whitespace is skipped first. The patterns are then tried
     * in a fixed order: number, word, punctuation, end of input. The order
     * matters because the word pattern also matches digits, so checking
     * for a word first would split a value such as {@code 132.99}.
     *
     * @param mat
     *            the matcher over the text; its pattern and position are
     *            modified by this call
     * @return the next token, the empty string when the end of the input
     *         has been reached, or {@code null} when the text at the
     *         current position matches none of the patterns
     */
    static String getTextToken(final Matcher mat) {
        // Skip whitespace in front of the token; a failed find() leaves
        // the matcher position untouched, so the result can be ignored.
        mat.usePattern(SPACE);
        mat.find();

        mat.usePattern(NUMBER);
        if (mat.find()) {
            return mat.group();
        }

        mat.usePattern(WORD);
        if (mat.find()) {
            return mat.group();
        }

        mat.usePattern(PUNCT);
        if (mat.find()) {
            return mat.group();
        }

        mat.usePattern(END);
        if (mat.find()) {
            return "";
        }

        // Nothing matched: the text at this position is not a valid token.
        return null;
    }

    /**
     * Tokenizes a sample sentence and prints every token on its own line.
     *
     * @param args
     *            ignored
     */
    public static void main(final String[] args) {
        // The initial pattern is irrelevant: getTextToken() replaces it
        // through usePattern() before every match attempt.
        Matcher mat = END.matcher("the first item is a hamer,"
                + "with a cost of $132.99");

        String token;
        do {
            token = getTextToken(mat);
            if (token == null) {
                System.out.println("invalid token");
                break;
            }
            if (token.length() != 0) {
                System.out.println("Token " + token);
            } else {
                System.out.println("End of String");
            }
        } while (token.length() != 0);
    }
}
?
?
代码要写得优雅,但上面这段代码显然并不优雅:
1. 注释太多;
2. 方法不可重用;
3. 有丑陋的条件判断语句;
4. 可扩展性不强。
?
改进方法:
1. 用好的命名来代替注释;
2. 使用功能单一的方法;
3. 利用多态及合适的设计模式来消除条件判断。
综上,可以写出很优雅的代码,可读性、可扩展性与可维护性都会大大提高。
?
下面是代码实现
SimpleTextTokenizer.java
?
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
?
/**
 * The following program tokenizes a string into its
 * textual components: words, numbers,
 * or punctuation. Although it is a simple example, it illustrates
 * the basic techniques used to tokenize any type of input.
 */
?
?
/**
?* @author chenhao
?*
?*/
public final class SimpleTextTokenizer {
?
?? ?/**
?? ? *"\G"表示前一匹配的结尾 , "/z"表示字符串的结尾(包括)行终止符 end 用来匹配字符串的结尾.
?? ? */
?? ?private static Pattern end = Pattern.compile("\\G\\z");
?
?? ?/**
?? ? * "\g\w+" 表示从前一批匹配的结尾开始匹配一个或多个可以成为单词的的. 一部分的字符. 如A - Z a-z 0 - 9 及下划线
?? ? * (单词字符)
?? ? */
?? ?private static Pattern word = Pattern.compile("\\G\\w+");
?
?? ?/**
?? ? * . "/p{Punct}" 包含所有的标点符号
?? ? */
?? ?private static Pattern punct = Pattern.compile("\\G\\p{Punct}");
?? ?/**
?? ? *"\s" 匹配空格.
?? ? */
?? ?private static Pattern space = Pattern.compile("\\G\\s");
?
?? ?/**
?? ? * . 匹配数字
?? ? */
?? ?private static Pattern number = Pattern.compile("\\G\\d+\\.?\\d*");
?
?? ?private SimpleTextTokenizer() { }
?? ?class InvalidToken implements TokenPrinter {
?? ? ? ?/*
?? ? ? ? * (non-Javadoc)
?? ? ? ? * @see TokenPrinter#print()
?? ? ? ? */
?? ? ? ?public int print() {
?? ? ? ? ? ?System.out.println("invalid token");
?? ? ? ? ? ?return 0;
?? ? ? ?}
?? ?}
?
?? ?class EndToken implements TokenPrinter {
?
?? ? ? ?/* (non-Javadoc)
?? ? ? ? * @see TokenPrinter#print()
?? ? ? ? */
?? ? ? ?public int print() {
?? ? ? ? ? ?System.out.println("End of String");
?? ? ? ? ? ?return 0;
?? ? ? ?}
?
?? ?}
?
?? ?class NormalToken implements TokenPrinter {
?? ? ? ?private String token;
?
?? ? ? ?public NormalToken(final String token) {
?? ? ? ? ? ?this.token = token;
?? ? ? ?}
?
?? ? ? ?/*
?? ? ? ? * (non-Javadoc)
?? ? ? ? * @see TokenPrinter#print()
?? ? ? ? */
?? ? ? ?public int print() {
?? ? ? ? ? ?System.out.println("Token " + token);
?? ? ? ? ? ?return 1;
?? ? ? ?}
?
?? ?}
?
?? ?/**
?? ? * the method returns the next token retrieved from the Matcher passed to.
?? ? * @param mat
?? ? * ? ? ? ? ? ?the Matcher of the text
?? ? * @return the next token of the text
?? ? */
?? ?static TokenPrinter getNextTokenMtchPattern(final Matcher mat) {
?? ? ? ?skipLeadingSpaces(mat);
?? ? ? ?return nextTokenMtchPattern(mat);
?? ?}
?
?? ?/**
?? ? * @param mat
?? ? * ? ? ? ? ? ?the matcher of a string
?? ? * @return the next token of the string
?? ? */
?? ?private static TokenPrinter nextTokenMtchPattern(final Matcher mat) {
?? ? ? ?// Next, obtain the next token in the string
?? ? ? ?// by attempting to match each pattern.
?? ? ? ?// The token found by the first matching pattern
?? ? ? ?// is returned. The order in which the patterns
?? ? ? ?// are tried matters. Checking for a word
?? ? ? ?// before checking for a number can change the results.
?? ? ? ?// First check for a number
?? ? ? ?LinkedList<Pattern> patternList = createPatternList();
?? ? ? ?for (Pattern pat : patternList) {
?? ? ? ? ? ?mat.usePattern(pat);
?? ? ? ? ? ?if (mat.find()) {
?? ? ? ? ? ? ? ?return ?createTokenPrinter(mat.group());
?? ? ? ? ? ?}
?? ? ? ?}
?? ? ? ?return createTokenPrinter(null); // the patternList is null;
?? ?}
?
?? ?/**
?? ? * @return a linkedList of Pattern;
?? ? */
?? ?private static LinkedList<Pattern> createPatternList() {
?? ? ? ?LinkedList<Pattern> patternList = new LinkedList<Pattern>();
?
?? ? ? ?patternList.add(number);
?? ? ? ?patternList.add(word);
?? ? ? ?patternList.add(punct);
?? ? ? ?patternList.add(end);
?? ? ? ?return patternList;
?? ?}
?
?? ?/**
?? ? * create a TokenPrinter use the token match the Pattern.
?? ? * @param token the token match the Pattern
?? ? * @return tokenPrinter
?? ? */
?? ?private static TokenPrinter createTokenPrinter(final String token) {
?? ? ? ?SimpleTextTokenizer textTokenizer = new SimpleTextTokenizer();
?? ? ? ?if (token == null) {
?? ? ? ? ? ?return textTokenizer.new InvalidToken();
?? ? ? ?}
?? ? ? ?if (token.length() == 0) {
?? ? ? ? ? ?return textTokenizer.new EndToken();
?? ? ? ?} else {
?? ? ? ? ? ?return textTokenizer.new NormalToken(token);
?? ? ? ?}
?
?? ?}
?? ?/**
?? ? * @param mat
?? ? * ? ? ? ? ? ?the matcher of a string. we won't let the mat point to anther
?? ? * ? ? ? ? ? ?object so it is final
?? ? */
?? ?private ?static void displayTokensInString(final Matcher mat) {
?? ? ? ?TokenPrinter tokenPtinter;
?? ? ? ?do {
?? ? ? ? ? ?tokenPtinter = getNextTokenMtchPattern(mat);
?? ? ? ?} while (tokenPtinter.print() != 0);
?? ?}
?? ?/**
?? ? * @param mat
?? ? * ? ? ? ? ? ?the matcher
?? ? */
?? ?private static void skipLeadingSpaces(final Matcher mat) {
?? ? ? ?mat.usePattern(space);
?? ? ? ?mat.find();
?? ?}
?
?? ?/**
?? ? * @param args
?? ? * ?does nothing in this programs
?? ? */
?? ?public static void main(final String[] args) {
?? ? ? ?// Create a matcher
?? ? ? ?Matcher mat = end.matcher("the first item is a hamer,"
?? ? ? ? ? ? ? ?+ "with a cost of $132.99.");
?? ? ? ?displayTokensInString(mat);
?? ?}
}
?
TokenPrinter.java
/**
 * A token that knows how to print itself.
 *
 * @author chenhao
 */
interface TokenPrinter {
    /**
     * Prints the token itself.
     *
     * @return a flag for the caller's loop: the implementations in
     *         SimpleTextTokenizer return 1 for an ordinary token and 0
     *         for the end-of-input and invalid-token cases, and the
     *         display loop stops as soon as 0 is returned
     */
    int print();
}
?