paoding基于词典如何分词
上次介绍了Paoding的字典数据结构,这次介绍paoding如何依据词典对待分词的文本进行分词。paoding查找字典时依靠两个类: BinaryDictionary 和 HashBinaryDictionary。上次已经介绍过这两个数据结构,这里不再重复。
HashBinaryDictionary其实就是把大块数据词典切分成小块的词典,并用BinaryDictionary存储。用HashBinaryDictionary的search方法查找时,采用的是递归方法,最终还是会进入BinaryDictionary中。看下面两个search方法的代码便知道(前一个属于HashBinaryDictionary,后一个属于BinaryDictionary):
public Hit search(CharSequence input, int begin, int count) {SubDictionaryWrap subDic = (SubDictionaryWrap) subs.get(keyOf(input.charAt(hashIndex + begin)));if (subDic == null) {return Hit.UNDEFINED;}Dictionary dic = subDic.dic;// 对count==hashIndex + 1的处理if (count == hashIndex + 1) {Word header = dic.get(0);if (header.length() == hashIndex + 1) {if (subDic.wordIndexOffset + 1 < this.ascWords.length) {return new Hit(subDic.wordIndexOffset, header,this.ascWords[subDic.wordIndexOffset + 1]);} else {return new Hit(subDic.wordIndexOffset, header, null);}} else {return new Hit(Hit.UNCLOSED_INDEX, null, header);}}// count > hashIndex + 1Hit word = dic.search(input, begin, count);if (word.isHit()) {int index = subDic.wordIndexOffset + word.getIndex();word.setIndex(index);if (word.getNext() == null && index < size()) {word.setNext(get(index + 1));}}return word;}public Hit search(CharSequence input, int begin, int count) {int left = this.start;int right = this.end - 1;int pointer = 0;Word word = null;int relation;//while (left <= right) {pointer = (left + right) >> 1;word = ascWords[pointer];relation = compare(input, begin, count, word);if (relation == 0) {// System.out.println(new String(input,begin, count)+"***" +// word);int nextWordIndex = pointer + 1;if (nextWordIndex >= ascWords.length) {return new Hit(pointer, word, null);}else {return new Hit(pointer, word, ascWords[nextWordIndex]);}}if (relation < 0)right = pointer - 1;elseleft = pointer + 1;}//if (left >= ascWords.length) {return Hit.UNDEFINED;}//boolean asPrex = true;Word nextWord = ascWords[left];if (nextWord.length() < count) {asPrex = false;}for (int i = begin, j = 0; asPrex && j < count; i++, j++) {if (input.charAt(i) != nextWord.charAt(j)) {asPrex = false;}}return asPrex ? new Hit(Hit.UNCLOSED_INDEX, null, nextWord) : Hit.UNDEFINED;}Arrays.sort(array);这个方法,sort的解释如下:
/**
 * Prints six Chinese words in the order produced by {@code Arrays.sort}.
 * The elements are Strings held in an {@code Object[]}, so the sort uses the
 * elements' natural ordering.
 */
public class TestArraySort {
    public static void main(String[] args) {
        HashSet<String> words = new HashSet<String>();
        String[] samples = {"三心二意", "五谷丰登", "六六大顺", "三个人", "五个人", "六个人"};
        for (String sample : samples) {
            words.add(sample);
        }

        Object[] sorted = words.toArray();
        Arrays.sort(sorted);
        for (Object entry : sorted) {
            System.out.println(entry);
        }
    }
}

/**
 * Prints the numeric char values of '三' and '五'.
 * Note: the output label says "category", but the value printed is the
 * character converted to int, not a Unicode category.
 */
public class TestCharactor {
    public static void main(String[] args) {
        final int first = '三';
        final int second = '五';
        System.out.println("The category of c1 is: " + first);
        System.out.println("The category of c2 is: " + second);
    }
}

/**
 * Same demonstration as the previous snippet, for the characters '个' and '心'
 * (the second characters of "三个人" and "三心二意").
 */
public class TestCharactor {
    public static void main(String[] args) {
        final int first = '个';
        final int second = '心';
        System.out.println("The category of c1 is: " + first);
        System.out.println("The category of c2 is: " + second);
    }
}