从一篇文章中筛选出辞典生词本中没有的单词,导出成txt文件
代码写得比较烂,只是从几篇文章中过滤出单词本中没有的单词,导出成单独的一个文件,再手动一个一个录入到单词本中。小工具,mark一下。使用dom4j。
<?xml version="1.0" encoding="UTF-8"?><wordbook><item><word>daunting</word><trans><![CDATA[adj. 使人畏缩的;使人气馁的;令人怯步的 daunting: 令人沮丧 | 使人畏缩的 | 使人气馁的]]></trans><phonetic><![CDATA[['dɔ:ntiŋ]]]></phonetic><tags/><progress>10</progress></item><item><word>informative</word><trans><![CDATA[adj. 教育性的,有益的;情报的;见闻广博的 informative: 告知性的 | 使知道消息的 | 有益的]]></trans><phonetic><![CDATA[[in'fɔ:mətiv]]]></phonetic><tags/><progress>10</progress></item><item><word>contribute</word><trans><![CDATA[vt. 贡献,出力;投稿;捐献 vt. 贡献,出力;投稿;捐献 contribute: 贡献 | 捐助 | 做出贡献]]></trans><phonetic><![CDATA[[kən'tribju:t]]]></phonetic><tags/><progress>10</progress></item>
import java.io.BufferedInputStream;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileWriter;import java.io.IOException;import java.io.InputStreamReader;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import org.dom4j.Document;import org.dom4j.DocumentException;import org.dom4j.Element;import org.dom4j.Node;import org.dom4j.io.SAXReader;public class WordNewMain {/** * @param args * @throws DocumentException * @throws IOException */public static void main(String[] args) throws DocumentException, IOException {Map<String, Word> listMap = new HashMap<String, Word>();SAXReader saxReader = new SAXReader();saxReader.setEncoding("utf-8");Document whole1Xml = saxReader.read(new BufferedInputStream(new FileInputStream("all_sych.xml")));List<Element> whole1List = whole1Xml.selectNodes("//wordbook/item");System.out.println("whole1 List Size:" + whole1List.size());for (int i = 0; i < whole1List.size(); i++) {Element e = whole1List.get(i);Node word = e.selectSingleNode("word");Node trans = e.selectSingleNode("trans");Node phonetic = e.selectSingleNode("phonetic");Node tags = e.selectSingleNode("tags");Node progress = e.selectSingleNode("progress");Word w = listMap.get(word.getStringValue());if (w != null && Integer.parseInt(w.getProgress()) < Integer.parseInt(progress.getStringValue())) {w.setProgress(progress.getStringValue());} else if (w == null) {e.detach();w = new Word(word.getStringValue(), trans.getStringValue(), phonetic.getStringValue(),tags.getStringValue(), progress.getStringValue());}listMap.put(word.getStringValue().toLowerCase(), w);}// txtBufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File("8word.txt"))));System.out.println("Ok, find the file!");String line = null;byte[] wordB = new byte[30];Map<String, String> countArea = new HashMap<String, String>();int wordBP = 0;String theWord = null;System.out.println("Start count~");FileWriter 
fw = new FileWriter("result.txt");while ((line = br.readLine()) != null) {boolean inWord = true;byte[] lineB = line.getBytes();for (int i = 0; i < lineB.length; i++) {// is a characterif ((lineB[i] < 91 && lineB[i] > 64) || (lineB[i] < 123 && lineB[i] > 96)) {wordB[wordBP] = lineB[i];wordBP = wordBP + 1;inWord = true;} else if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}br.close();// steven txtbr = new BufferedReader(new InputStreamReader(new FileInputStream(new File("steve_4p.txt"))));System.out.println("Ok, find the file!");while ((line = br.readLine()) != null) {boolean inWord = true;byte[] lineB = line.getBytes();for (int i = 0; i < lineB.length; i++) {// is a characterif ((lineB[i] < 91 && lineB[i] > 64) || (lineB[i] < 123 && lineB[i] > 96)) {wordB[wordBP] = lineB[i];wordBP = wordBP + 1;inWord = true;} else if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}// GRE textbr = new BufferedReader(new InputStreamReader(new FileInputStream(new File("gre.txt"))));System.out.println("Ok, find the file!");while ((line = br.readLine()) != null) {boolean inWord = true;byte[] lineB = line.getBytes();for (int i = 0; i < lineB.length; i++) {// is a characterif ((lineB[i] < 91 && lineB[i] > 64) || (lineB[i] < 123 && lineB[i] > 96)) {wordB[wordBP] = lineB[i];wordBP = wordBP + 
1;inWord = true;} else if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}// GaoZhong textbr = new BufferedReader(new InputStreamReader(new FileInputStream(new File("gz.txt"))));System.out.println("Ok, find the file!");while ((line = br.readLine()) != null) {boolean inWord = true;byte[] lineB = line.getBytes();for (int i = 0; i < lineB.length; i++) {// is a characterif ((lineB[i] < 91 && lineB[i] > 64) || (lineB[i] < 123 && lineB[i] > 96)) {wordB[wordBP] = lineB[i];wordBP = wordBP + 1;inWord = true;} else if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}if (inWord) {theWord = new String(wordB).trim().toLowerCase();if (listMap.get(theWord) == null && theWord.length() > 1) {countArea.put(theWord, theWord);}wordBP = 0;inWord = false;wordB = new byte[30];}}// outputIterator<String> it = countArea.keySet().iterator();while (it.hasNext()) {fw.write(it.next() + "\r\n");}fw.close();System.out.println("End count~");System.out.println("Sum word of steve is :" + countArea.size());}}