// 找出两个大文件中数据不同部分 (Find the records that differ between two large files)
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Finds the keys that appear in only one of two large files.
 *
 * <p>Each large file is first partitioned into {@link #CUTTED_FILE_NUM} bucket
 * files by the hash of the key of every line, so equal keys from both inputs
 * end up in bucket files with the same name. The bucket pairs are then
 * compared one at a time and the per-bucket differences are merged.
 *
 * <p>I/O failures are reported with {@code printStackTrace()} and processing
 * continues with whatever data could be read; no method here throws a checked
 * exception.
 */
public class TestBigData {

    /** Number of bucket files each large file is split into. */
    private static final int CUTTED_FILE_NUM = 30;

    /** Extension appended to every bucket file name. */
    private static final String FILE_EXTENSIONS = ".log";

    /** Line terminator written after every key. */
    private static final String NEWLINE = "\r\n";

    /** Separator between the key and the rest of an input line. */
    private static final String FIELD_SEPARATOR = "|||";

    /**
     * Splits a large file into bucket files according to the hash of each
     * line's key.
     *
     * <p>Input lines look like
     * {@code 00001016114116820131725061748117041361&4580337030|||112050}; the
     * key is the text before the first {@code |||} (or the whole line when the
     * separator is absent). Only the key is written to the bucket file.
     *
     * <p>Bucket files are named {@code destinationDirPath + index + ".log"} and
     * are opened in append mode, so existing content is kept. A bucket file is
     * created only when at least one key hashes to it. Missing parent
     * directories are created.
     *
     * @param sourceFilePath     path of the large file to split
     * @param destinationDirPath prefix of the bucket files; end it with a path
     *                           separator to place them inside a directory
     */
    public static void hashCutFile(String sourceFilePath, String destinationDirPath) {
        String[] filePath = new String[CUTTED_FILE_NUM];
        for (int i = 0; i < filePath.length; i++) {
            filePath[i] = destinationDirPath + i + FILE_EXTENSIONS;
        }

        // One writer per bucket, opened on first use and kept open until the
        // whole source has been read, instead of reopening the file per line.
        BufferedWriter[] writers = new BufferedWriter[CUTTED_FILE_NUM];
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(new File(sourceFilePath)));
            String line;
            while ((line = br.readLine()) != null) {
                String url = extractKey(line);
                int bucket = bucketOf(url);
                BufferedWriter bw = writers[bucket];
                if (bw == null) {
                    bw = openBucket(filePath[bucket]);
                    writers[bucket] = bw;
                }
                bw.write(url);
                bw.write(NEWLINE);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(br);
            for (int i = 0; i < writers.length; i++) {
                close(writers[i]);
            }
        }
    }

    /**
     * Returns the lines that occur in exactly one of the two files.
     *
     * <p>The result lists the lines of {@code fileA} that are absent from
     * {@code fileB} first (in file order, duplicates kept), followed by the
     * lines of {@code fileB} that are absent from {@code fileA}. A file that
     * cannot be read is reported and treated as if it ended where reading
     * stopped.
     *
     * @param fileA path of the first file
     * @param fileB path of the second file
     * @return the lines that are not common to both files
     */
    public static List<String> findDifference(String fileA, String fileB) {
        List<String> listA = readLines(fileA);
        List<String> listB = readLines(fileB);

        List<String> partialResult = new ArrayList<String>();
        partialResult.addAll(Complement(listA, listB));
        partialResult.addAll(Complement(listB, listA));
        return partialResult;
    }

    /**
     * Returns every line of a file that has no counterpart to compare with,
     * i.e. the whole content counts as "different".
     *
     * @param file path of the file to read
     * @return all lines of the file, in order; empty if the file cannot be opened
     */
    public static List<String> findDifference(String file) {
        return readLines(file);
    }

    /**
     * Computes the intersection of two lists.
     *
     * @param list1 list whose order (and duplicates) the result follows
     * @param list2 list used for the membership test
     * @return the elements of {@code list1} that also occur in {@code list2}
     */
    public static List<String> Intersection(List<String> list1, List<String> list2) {
        Set<String> lookup = new HashSet<String>(list2);
        List<String> common = new ArrayList<String>();
        for (String candidate : list1) {
            if (lookup.contains(candidate)) {
                common.add(candidate);
            }
        }
        return common;
    }

    /**
     * Computes the difference {@code list1 - list2}.
     *
     * @param list1 list whose order (and duplicates) the result follows
     * @param list2 elements to exclude
     * @return the elements of {@code list1} that do not occur in {@code list2}
     */
    public static List<String> Complement(List<String> list1, List<String> list2) {
        Set<String> excluded = new HashSet<String>(list2);
        List<String> remaining = new ArrayList<String>();
        for (String candidate : list1) {
            if (!excluded.contains(candidate)) {
                remaining.add(candidate);
            }
        }
        return remaining;
    }

    /**
     * Merges the differences of all bucket files of two split large files.
     *
     * <p>Bucket files present in both directories are compared pairwise; a
     * bucket file present in only one directory contributes all of its lines.
     * A directory that does not exist or cannot be listed is treated as empty.
     *
     * @param dirAPath directory holding the bucket files of large file A
     * @param dirBPath directory holding the bucket files of large file B
     * @return the set of keys that occur in only one of the two large files
     */
    public static Set<String> mergeDifferenceList(String dirAPath, String dirBPath) {
        File dirA = new File(dirAPath).getAbsoluteFile();
        File dirB = new File(dirBPath).getAbsoluteFile();

        List<String> namesA = listFileNames(dirA);
        List<String> namesB = listFileNames(dirB);

        Set<String> resultSet = new HashSet<String>();

        // Buckets that exist on both sides: compare their contents.
        List<String> sharedNames = Intersection(namesA, namesB);
        for (String name : sharedNames) {
            resultSet.addAll(findDifference(
                    new File(dirA, name).getPath(),
                    new File(dirB, name).getPath()));
        }

        // Buckets that exist on one side only: every key in them is a difference.
        for (String name : Complement(namesA, sharedNames)) {
            resultSet.addAll(findDifference(new File(dirA, name).getPath()));
        }
        for (String name : Complement(namesB, sharedNames)) {
            resultSet.addAll(findDifference(new File(dirB, name).getPath()));
        }
        return resultSet;
    }

    /**
     * Splits two hard-coded sample files, prints every differing key and the
     * elapsed time.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long t1 = System.currentTimeMillis();
        String fileA = "C:/Users/jim/Desktop/big data/sourceFile1.log";
        String fileB = "C:/Users/jim/Desktop/big data/sourceFile2.log";
        String destinationA = "C:/Users/jim/Desktop/big data/destinationA/";
        String destinationB = "C:/Users/jim/Desktop/big data/destinationB/";

        hashCutFile(fileA, destinationA);
        hashCutFile(fileB, destinationB);
        Set<String> hashset = mergeDifferenceList(destinationA, destinationB);
        for (String difference : hashset) {
            System.out.println(difference);
        }
        long t2 = System.currentTimeMillis();
        System.out.println("时间t= " + (t2 - t1) + "ms");
    }

    /** Returns the text before the first separator, or the whole line if there is none. */
    private static String extractKey(String line) {
        // indexOf/substring instead of split()[0]: split() returns an empty
        // array for a line that consists only of separators.
        int end = line.indexOf(FIELD_SEPARATOR);
        return end < 0 ? line : line.substring(0, end);
    }

    /** Maps a key to a bucket index in {@code [0, CUTTED_FILE_NUM)} using {@code String.hashCode()}. */
    private static int bucketOf(String key) {
        int bucket = key.hashCode() % CUTTED_FILE_NUM;
        if (bucket < 0) {
            // hashCode() may be negative, and so may the remainder.
            bucket += CUTTED_FILE_NUM;
        }
        return bucket;
    }

    /** Opens a bucket file for appending, creating its parent directory when missing. */
    private static BufferedWriter openBucket(String path) throws IOException {
        File target = new File(path);
        File parent = target.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }
        return new BufferedWriter(new FileWriter(target, true));
    }

    /** Reads all lines of a file; on an I/O error reports it and returns what was read so far. */
    private static List<String> readLines(String path) {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(path)));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(reader);
        }
        return lines;
    }

    /** Returns the names of the regular files directly inside a directory (empty if it cannot be listed). */
    private static List<String> listFileNames(File dir) {
        List<String> names = new ArrayList<String>();
        File[] entries = dir.listFiles();
        if (entries == null) {
            // listFiles() returns null when dir is not a directory or an I/O error occurs.
            return names;
        }
        for (File entry : entries) {
            if (entry.isFile()) {
                names.add(entry.getName());
            }
        }
        return names;
    }

    /** Closes a resource if it was opened, reporting (not propagating) a failure. */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
?