两个上亿行的大文件取交集
前两天看到一哥们百度面试归来后发的帖子,并分享了其百度面试题,其中有一个题大意如下:
现有两个上亿行的文件,每一行都只有一个数字,求这两个文件的交集。
我的思路如下:首先是分别对这两个文件排序,然后,再同时遍历这两个文件。求出交集即可。
下面我对大文件的排序进行了简单的实现。
基本思路如下,首先对大文件进行拆分,一亿行的大文件可以拆分成10个小文件,并分别对这10个小文件进行排序,之后对这10个有序的小文件进行遍历,比较,再合并成一个大文件即可完成排序。
里面文件的路径为D://bigfile
代码如下。
main函数在bigFile中。
package com.fly.lee.bigfile;import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.util.ArrayList;import java.util.Collections;import java.util.List;public class BigFile {private List<String> subFileNameList = new ArrayList<String>();private String bigFilePath = "D:/bigfile/";private String bigFileName = "Source1.txt";private FileGenerator fGenerator = new FileGenerator();private static final int theMaxNumber = 10000000;private List<Integer> theContentWritingToSubFile = new ArrayList<Integer>(theMaxNumber);public void sortTheBigfile(String destination){splitTheBigFile();SourceFolder sFolder = new SourceFolder();sFolder.setFilenameList(subFileNameList);sFolder.setDestination(destination);try {sFolder.compareFiles();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}removeTheSubFiles();}public void setBigFilePath(String bigFilePath) {this.bigFilePath = bigFilePath;}public void setBigFileName(String bigFileName) {this.bigFileName = bigFileName;}private void removeTheSubFiles() {for(String fileName:subFileNameList){File file = new File(fileName);if(file.exists()&&file.canWrite()){System.out.println(fileName+":"+file.delete());;}}}public void splitTheBigFile(){System.out.println("begin to spliting the big files...");int counter = 0;File file = new File(bigFilePath+bigFileName); BufferedReader reader = null; FileReader fReader; int fileFlag = 1; try { fReader = new FileReader(file); reader = new BufferedReader(fReader); String tempString = null; while ((tempString = reader.readLine()) != null) { if(tempString.trim().equals("")){ break; } int tempInt = Integer.valueOf(tempString); theContentWritingToSubFile.add(tempInt); if(isTheListFull(counter)){ handleTheFullList(fileFlag); theContentWritingToSubFile.clear(); fileFlag ++; } } handleTheFullList(fileFlag); fReader.close(); reader.close(); System.out.println("finishing the spliting work."); }catch(Exception e){ e.printStackTrace(); }}private void handleTheFullList(int fileFlag) throws Exception {System.out.println("handle the full list...");String tempFilePath = bigFilePath + fileFlag + ".txt";writeTheContentToSubFile(tempFilePath);subFileNameList.add(tempFilePath);theContentWritingToSubFile.clear();System.out.println("the full list is clear now...");}private void writeTheContentToSubFile(String tempFilePath) throws Exception {System.out.println("begin to write the content to sub file...");System.out.println("sub file path:"+tempFilePath);Collections.sort(theContentWritingToSubFile);fGenerator.setFileName(tempFilePath);fGenerator.setTheListNeedToWriteTofile(theContentWritingToSubFile);fGenerator.writeToFileFromTheList(true);System.out.println("finishing this loop of writing the content to sub file...");}private boolean isTheListFull(int counter) {return theContentWritingToSubFile.size() >= theMaxNumber; }public static void main(String[] args) {BigFile bf = new BigFile();bf.setBigFileName("Source1.txt");bf.sortTheBigfile("D:/bigfile/Source1_sorted.txt");}}package com.fly.lee.bigfile;import java.io.BufferedReader;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;public class SourceFolder {private List<String> filenameList;private Map<Integer, Integer> numberCache;private List<BufferedReader> bufferedReaderList;private int endFlagNumber = 0;private List<Integer> contentListWritingToFile;private FileGenerator fileGenerator; private String destination = "D:/bigfile/AllSource.txt";public SourceFolder() {contentListWritingToFile = new ArrayList<Integer>();filenameList = new ArrayList<String>();bufferedReaderList = new ArrayList<BufferedReader>();fileGenerator = new FileGenerator();}public void setDestination(String destination) {this.destination = destination;}public void addFile(String filename) {this.filenameList.add(filename);}public void setFilenameList(List<String> filenameList) {this.filenameList = filenameList;}public void compareFiles() throws Exception {System.out.println("filenameList:"+filenameList);initTheBufferedReaderList();initTheNumberCache();while(!isAllFilesFinishTheComparing()){int theIndexOfReaderNeedingToMove = findTheLastIndexOfTheMinNumber();addTheNumberToFile(theIndexOfReaderNeedingToMove);updateNumberCache(theIndexOfReaderNeedingToMove);}addTheLastListToFile();closeAllIOStreams();}private void closeAllIOStreams() throws IOException {for(BufferedReader bReader:bufferedReaderList){if(bReader != null){bReader.close();}}}private int findTheLastIndexOfTheMinNumber() {int theIndexOfTheMinNumber = 0;int mixNumber = getTheFirstNumberFromTheCache();for (int index = 0; index < numberCache.size(); index++) {if(numberCache.get(index) == null){continue;}int theNumberWillBeCompared = numberCache.get(index);if (mixNumber >= theNumberWillBeCompared) {mixNumber = theNumberWillBeCompared;theIndexOfTheMinNumber = index;}}return theIndexOfTheMinNumber;}private int getTheFirstNumberFromTheCache() {for (int index = 0; index < numberCache.size(); index++) {if(numberCache.get(index) == null){continue;}return numberCache.get(index);}return 0;}private void addTheNumberToFile(int theIndexOfReaderNeedingToMove) throws Exception {contentListWritingToFile.add(numberCache.get(theIndexOfReaderNeedingToMove));if(contentListWritingToFile.size() == 1000000){fileGenerator.setTheListNeedToWriteTofile(contentListWritingToFile);fileGenerator.setFileName(destination);fileGenerator.writeToFileFromTheList( false);contentListWritingToFile.clear();}}private void updateNumberCache(int index) throws Exception {BufferedReader bufferedReader = bufferedReaderList.get(index);String tempString = null;if ((tempString = bufferedReader.readLine()) != null) {if (!"".equals(tempString.trim())) {int tempInt = Integer.valueOf(tempString);numberCache.put(index, tempInt);}} else {numberCache.put(index, null);endFlagNumber ++;}}private void addTheLastListToFile() throws Exception {fileGenerator.setTheListNeedToWriteTofile(contentListWritingToFile);fileGenerator.setFileName(destination);fileGenerator.writeToFileFromTheList(false);}private void initTheBufferedReaderList() throws FileNotFoundException {System.out.println("begin to initial the buffered reader...");for (String filename : filenameList) {BufferedReader bufferedReader = new BufferedReader(new FileReader(filename));bufferedReaderList.add(bufferedReader);}System.out.println("finish initialing the buffered reader...");}private void initTheNumberCache() throws Exception {System.out.println("begin to initial the number cache...");numberCache = new HashMap<Integer, Integer>(filenameList.size());for (int index = 0; index < filenameList.size(); index++) {updateNumberCache(index);}System.out.println("finish initialing the number cache...");}private boolean isAllFilesFinishTheComparing() {return endFlagNumber == filenameList.size();}}package com.fly.lee.bigfile;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.util.ArrayList;import java.util.List;public class FileGenerator {private int wholeLineNumber = 100000000;private List<Integer> theListNeedToWriteTofile;private String fileName;private FileWriter fWriter = null;//public void writepublic static void main(String[] args) {String fileName = "D:/bigfile/Source_yiwan.txt";FileGenerator fileGenerator = new FileGenerator();fileGenerator.setFileName(fileName);try {fileGenerator.createRandomFile();fileGenerator.closeFileWriter();} catch (Exception e) {e.printStackTrace();}}public void setFileName(String fileName) {this.fileName = fileName;}public void setWholeLineNumber(int wholeLineNumber) {this.wholeLineNumber = wholeLineNumber;}public void setTheListNeedToWriteTofile(List<Integer> theListNeedToWriteTofile) {this.theListNeedToWriteTofile = theListNeedToWriteTofile;}public void createRandomFile() throws Exception {int listSize = 10000000;theListNeedToWriteTofile = new ArrayList<Integer>(listSize);for(int i = 0; i < wholeLineNumber; i ++){int tempRandomInt = (int)(Math.random()*100000000);theListNeedToWriteTofile.add(tempRandomInt);if(theListNeedToWriteTofile.size()==listSize){System.out.println("begin to write to the file...");writeToFileFromTheList(false);theListNeedToWriteTofile.clear();System.out.println("finish this loop...");}}writeToFileFromTheList(false);}public void writeToFileFromTheList(boolean isNeedToCreateNewFile) throws Exception {System.out.println("write the content to file...");try {if(isNeedToCreateNewFile){createNewFile(fileName);}StringBuilder contentWritingToFile = new StringBuilder();int cycleLineNumber = 1000000;int counter = 0;fWriter = new FileWriter(fileName,true);for(int index = 0; index < theListNeedToWriteTofile.size(); index ++){int tempRandomInt = theListNeedToWriteTofile.get(index);contentWritingToFile.append(tempRandomInt+"\n");if(counter == cycleLineNumber){fWriter.append(contentWritingToFile.toString());counter = 0;contentWritingToFile = new StringBuilder();}}//while(theListNeedToWriteTofile.size() != 0){//int tempRandomInt = theListNeedToWriteTofile.remove(0);//contentWritingToFile.append(tempRandomInt+"\n");//if(counter == cycleLineNumber){//fWriter.append(contentWritingToFile.toString());//counter = 0;//contentWritingToFile = new StringBuilder();//}//}fWriter.append(contentWritingToFile.toString());System.out.println("done..........");} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();} finally{closeFileWriter();}}private void closeFileWriter() throws IOException{if(fWriter != null){fWriter.close();}}private void createNewFile(String fileName) throws IOException {File file = new File(fileName);if(file.delete()){file.createNewFile();}}}