diff --git a/README.md b/README.md index cab5d30..3757fd9 100755 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ import com.google.code.externalsorting.ExternalSort; //... inputfile: input file name //... outputfile: output file name // next command sorts the lines from inputfile to outputfile -ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile)); +long numLinesWritten = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile)); // you can also provide a custom string comparator, see API ``` @@ -56,7 +56,7 @@ ArrayList header = new ArrayList(); // next two lines sort the lines from inputfile to outputfile List sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header); // at this point you can access header if you'd like. -CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); +int numWrittenLines = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); ``` diff --git a/src/main/java/com/google/code/externalsorting/ExternalSort.java b/src/main/java/com/google/code/externalsorting/ExternalSort.java index f0a8f10..c96e161 100644 --- a/src/main/java/com/google/code/externalsorting/ExternalSort.java +++ b/src/main/java/com/google/code/externalsorting/ExternalSort.java @@ -226,7 +226,7 @@ public int compare(IOStringStack i, pq.add(bfb); } } - long rowcounter = 0; + long numLinesWritten = 0; try { if (!distinct) { while (pq.size() > 0) { @@ -234,7 +234,7 @@ public int compare(IOStringStack i, String r = bfb.pop(); fbw.write(r); fbw.newLine(); - ++rowcounter; + ++numLinesWritten; if (bfb.empty()) { bfb.close(); } else { @@ -248,7 +248,7 @@ public int compare(IOStringStack i, lastLine = bfb.pop(); fbw.write(lastLine); fbw.newLine(); - ++rowcounter; + ++numLinesWritten; if (bfb.empty()) { bfb.close(); } else { @@ -263,8 +263,8 @@ public int compare(IOStringStack i, fbw.write(r); fbw.newLine(); lastLine = r; + 
++numLinesWritten; } - ++rowcounter; if (bfb.empty()) { bfb.close(); } else { @@ -278,7 +278,7 @@ public int compare(IOStringStack i, bfb.close(); } } - return rowcounter; + return numLinesWritten; } @@ -460,11 +460,11 @@ public static long mergeSortedFiles(List files, BufferedWriter fbw, BinaryFileBuffer bfb = new BinaryFileBuffer(br); bfbs.add(bfb); } - long rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs); + long numLinesWritten = mergeSortedFiles(fbw, cmp, distinct, bfbs); for (File f : files) { f.delete(); } - return rowcounter; + return numLinesWritten; } /** diff --git a/src/main/java/com/google/code/externalsorting/csv/CsvExternalSort.java b/src/main/java/com/google/code/externalsorting/csv/CsvExternalSort.java index a8afaff..0bc62bb 100755 --- a/src/main/java/com/google/code/externalsorting/csv/CsvExternalSort.java +++ b/src/main/java/com/google/code/externalsorting/csv/CsvExternalSort.java @@ -81,7 +81,7 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) { for (CSVRecordBuffer bfb : bfbs) if (!bfb.empty()) pq.add(bfb); - int rowcounter = 0; + int numWrittenLines = 0; CSVPrinter printer = new CSVPrinter(fbw, sortOptions.getFormat()); if(! 
sortOptions.isSkipHeader()) { for(CSVRecord r: header) { @@ -98,8 +98,8 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) { } else { printer.printRecord(r); lastLine = r; + ++numWrittenLines; } - ++rowcounter; if (bfb.empty()) { bfb.close(); } else { @@ -113,7 +113,7 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) { bfb.close(); } - return rowcounter; + return numWrittenLines; } public static int mergeSortedFiles(List files, File outputfile, final CsvSortOptions sortOptions, @@ -131,14 +131,14 @@ public static int mergeSortedFiles(List files, File outputfile, final CsvS BufferedWriter fbw = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outputfile, append), sortOptions.getCharset())); - int rowcounter = mergeSortedFiles(fbw, sortOptions, bfbs, header); + int numWrittenLines = mergeSortedFiles(fbw, sortOptions, bfbs, header); for (File f : files) { if (!f.delete()) { LOG.log(Level.WARNING, String.format("The file %s was not deleted", f.getName())); } } - return rowcounter; + return numWrittenLines; } public static List sortInBatch(long size_in_byte, final BufferedReader fbr, final File tmpdirectory, diff --git a/src/test/java/com/google/code/externalsorting/ExternalSortTest.java b/src/test/java/com/google/code/externalsorting/ExternalSortTest.java index 528dc74..e3d71d7 100755 --- a/src/test/java/com/google/code/externalsorting/ExternalSortTest.java +++ b/src/test/java/com/google/code/externalsorting/ExternalSortTest.java @@ -1,5 +1,6 @@ package com.google.code.externalsorting; +import static com.google.code.externalsorting.ExternalSort.defaultcomparator; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -206,8 +207,8 @@ public int compare(String o1, String o2) { }; File out = File.createTempFile("test_results", ".tmp", null); out.deleteOnExit(); - ExternalSort.mergeSortedFiles(this.fileList, out, cmp, - Charset.defaultCharset(), 
true); + long numLinesWritten = ExternalSort.mergeSortedFiles(this.fileList, out, cmp, + Charset.defaultCharset(), true); List result = new ArrayList<>(); try (BufferedReader bf = new BufferedReader(new FileReader(out))) { @@ -215,6 +216,8 @@ public int compare(String o1, String o2) { result.add(line); } } + + assertEquals(11, numLinesWritten); assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_DISTINCT_RESULTS, result.toArray()); } @@ -399,9 +402,22 @@ public static void writeStringToFile(File f, String s) throws IOException { public void sortVeryLargeFile() throws IOException { final Path veryLargeFile = getTestFile(); final Path outputFile = Files.createTempFile("Merged-File", ".tmp"); - final long sortedLines = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile()); + final long numLinesWritten = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile()); final long expectedLines = 2148L * 1000000L; - assertEquals(expectedLines, sortedLines); + assertEquals(expectedLines, numLinesWritten); + } + + @Ignore("This test takes too long to execute") + @Test + public void sortVeryLargeFileWhenDistinctEnabled() throws IOException { + boolean distinctEnabled = true; + final Path veryLargeFile = getTestFile(); + final File outputFile = Files.createTempFile("Merged-File", ".tmp").toFile(); + List veryLargeSortBatch = ExternalSort.sortInBatch(veryLargeFile.toFile()); + + long numLinesWritten = ExternalSort.mergeSortedFiles(veryLargeSortBatch, outputFile, defaultcomparator, distinctEnabled); + + assertEquals(1 /* 😸 */, numLinesWritten); } /** diff --git a/src/test/java/com/google/code/externalsorting/csv/CsvExternalSortTest.java b/src/test/java/com/google/code/externalsorting/csv/CsvExternalSortTest.java index 4ba51c4..f94cada 100755 --- a/src/test/java/com/google/code/externalsorting/csv/CsvExternalSortTest.java +++ 
b/src/test/java/com/google/code/externalsorting/csv/CsvExternalSortTest.java @@ -9,7 +9,6 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.lang.reflect.Field; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -21,6 +20,7 @@ import java.util.Map; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; public class CsvExternalSortTest { @@ -97,7 +97,7 @@ public void testIssue44() throws Exception { List sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header); - int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); + CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); List lines = Files.readAllLines(Paths.get(outputfile.getPath()), StandardCharsets.UTF_8); for(String a : lines) { @@ -133,9 +133,9 @@ public void testNonLatin() throws Exception { assertEquals(1, sortInBatch.size()); - int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); + int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); - assertEquals(5, mergeSortedFiles); + assertEquals(5, numLinesWritten); List lines = Files.readAllLines(Paths.get(outputfile.getPath()), StandardCharsets.UTF_8); @@ -175,9 +175,9 @@ public void testCVSFormat() throws Exception { assertEquals(1, sortInBatch.size()); - int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, false, header); + int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, false, header); - assertEquals(4, mergeSortedFiles); + assertEquals(4, numLinesWritten); List lines = Files.readAllLines(outputfile.toPath()); @@ -210,7 +210,7 @@ public void testMultiLineFileWthHeader() throws IOException, ClassNotFoundExcept assertEquals(1, 
sortInBatch.size()); - int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); + CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); List lines = Files.readAllLines(outputfile.toPath(), sortOptions.getCharset()); @@ -223,6 +223,43 @@ public void testMultiLineFileWthHeader() throws IOException, ClassNotFoundExcept } + @Test + public void testNumLinesWrittenIfDistinctEnabled() throws IOException, ClassNotFoundException { + boolean distinctEnabled = true; + String path = this.getClass().getClassLoader().getResource(FILE_CSV).getPath(); + File file = new File(path); + outputfile = new File("outputSort1.csv"); + + Comparator comparator = Comparator.comparing(op -> op.get(0)); + + CsvSortOptions sortOptions = new CsvSortOptions + .Builder(comparator, CsvExternalSort.DEFAULTMAXTEMPFILES, CsvExternalSort.estimateAvailableMemory()) + .charset(Charset.defaultCharset()) + .distinct(distinctEnabled) + .numHeader(1) + .skipHeader(true) + .format(CSVFormat.DEFAULT) + .build(); + ArrayList header = new ArrayList(); + + List sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header); + + int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header); + + BufferedReader reader = new BufferedReader(new FileReader(outputfile)); + + assertEquals(1, sortInBatch.size()); + assertEquals(3, numLinesWritten); + + String firstLine = reader.readLine(); + assertEquals("6,this wont work in other systems,3", firstLine); + + String secondLine = reader.readLine(); + assertNotEquals(firstLine, secondLine); + + reader.close(); + } + @After public void onTearDown() { if(outputfile.exists()) {