Skip to content

Commit

Permalink
#46 Counts row written correctly when distinct enabled, both for csv …
Browse files Browse the repository at this point in the history
…and regular sort
  • Loading branch information
julianSelser committed Apr 4, 2023
1 parent 54bd7b4 commit 16fb3e0
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 25 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import com.google.code.externalsorting.ExternalSort;
//... inputfile: input file name
//... outputfile: output file name
// next command sorts the lines from inputfile to outputfile
ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile));
long numLinesWritten = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile));
// you can also provide a custom string comparator, see API
```

Expand Down Expand Up @@ -56,7 +56,7 @@ ArrayList<CSVRecord> header = new ArrayList<CSVRecord>();
// next two lines sort the lines from inputfile to outputfile
List<File> sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header);
// at this point you can access header if you'd like.
CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
int numWrittenLines = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);

```

Expand Down
14 changes: 7 additions & 7 deletions src/main/java/com/google/code/externalsorting/ExternalSort.java
Original file line number Diff line number Diff line change
Expand Up @@ -226,15 +226,15 @@ public int compare(IOStringStack i,
pq.add(bfb);
}
}
long rowcounter = 0;
long numLinesWritten = 0;
try {
if (!distinct) {
while (pq.size() > 0) {
IOStringStack bfb = pq.poll();
String r = bfb.pop();
fbw.write(r);
fbw.newLine();
++rowcounter;
++numLinesWritten;
if (bfb.empty()) {
bfb.close();
} else {
Expand All @@ -248,7 +248,7 @@ public int compare(IOStringStack i,
lastLine = bfb.pop();
fbw.write(lastLine);
fbw.newLine();
++rowcounter;
++numLinesWritten;
if (bfb.empty()) {
bfb.close();
} else {
Expand All @@ -263,8 +263,8 @@ public int compare(IOStringStack i,
fbw.write(r);
fbw.newLine();
lastLine = r;
++numLinesWritten;
}
++rowcounter;
if (bfb.empty()) {
bfb.close();
} else {
Expand All @@ -278,7 +278,7 @@ public int compare(IOStringStack i,
bfb.close();
}
}
return rowcounter;
return numLinesWritten;

}

Expand Down Expand Up @@ -460,11 +460,11 @@ public static long mergeSortedFiles(List<File> files, BufferedWriter fbw,
BinaryFileBuffer bfb = new BinaryFileBuffer(br);
bfbs.add(bfb);
}
long rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
long numLinesWritten = mergeSortedFiles(fbw, cmp, distinct, bfbs);
for (File f : files) {
f.delete();
}
return rowcounter;
return numLinesWritten;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) {
for (CSVRecordBuffer bfb : bfbs)
if (!bfb.empty())
pq.add(bfb);
int rowcounter = 0;
int numWrittenLines = 0;
CSVPrinter printer = new CSVPrinter(fbw, sortOptions.getFormat());
if(! sortOptions.isSkipHeader()) {
for(CSVRecord r: header) {
Expand All @@ -98,8 +98,8 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) {
} else {
printer.printRecord(r);
lastLine = r;
++numWrittenLines;
}
++rowcounter;
if (bfb.empty()) {
bfb.close();
} else {
Expand All @@ -113,7 +113,7 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) {
bfb.close();
}

return rowcounter;
return numWrittenLines;
}

public static int mergeSortedFiles(List<File> files, File outputfile, final CsvSortOptions sortOptions,
Expand All @@ -131,14 +131,14 @@ public static int mergeSortedFiles(List<File> files, File outputfile, final CsvS
BufferedWriter fbw = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(outputfile, append), sortOptions.getCharset()));

int rowcounter = mergeSortedFiles(fbw, sortOptions, bfbs, header);
int numWrittenLines = mergeSortedFiles(fbw, sortOptions, bfbs, header);
for (File f : files) {
if (!f.delete()) {
LOG.log(Level.WARNING, String.format("The file %s was not deleted", f.getName()));
}
}

return rowcounter;
return numWrittenLines;
}

public static List<File> sortInBatch(long size_in_byte, final BufferedReader fbr, final File tmpdirectory,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.google.code.externalsorting;

import static com.google.code.externalsorting.ExternalSort.defaultcomparator;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
Expand Down Expand Up @@ -206,15 +207,17 @@ public int compare(String o1, String o2) {
};
File out = File.createTempFile("test_results", ".tmp", null);
out.deleteOnExit();
ExternalSort.mergeSortedFiles(this.fileList, out, cmp,
Charset.defaultCharset(), true);
long numLinesWritten = ExternalSort.mergeSortedFiles(this.fileList, out, cmp,
Charset.defaultCharset(), true);

List<String> result = new ArrayList<>();
try (BufferedReader bf = new BufferedReader(new FileReader(out))) {
while ((line = bf.readLine()) != null) {
result.add(line);
}
}

assertEquals(11, numLinesWritten);
assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_DISTINCT_RESULTS,
result.toArray());
}
Expand Down Expand Up @@ -399,9 +402,22 @@ public static void writeStringToFile(File f, String s) throws IOException {
public void sortVeryLargeFile() throws IOException {
final Path veryLargeFile = getTestFile();
final Path outputFile = Files.createTempFile("Merged-File", ".tmp");
final long sortedLines = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
final long numLinesWritten = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
final long expectedLines = 2148L * 1000000L;
assertEquals(expectedLines, sortedLines);
assertEquals(expectedLines, numLinesWritten);
}

/**
 * Sorts the file supplied by {@code getTestFile()} in batches, merges the batches with
 * distinct mode switched on, and checks the line count reported by the merge.
 *
 * @throws IOException if creating the temporary output file, sorting, or merging fails
 */
@Ignore("This test takes too long to execute")
@Test
public void sortVeryLargeFileWhenDistinctEnabled() throws IOException {
    final File input = getTestFile().toFile();
    final File merged = Files.createTempFile("Merged-File", ".tmp").toFile();
    final List<File> sortedBatches = ExternalSort.sortInBatch(input);

    // Last argument enables distinct mode for the merge.
    final long written = ExternalSort.mergeSortedFiles(sortedBatches, merged, defaultcomparator, true);

    // Exactly one written line is expected; presumably every line of the
    // test file is identical -- confirm against getTestFile().
    assertEquals(1L, written);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
Expand All @@ -21,6 +20,7 @@
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;


public class CsvExternalSortTest {
Expand Down Expand Up @@ -97,7 +97,7 @@ public void testIssue44() throws Exception {
List<File> sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header);


int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);

List<String> lines = Files.readAllLines(Paths.get(outputfile.getPath()), StandardCharsets.UTF_8);
for(String a : lines) {
Expand Down Expand Up @@ -133,9 +133,9 @@ public void testNonLatin() throws Exception {

assertEquals(1, sortInBatch.size());

int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);

assertEquals(5, mergeSortedFiles);
assertEquals(5, numLinesWritten);

List<String> lines = Files.readAllLines(Paths.get(outputfile.getPath()), StandardCharsets.UTF_8);

Expand Down Expand Up @@ -175,9 +175,9 @@ public void testCVSFormat() throws Exception {

assertEquals(1, sortInBatch.size());

int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, false, header);
int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, false, header);

assertEquals(4, mergeSortedFiles);
assertEquals(4, numLinesWritten);

List<String> lines = Files.readAllLines(outputfile.toPath());

Expand Down Expand Up @@ -210,7 +210,7 @@ public void testMultiLineFileWthHeader() throws IOException, ClassNotFoundExcept

assertEquals(1, sortInBatch.size());

int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);

List<String> lines = Files.readAllLines(outputfile.toPath(), sortOptions.getCharset());

Expand All @@ -223,6 +223,43 @@ public void testMultiLineFileWthHeader() throws IOException, ClassNotFoundExcept

}

/**
 * Checks the record count returned by {@code CsvExternalSort.mergeSortedFiles} when
 * distinct mode is enabled, and that the first two lines of the output differ.
 *
 * @throws IOException if sorting, merging, or reading the output file fails
 * @throws ClassNotFoundException declared on the test signature; presumably propagated
 *         from the CSV sorting API -- confirm
 */
@Test
public void testNumLinesWrittenIfDistinctEnabled() throws IOException, ClassNotFoundException {
    boolean distinctEnabled = true;
    String path = this.getClass().getClassLoader().getResource(FILE_CSV).getPath();
    File file = new File(path);
    outputfile = new File("outputSort1.csv");

    // Records are ordered by their first column.
    Comparator<CSVRecord> comparator = Comparator.comparing(op -> op.get(0));

    CsvSortOptions sortOptions = new CsvSortOptions
            .Builder(comparator, CsvExternalSort.DEFAULTMAXTEMPFILES, CsvExternalSort.estimateAvailableMemory())
            .charset(Charset.defaultCharset())
            .distinct(distinctEnabled)
            .numHeader(1)
            .skipHeader(true)
            .format(CSVFormat.DEFAULT)
            .build();
    ArrayList<CSVRecord> header = new ArrayList<CSVRecord>();

    List<File> sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header);

    int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);

    // try-with-resources guarantees the reader is closed even when an assertion
    // fails, so the output file is never left with a dangling open handle.
    try (BufferedReader reader = new BufferedReader(new FileReader(outputfile))) {
        assertEquals(1, sortInBatch.size());
        assertEquals(3, numLinesWritten);

        String firstLine = reader.readLine();
        assertEquals("6,this wont work in other systems,3", firstLine);

        // With distinct enabled, the second line must not repeat the first.
        String secondLine = reader.readLine();
        assertNotEquals(firstLine, secondLine);
    }
}

@After
public void onTearDown() {
if(outputfile.exists()) {
Expand Down

0 comments on commit 16fb3e0

Please sign in to comment.