Skip to content

Commit

Permalink
[SPARK-43066][SQL] Add test for dropDuplicates in JavaDatasetSuite
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR proposes to add a test for dropDuplicates in JavaDatasetSuite.

### Why are the changes needed?

The dropDuplicates API wasn't tested by the Java test suite. It'd be better to have a sanity check to verify that the inter-op between Scala and Java works well.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI will verify.

Closes apache#40702 from HeartSaVioR/SPARK-43066.

Authored-by: Jungtaek Lim <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
HeartSaVioR authored and HyukjinKwon committed Apr 8, 2023
1 parent 193deed commit d8b720a
Showing 1 changed file with 71 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.sql.Timestamp;
import java.time.*;
import java.util.*;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;

import org.apache.spark.api.java.Optional;
Expand Down Expand Up @@ -535,6 +536,76 @@ public void testJoin() {
joined.collectAsList());
}

/**
 * Orders (String, Integer) tuples by their first element, falling back to the
 * second element when the first elements compare as equal.
 */
private static final Comparator<Tuple2<String, Integer>> comparatorStringAndIntTuple =
  (left, right) -> {
    int byFirst = left._1.compareTo(right._1);
    if (byFirst == 0) {
      // First elements tie, so the second element decides the order.
      return left._2.compareTo(right._2);
    }
    return byFirst;
  };

/**
 * Asserts that two lists hold the same tuples, ignoring element order.
 *
 * Each list is copied and the copy is sorted with comparatorStringAndIntTuple
 * before the comparison, so the lists passed in are not modified.
 *
 * @param expected the tuples the caller expects, in any order
 * @param actual the tuples that were actually produced, in any order
 */
private void assertEqualsUnorderly(
    List<Tuple2<String, Integer>> expected,
    List<Tuple2<String, Integer>> actual) {
  List<Tuple2<String, Integer>> sortedExpected = new ArrayList<>(expected);
  sortedExpected.sort(comparatorStringAndIntTuple);

  List<Tuple2<String, Integer>> sortedActual = new ArrayList<>(actual);
  sortedActual.sort(comparatorStringAndIntTuple);

  Assert.assertEquals(sortedExpected, sortedActual);
}

/**
 * Exercises Dataset.dropDuplicates from Java: the no-argument form, the
 * varargs column-name form, and the String[] column-name form. Results are
 * compared without regard to row order.
 */
@Test
public void testDropDuplicates() {
  // Four input rows; ("a", 1) appears twice.
  List<Tuple2<String, Integer>> data = Arrays.asList(
    new Tuple2<>("a", 1), new Tuple2<>("a", 2),
    new Tuple2<>("b", 1), new Tuple2<>("a", 1)
  );
  Dataset<Tuple2<String, Integer>> ds = spark.createDataset(data,
    Encoders.tuple(Encoders.STRING(), Encoders.INT()));

  // Expected rows when both columns take part in the duplicate check.
  List<Tuple2<String, Integer>> distinctRows =
    Arrays.asList(tuple2("a", 1), tuple2("a", 2), tuple2("b", 1));
  // Expected rows when only column "_1" is considered.
  List<Tuple2<String, Integer>> onePerFirstColumn =
    Arrays.asList(tuple2("a", 1), tuple2("b", 1));
  // Expected rows when only column "_2" is considered.
  List<Tuple2<String, Integer>> onePerSecondColumn =
    Arrays.asList(tuple2("a", 1), tuple2("a", 2));

  // No column names given.
  assertEqualsUnorderly(distinctRows, ds.dropDuplicates().collectAsList());

  // Column "_1" only: varargs form, then array form.
  assertEqualsUnorderly(onePerFirstColumn, ds.dropDuplicates("_1").collectAsList());
  assertEqualsUnorderly(
    onePerFirstColumn,
    ds.dropDuplicates(new String[] { "_1" }).collectAsList());

  // Column "_2" only: varargs form, then array form.
  assertEqualsUnorderly(onePerSecondColumn, ds.dropDuplicates("_2").collectAsList());
  assertEqualsUnorderly(
    onePerSecondColumn,
    ds.dropDuplicates(new String[] { "_2" }).collectAsList());

  // Both columns named explicitly: expected to match the no-argument result.
  assertEqualsUnorderly(distinctRows, ds.dropDuplicates("_1", "_2").collectAsList());
  assertEqualsUnorderly(
    distinctRows,
    ds.dropDuplicates(new String[] { "_1", "_2" }).collectAsList());
}

@Test
public void testTupleEncoder() {
Encoder<Tuple2<Integer, String>> encoder2 = Encoders.tuple(Encoders.INT(), Encoders.STRING());
Expand Down

0 comments on commit d8b720a

Please sign in to comment.