From 55b75f45257e685a7aff774578f00ef8fe39c55b Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Fri, 28 Jul 2023 10:33:13 +0800
Subject: [PATCH] Add large_data_test mark in pytest; Add a case to test ORC
 writing with lots of nulls (#8825)

Signed-off-by: Chong Gao
---
 integration_tests/README.md                   |  4 +++
 integration_tests/conftest.py                 |  3 ++
 integration_tests/pytest.ini                  |  1 +
 integration_tests/src/main/python/conftest.py |  4 +++
 integration_tests/src/main/python/marks.py    |  1 +
 integration_tests/src/main/python/orc_test.py | 31 ++++++++++++++++++-
 6 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/integration_tests/README.md b/integration_tests/README.md
index bbca167ac8f..0ce52325561 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -398,6 +398,10 @@ properly without it. These tests assume Delta Lake is not configured and are dis
 If Spark has been configured to support Delta Lake then these tests can be enabled by adding the
 `--delta_lake` option to the command.
 
+### Enabling large data tests
+Some tests run against large data sets and take a long time to complete. By default, these tests are disabled.
+They can be enabled by adding the `--large_data_test` option to the command.
+
 ## Writing tests
 
 There are a number of libraries provided to help someone write new tests.
diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
index 8c6d5e16cfd..46875f25e35 100644
--- a/integration_tests/conftest.py
+++ b/integration_tests/conftest.py
@@ -56,3 +56,6 @@ def pytest_addoption(parser):
         "--force_parquet_testing_tests", action="store_true", default=False,
         help="if true forces parquet-testing tests to fail if input data cannot be found"
     )
+    parser.addoption(
+        "--large_data_test", action='store_true', default=False, help="if enabled, runs tests with large data"
+    )
diff --git a/integration_tests/pytest.ini b/integration_tests/pytest.ini
index 20e5ca615c7..29530af0c06 100644
--- a/integration_tests/pytest.ini
+++ b/integration_tests/pytest.ini
@@ -33,5 +33,6 @@ markers =
     iceberg: Mark a test that requires Iceberg has been configured, skipping if tests are not configured for Iceberg
     delta_lake: Mark a test that requires Delta Lake has been configured, skipping if tests are not configured for Delta Lake
     regexp: Mark a test that tests regular expressions on the GPU (only works when UTF-8 is enabled)
+    large_data_test: Mark a test that runs with large data, skipping if large data tests are not enabled
 filterwarnings =
     ignore:.*pytest.mark.order.*:_pytest.warning_types.PytestUnknownMarkWarning
diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py
index d487b27177b..814617420fa 100644
--- a/integration_tests/src/main/python/conftest.py
+++ b/integration_tests/src/main/python/conftest.py
@@ -218,6 +218,10 @@ def pytest_runtest_setup(item):
         if not item.config.getoption('delta_lake'):
             pytest.skip('delta lake tests not configured to run')
 
+    if item.get_closest_marker('large_data_test'):
+        if not item.config.getoption('large_data_test'):
+            pytest.skip('tests for large data not configured to run')
+
 def pytest_configure(config):
     global _runtime_env
     _runtime_env = config.getoption('runtime_env')
diff --git a/integration_tests/src/main/python/marks.py b/integration_tests/src/main/python/marks.py
index c5dc264fb46..9a56b2d1fca 100644
--- a/integration_tests/src/main/python/marks.py
+++ b/integration_tests/src/main/python/marks.py
@@ -30,3 +30,4 @@
 fuzz_test = pytest.mark.fuzz_test
 iceberg = pytest.mark.iceberg
 delta_lake = pytest.mark.delta_lake
+large_data_test = pytest.mark.large_data_test
\ No newline at end of file
diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index 9e3053bde92..4ad9f88cb4d 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -15,7 +15,7 @@
 import pytest
 
 from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \
-    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect
+    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql
 from data_gen import *
 from marks import *
 from pyspark.sql.types import *
@@ -902,3 +902,32 @@ def gen_null_df(spark):
     gpu_file_path = data_path + "/GPU"
     reader = read_orc_df(gpu_file_path)
     assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=all_confs)
+
+@ignore_order
+@large_data_test
+@pytest.mark.parametrize("reader_confs", reader_opt_confs, ids=idfn)
+def test_orc_with_null_column_with_1m_rows(spark_tmp_path, reader_confs):
+    data_path = spark_tmp_path + "/ORC_DATA"
+    all_confs = reader_confs
+    data = [(i, None, None, None, None) for i in range(1000000)]
+    def gen_null_df(spark):
+        return spark.createDataFrame(
+            data,
+            "c1 int, c2 long, c3 float, c4 double, c5 boolean")
+    assert_gpu_and_cpu_writes_are_equal_collect(
+        lambda spark, path: gen_null_df(spark).write.orc(path),
+        lambda spark, path: spark.read.orc(path),
+        data_path,
+        conf=all_confs)
+    cpu_file_path = data_path + "/CPU"
+    sqls = ["SELECT * FROM my_large_table",
+            "SELECT * FROM my_large_table WHERE c2 = 5",
+            "SELECT COUNT(*) FROM my_large_table WHERE c3 IS NOT NULL",
+            "SELECT * FROM my_large_table WHERE c4 IS NULL",
+            "SELECT * FROM my_large_table WHERE c5 IS NULL",
+            ]
+    for sql in sqls:
+        assert_gpu_and_cpu_are_equal_sql(
+            lambda spark: spark.read.orc(cpu_file_path),
+            "my_large_table",
+            sql)
\ No newline at end of file
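
For reference, a minimal self-contained sketch of the option/marker/skip wiring this patch introduces, condensed into a single `conftest.py` with only pytest assumed; the sample test at the bottom is hypothetical, not part of the patch:

```python
# conftest.py -- standalone sketch of the --large_data_test wiring above
import pytest

def pytest_addoption(parser):
    # Opt-in flag; large-data tests stay disabled unless it is passed.
    parser.addoption(
        "--large_data_test", action="store_true", default=False,
        help="if enabled, runs tests with large data")

def pytest_configure(config):
    # Programmatic equivalent of the pytest.ini markers entry.
    config.addinivalue_line(
        "markers", "large_data_test: Mark a test that runs with large data")

def pytest_runtest_setup(item):
    # Skip any marked test when the flag was not given on the command line.
    if item.get_closest_marker("large_data_test"):
        if not item.config.getoption("large_data_test"):
            pytest.skip("tests for large data not configured to run")

# test_example.py (hypothetical)
@pytest.mark.large_data_test
def test_many_rows():
    data = [(i, None) for i in range(1_000_000)]
    assert len(data) == 1_000_000
```

With this wiring, a bare `pytest` run reports `test_many_rows` as skipped, while `pytest --large_data_test` executes it.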