Add large_data_test mark in pytest; Add a case to test ORC writing with lots of nulls (#8825)

Signed-off-by: Chong Gao <[email protected]>
res-life authored Jul 28, 2023
1 parent a215eab commit 55b75f4
Showing 6 changed files with 43 additions and 1 deletion.
4 changes: 4 additions & 0 deletions integration_tests/README.md
@@ -398,6 +398,10 @@ properly without it. These tests assume Delta Lake is not configured and are disabled by default.
If Spark has been configured to support Delta Lake then these tests can be enabled by adding the
`--delta_lake` option to the command.

### Enabling large data tests
Some tests operate on large amounts of data and can take a long time to run, so they are disabled by default.
These tests can be enabled by adding the `--large_data_test` option to the command.
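For example, assuming the tests are launched through pytest (directly, or via the wrapper script this guide describes), a hypothetical invocation is `pytest --large_data_test`.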

## Writing tests

There are a number of libraries provided to help someone write new tests.
3 changes: 3 additions & 0 deletions integration_tests/conftest.py
@@ -56,3 +56,6 @@ def pytest_addoption(parser):
"--force_parquet_testing_tests", action="store_true", default=False,
help="if true forces parquet-testing tests to fail if input data cannot be found"
)
parser.addoption(
"--large_data_test", action='store_true', default=False, help="if enable tests with large data"
)
1 change: 1 addition & 0 deletions integration_tests/pytest.ini
@@ -33,5 +33,6 @@ markers =
    iceberg: Mark a test that requires Iceberg to be configured, skipping if tests are not configured for Iceberg
    delta_lake: Mark a test that requires Delta Lake to be configured, skipping if tests are not configured for Delta Lake
    regexp: Mark a test that tests regular expressions on the GPU (only works when UTF-8 is enabled)
    large_data_test: Mark a test that uses large data, skipping if large data tests are not enabled
filterwarnings =
    ignore:.*pytest.mark.order.*:_pytest.warning_types.PytestUnknownMarkWarning
4 changes: 4 additions & 0 deletions integration_tests/src/main/python/conftest.py
@@ -218,6 +218,10 @@ def pytest_runtest_setup(item):
        if not item.config.getoption('delta_lake'):
            pytest.skip('delta lake tests not configured to run')

    if item.get_closest_marker('large_data_test'):
        if not item.config.getoption('large_data_test'):
            pytest.skip('tests for large data not configured to run')

def pytest_configure(config):
    global _runtime_env
    _runtime_env = config.getoption('runtime_env')
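Taken together, the option registered in `integration_tests/conftest.py` and the skip check above wire a command line flag to a pytest marker. A minimal standalone sketch of the same pattern (simplified and illustrative, not the repository's exact files):

```python
# conftest.py -- sketch of the flag-plus-marker pattern used by this commit
import pytest

def pytest_addoption(parser):
    # Register the opt-in flag; large data tests stay off by default.
    parser.addoption(
        "--large_data_test", action="store_true", default=False,
        help="if true enables tests that use large data")

def pytest_runtest_setup(item):
    # Skip any test carrying the marker unless the flag was passed.
    if item.get_closest_marker('large_data_test'):
        if not item.config.getoption('large_data_test'):
            pytest.skip('tests for large data not configured to run')
```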
1 change: 1 addition & 0 deletions integration_tests/src/main/python/marks.py
@@ -30,3 +30,4 @@
fuzz_test = pytest.mark.fuzz_test
iceberg = pytest.mark.iceberg
delta_lake = pytest.mark.delta_lake
large_data_test = pytest.mark.large_data_test
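With the alias above, a test opts in with a bare decorator. A hypothetical example (the test name and body are invented for illustration):

```python
from marks import large_data_test

@large_data_test
def test_hypothetical_large_scan():
    # Skipped by pytest_runtest_setup unless --large_data_test was given.
    assert True
```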
31 changes: 30 additions & 1 deletion integration_tests/src/main/python/orc_test.py
@@ -15,7 +15,7 @@
import pytest

 from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \
-    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect
+    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql
from data_gen import *
from marks import *
from pyspark.sql.types import *
@@ -902,3 +902,32 @@ def gen_null_df(spark):
    gpu_file_path = data_path + "/GPU"
    reader = read_orc_df(gpu_file_path)
    assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=all_confs)

@ignore_order
@large_data_test
@pytest.mark.parametrize("reader_confs", reader_opt_confs, ids=idfn)
def test_orc_with_null_column_with_1m_rows(spark_tmp_path, reader_confs):
    data_path = spark_tmp_path + "/ORC_DATA"
    all_confs = reader_confs
    data = [(i, None, None, None, None) for i in range(1000000)]
    def gen_null_df(spark):
        return spark.createDataFrame(
            data,
            "c1 int, c2 long, c3 float, c4 double, c5 boolean")
    # Write with both CPU and GPU and verify the round-tripped data matches.
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_null_df(spark).write.orc(path),
        lambda spark, path: spark.read.orc(path),
        data_path,
        conf=all_confs)
    # Re-read the CPU-written files and compare CPU vs GPU query results.
    cpu_file_path = data_path + "/CPU"
    sqls = ["SELECT * FROM my_large_table",
            "SELECT * FROM my_large_table WHERE c2 = 5",
            "SELECT COUNT(*) FROM my_large_table WHERE c3 IS NOT NULL",
            "SELECT * FROM my_large_table WHERE c4 IS NULL",
            "SELECT * FROM my_large_table WHERE c5 IS NULL",
            ]
    for sql in sqls:
        assert_gpu_and_cpu_are_equal_sql(
            lambda spark: spark.read.orc(cpu_file_path),
            "my_large_table",
            sql)
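A note on the flow above: `assert_gpu_and_cpu_writes_are_equal_collect` produces sibling `/CPU` and `/GPU` output directories under `data_path`, and the SQL probes then re-read the CPU-written copy on both CPU and GPU. To run just this test, a hypothetical invocation (assuming extra arguments are forwarded to pytest) is `pytest --large_data_test -k test_orc_with_null_column_with_1m_rows`.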
