From 55b75f45257e685a7aff774578f00ef8fe39c55b Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Fri, 28 Jul 2023 10:33:13 +0800
Subject: [PATCH] Add large_data_test mark in pytest; Add a case to test ORC
 writing with lots of nulls (#8825)

Signed-off-by: Chong Gao
---
 integration_tests/README.md                   |  4 +++
 integration_tests/conftest.py                 |  3 ++
 integration_tests/pytest.ini                  |  1 +
 integration_tests/src/main/python/conftest.py |  4 +++
 integration_tests/src/main/python/marks.py    |  1 +
 integration_tests/src/main/python/orc_test.py | 31 ++++++++++++++++++-
 6 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/integration_tests/README.md b/integration_tests/README.md
index bbca167ac8f..0ce52325561 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -398,6 +398,10 @@ properly without it. These tests assume Delta Lake is not configured and are dis
 If Spark has been configured to support Delta Lake then these tests can be enabled by adding the
 `--delta_lake` option to the command.
 
+### Enabling large data tests
+Some tests run against large data sets and take a long time to complete. By default, these tests are disabled.
+They can be enabled by adding the `--large_data_test` option to the command.
+
 ## Writing tests
 
 There are a number of libraries provided to help someone write new tests.
diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py
index 8c6d5e16cfd..46875f25e35 100644
--- a/integration_tests/conftest.py
+++ b/integration_tests/conftest.py
@@ -56,3 +56,6 @@ def pytest_addoption(parser):
         "--force_parquet_testing_tests", action="store_true", default=False,
         help="if true forces parquet-testing tests to fail if input data cannot be found"
     )
+    parser.addoption(
+        "--large_data_test", action='store_true', default=False, help="if enabled, runs tests with large data"
+    )
diff --git a/integration_tests/pytest.ini b/integration_tests/pytest.ini
index 20e5ca615c7..29530af0c06 100644
--- a/integration_tests/pytest.ini
+++ b/integration_tests/pytest.ini
@@ -33,5 +33,6 @@ markers =
     iceberg: Mark a test that requires Iceberg has been configured, skipping if tests are not configured for Iceberg
     delta_lake: Mark a test that requires Delta Lake has been configured, skipping if tests are not configured for Delta Lake
     regexp: Mark a test that tests regular expressions on the GPU (only works when UTF-8 is enabled)
+    large_data_test: Mark a test that runs with large data, skipping if large data tests are not enabled
 filterwarnings =
     ignore:.*pytest.mark.order.*:_pytest.warning_types.PytestUnknownMarkWarning
diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py
index d487b27177b..814617420fa 100644
--- a/integration_tests/src/main/python/conftest.py
+++ b/integration_tests/src/main/python/conftest.py
@@ -218,6 +218,10 @@ def pytest_runtest_setup(item):
         if not item.config.getoption('delta_lake'):
             pytest.skip('delta lake tests not configured to run')
 
+    if item.get_closest_marker('large_data_test'):
+        if not item.config.getoption('large_data_test'):
+            pytest.skip('tests for large data not configured to run')
+
 def pytest_configure(config):
     global _runtime_env
     _runtime_env = config.getoption('runtime_env')
diff --git a/integration_tests/src/main/python/marks.py b/integration_tests/src/main/python/marks.py
index c5dc264fb46..9a56b2d1fca 100644
--- a/integration_tests/src/main/python/marks.py
+++ b/integration_tests/src/main/python/marks.py
@@ -30,3 +30,4 @@
 fuzz_test = pytest.mark.fuzz_test
 iceberg = pytest.mark.iceberg
 delta_lake = pytest.mark.delta_lake
+large_data_test = pytest.mark.large_data_test
\ No newline at end of file
diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index 9e3053bde92..4ad9f88cb4d 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -15,7 +15,7 @@
 import pytest
 
 from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \
-    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect
+    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql
 from data_gen import *
 from marks import *
 from pyspark.sql.types import *
@@ -902,3 +902,32 @@ def gen_null_df(spark):
     gpu_file_path = data_path + "/GPU"
     reader = read_orc_df(gpu_file_path)
     assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=all_confs)
+
+@ignore_order
+@large_data_test
+@pytest.mark.parametrize("reader_confs", reader_opt_confs, ids=idfn)
+def test_orc_with_null_column_with_1m_rows(spark_tmp_path, reader_confs):
+    data_path = spark_tmp_path + "/ORC_DATA"
+    all_confs = reader_confs
+    data = [(i, None, None, None, None) for i in range(1000000)]
+    def gen_null_df(spark):
+        return spark.createDataFrame(
+            data,
+            "c1 int, c2 long, c3 float, c4 double, c5 boolean")
+    assert_gpu_and_cpu_writes_are_equal_collect(
+        lambda spark, path: gen_null_df(spark).write.orc(path),
+        lambda spark, path: spark.read.orc(path),
+        data_path,
+        conf=all_confs)
+    cpu_file_path = data_path + "/CPU"
+    sqls = ["SELECT * FROM my_large_table",
+            "SELECT * FROM my_large_table WHERE c2 = 5",
+            "SELECT COUNT(*) FROM my_large_table WHERE c3 IS NOT NULL",
+            "SELECT * FROM my_large_table WHERE c4 IS NULL",
+            "SELECT * FROM my_large_table WHERE c5 IS NULL",
+            ]
+    for sql in sqls:
+        assert_gpu_and_cpu_are_equal_sql(
+            lambda spark: spark.read.orc(cpu_file_path),
+            "my_large_table",
+            sql)
\ No newline at end of file
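
For reference, a minimal self-contained sketch of the option/marker/skip wiring this patch introduces, condensed into a single `conftest.py` with only pytest assumed; the sample test at the bottom is hypothetical, not part of the patch:

```python
# conftest.py -- standalone sketch of the --large_data_test wiring above
import pytest

def pytest_addoption(parser):
    # Opt-in flag; large-data tests stay disabled unless it is passed.
    parser.addoption(
        "--large_data_test", action="store_true", default=False,
        help="if enabled, runs tests with large data")

def pytest_configure(config):
    # Programmatic equivalent of the pytest.ini markers entry.
    config.addinivalue_line(
        "markers", "large_data_test: Mark a test that runs with large data")

def pytest_runtest_setup(item):
    # Skip any marked test when the flag was not given on the command line.
    if item.get_closest_marker("large_data_test"):
        if not item.config.getoption("large_data_test"):
            pytest.skip("tests for large data not configured to run")

# test_example.py (hypothetical)
@pytest.mark.large_data_test
def test_many_rows():
    data = [(i, None) for i in range(1_000_000)]
    assert len(data) == 1_000_000
```

With this wiring, a bare `pytest` run reports `test_many_rows` as skipped, while `pytest --large_data_test` executes it.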