From a97020f9c7e4e2be86788b5f7d83608839d3207b Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 27 Sep 2023 13:33:48 -0400
Subject: [PATCH] Correct numerous 20054-D: dynamic initialization errors found
 on arm+12.2 (#14108)

Compile issues found by compiling libcudf with the `rapidsai/devcontainers:23.10-cpp-gcc9-cuda12.2-ubuntu20.04` docker container.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/14108
---
 cpp/src/io/avro/avro_common.hpp             |  3 +-
 cpp/src/io/comp/unsnap.cu                   | 18 +++---
 cpp/src/io/orc/orc_gpu.hpp                  | 39 +++++-------
 cpp/src/io/orc/stats_enc.cu                 | 10 +--
 cpp/src/io/orc/stripe_init.cu               | 29 ++++-----
 cpp/src/io/parquet/page_decode.cuh          | 67 +++++++++++----------
 cpp/src/io/parquet/page_hdr.cu              | 12 ++--
 cpp/src/io/parquet/parquet_gpu.hpp          | 56 ++++++++---------
 cpp/src/io/statistics/column_statistics.cuh | 12 ++--
 cpp/src/io/statistics/statistics.cuh        | 30 ++++-----
 10 files changed, 138 insertions(+), 138 deletions(-)

diff --git a/cpp/src/io/avro/avro_common.hpp b/cpp/src/io/avro/avro_common.hpp
index ff8ee206dd4..0058d236d8c 100644
--- a/cpp/src/io/avro/avro_common.hpp
+++ b/cpp/src/io/avro/avro_common.hpp
@@ -25,7 +25,8 @@ namespace cudf {
 namespace io {
 namespace avro {
 struct block_desc_s {
-  block_desc_s() {}
+  block_desc_s() = default;  // required to compile on ctk-12.2 + aarch64
+
   explicit constexpr block_desc_s(
     size_t offset_, uint32_t size_, uint32_t row_offset_, uint32_t first_row_, uint32_t num_rows_)
     : offset(offset_),
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index c699502317f..504a2fe377c 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -52,6 +52,8 @@ struct unsnap_batch_s {
  * @brief Queue structure used to exchange data between warps
  */
 struct unsnap_queue_s {
+  unsnap_queue_s() = default;  // required to compile on ctk-12.2 + aarch64
+
   uint32_t prefetch_wrpos;         ///< Prefetcher write position
   uint32_t prefetch_rdpos;         ///< Prefetch consumer read position
   int32_t prefetch_end;            ///< Prefetch enable flag (nonzero stops prefetcher)
@@ -64,13 +66,15 @@ struct unsnap_queue_s {
  * @brief snappy decompression state
  */
 struct unsnap_state_s {
-  uint8_t const* base;             ///< base ptr of compressed stream
-  uint8_t const* end;              ///< end of compressed stream
-  uint32_t uncompressed_size;      ///< uncompressed stream size
-  uint32_t bytes_left;             ///< remaining bytes to decompress
-  int32_t error;                   ///< current error status
-  uint32_t tstart;                 ///< start time for perf logging
-  volatile unsnap_queue_s q;       ///< queue for cross-warp communication
+  constexpr unsnap_state_s() noexcept {}  // required to compile on ctk-12.2 + aarch64
+
+  uint8_t const* base{};           ///< base ptr of compressed stream
+  uint8_t const* end{};            ///< end of compressed stream
+  uint32_t uncompressed_size{};    ///< uncompressed stream size
+  uint32_t bytes_left{};           ///< remaining bytes to decompress
+  int32_t error{};                 ///< current error status
+  uint32_t tstart{};               ///< start time for perf logging
+  volatile unsnap_queue_s q{};     ///< queue for cross-warp communication
   device_span<uint8_t const> src;  ///< input for current block
   device_span<uint8_t> dst;        ///< output for current block
 };
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 9b8df50a22a..dba7a9ffda5 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -59,31 +59,24 @@ struct CompressedStreamInfo {
   explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_)
     : compressed_data(compressed_data_),
       uncompressed_data(nullptr),
-      compressed_data_size(compressed_size_),
-      dec_in_ctl(nullptr),
-      dec_out_ctl(nullptr),
-      copy_in_ctl(nullptr),
-      copy_out_ctl(nullptr),
-      num_compressed_blocks(0),
-      num_uncompressed_blocks(0),
-      max_uncompressed_size(0),
-      max_uncompressed_block_size(0)
+      compressed_data_size(compressed_size_)
   {
   }
-  uint8_t const* compressed_data;  // [in] base ptr to compressed stream data
-  uint8_t* uncompressed_data;  // [in] base ptr to uncompressed stream data or NULL if not known yet
-  size_t compressed_data_size;              // [in] compressed data size for this stream
-  device_span<uint8_t const>* dec_in_ctl;   // [in] input buffer to decompress
-  device_span<uint8_t>* dec_out_ctl;        // [in] output buffer to decompress into
-  device_span<compression_result> dec_res;  // [in] results of decompression
-  device_span<uint8_t const>* copy_in_ctl;  // [out] input buffer to copy
-  device_span<uint8_t>* copy_out_ctl;       // [out] output buffer to copy to
-  uint32_t num_compressed_blocks;  // [in,out] number of entries in decctl(in), number of compressed
-                                   // blocks(out)
-  uint32_t num_uncompressed_blocks;      // [in,out] number of entries in dec_in_ctl(in), number of
-                                         // uncompressed blocks(out)
-  uint64_t max_uncompressed_size;        // [out] maximum uncompressed data size of stream
-  uint32_t max_uncompressed_block_size;  // [out] maximum uncompressed size of any block in stream
+  uint8_t const* compressed_data{};  // [in] base ptr to compressed stream data
+  uint8_t*
+    uncompressed_data{};  // [in] base ptr to uncompressed stream data or NULL if not known yet
+  size_t compressed_data_size{};              // [in] compressed data size for this stream
+  device_span<uint8_t const>* dec_in_ctl{};   // [in] input buffer to decompress
+  device_span<uint8_t>* dec_out_ctl{};        // [in] output buffer to decompress into
+  device_span<compression_result> dec_res{};  // [in] results of decompression
+  device_span<uint8_t const>* copy_in_ctl{};  // [out] input buffer to copy
+  device_span<uint8_t>* copy_out_ctl{};       // [out] output buffer to copy to
+  uint32_t num_compressed_blocks{};           // [in,out] number of entries in decctl(in), number of
+                                              // compressed blocks(out)
+  uint32_t num_uncompressed_blocks{};  // [in,out] number of entries in dec_in_ctl(in), number of
+                                       // uncompressed blocks(out)
+  uint64_t max_uncompressed_size{};    // [out] maximum uncompressed data size of stream
+  uint32_t max_uncompressed_block_size{};  // [out] maximum uncompressed size of any block in stream
 };
 
 enum StreamIndexType {
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 69d7ec95acd..95f1db5bfd1 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -134,11 +134,11 @@ __global__ void __launch_bounds__(block_size, 1)
 }
 
 struct stats_state_s {
-  uint8_t* base;  ///< Output buffer start
-  uint8_t* end;   ///< Output buffer end
-  statistics_chunk chunk;
-  statistics_merge_group group;
-  statistics_dtype stats_dtype;  //!< Statistics data type for this column
+  uint8_t* base{};  ///< Output buffer start
+  uint8_t* end{};   ///< Output buffer end
+  statistics_chunk chunk{};
+  statistics_merge_group group{};
+  statistics_dtype stats_dtype{};  //!< Statistics data type for this column
 };
 
 /*
diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index d8a60350356..8eeca504121 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -30,14 +30,14 @@ namespace orc {
 namespace gpu {
 
 struct comp_in_out {
-  uint8_t const* in_ptr;
-  size_t in_size;
-  uint8_t* out_ptr;
-  size_t out_size;
+  uint8_t const* in_ptr{};
+  size_t in_size{};
+  uint8_t* out_ptr{};
+  size_t out_size{};
 };
 struct compressed_stream_s {
-  CompressedStreamInfo info;
-  comp_in_out ctl;
+  CompressedStreamInfo info{};
+  comp_in_out ctl{};
 };
 
 // blockDim {128,1,1}
@@ -208,14 +208,15 @@ __global__ void __launch_bounds__(128, 8)
  * @brief Shared mem state for gpuParseRowGroupIndex
  */
 struct rowindex_state_s {
-  ColumnDesc chunk;
-  uint32_t rowgroup_start;
-  uint32_t rowgroup_end;
-  int is_compressed;
-  uint32_t row_index_entry[3][CI_PRESENT];  // NOTE: Assumes CI_PRESENT follows CI_DATA and CI_DATA2
-  CompressedStreamInfo strm_info[2];
-  RowGroup rowgroups[128];
-  uint32_t compressed_offset[128][2];
+  ColumnDesc chunk{};
+  uint32_t rowgroup_start{};
+  uint32_t rowgroup_end{};
+  int is_compressed{};
+  uint32_t row_index_entry[3]
+                          [CI_PRESENT]{};  // NOTE: Assumes CI_PRESENT follows CI_DATA and CI_DATA2
+  CompressedStreamInfo strm_info[2]{};
+  RowGroup rowgroups[128]{};
+  uint32_t compressed_offset[128][2]{};
 };
 
 enum row_entry_state_e {
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 26e3c951b2e..5e66885d746 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -26,48 +26,49 @@
 namespace cudf::io::parquet::gpu {
 
 struct page_state_s {
-  uint8_t const* data_start;
-  uint8_t const* data_end;
-  uint8_t const* lvl_end;
-  uint8_t const* dict_base;    // ptr to dictionary page data
-  int32_t dict_size;           // size of dictionary data
-  int32_t first_row;           // First row in page to output
-  int32_t num_rows;            // Rows in page to decode (including rows to be skipped)
-  int32_t first_output_value;  // First value in page to output
-  int32_t num_input_values;    // total # of input/level values in the page
-  int32_t dtype_len;           // Output data type length
-  int32_t dtype_len_in;        // Can be larger than dtype_len if truncating 32-bit into 8-bit
-  int32_t dict_bits;           // # of bits to store dictionary indices
-  uint32_t dict_run;
-  int32_t dict_val;
-  uint32_t initial_rle_run[NUM_LEVEL_TYPES];   // [def,rep]
-  int32_t initial_rle_value[NUM_LEVEL_TYPES];  // [def,rep]
-  int32_t error;
-  PageInfo page;
-  ColumnChunkDesc col;
+  constexpr page_state_s() noexcept {}
+  uint8_t const* data_start{};
+  uint8_t const* data_end{};
+  uint8_t const* lvl_end{};
+  uint8_t const* dict_base{};    // ptr to dictionary page data
+  int32_t dict_size{};           // size of dictionary data
+  int32_t first_row{};           // First row in page to output
+  int32_t num_rows{};            // Rows in page to decode (including rows to be skipped)
+  int32_t first_output_value{};  // First value in page to output
+  int32_t num_input_values{};    // total # of input/level values in the page
+  int32_t dtype_len{};           // Output data type length
+  int32_t dtype_len_in{};        // Can be larger than dtype_len if truncating 32-bit into 8-bit
+  int32_t dict_bits{};           // # of bits to store dictionary indices
+  uint32_t dict_run{};
+  int32_t dict_val{};
+  uint32_t initial_rle_run[NUM_LEVEL_TYPES]{};   // [def,rep]
+  int32_t initial_rle_value[NUM_LEVEL_TYPES]{};  // [def,rep]
+  int32_t error{};
+  PageInfo page{};
+  ColumnChunkDesc col{};
 
   // (leaf) value decoding
-  int32_t nz_count;  // number of valid entries in nz_idx (write position in circular buffer)
-  int32_t dict_pos;  // write position of dictionary indices
-  int32_t src_pos;   // input read position of final output value
-  int32_t ts_scale;  // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale
+  int32_t nz_count{};  // number of valid entries in nz_idx (write position in circular buffer)
+  int32_t dict_pos{};  // write position of dictionary indices
+  int32_t src_pos{};   // input read position of final output value
+  int32_t ts_scale{};  // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale
 
   // repetition/definition level decoding
-  int32_t input_value_count;                  // how many values of the input we've processed
-  int32_t input_row_count;                    // how many rows of the input we've processed
-  int32_t input_leaf_count;                   // how many leaf values of the input we've processed
-  uint8_t const* lvl_start[NUM_LEVEL_TYPES];  // [def,rep]
-  uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES];  // [def,rep]
-  uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES];    // [def,rep]
-  int32_t lvl_count[NUM_LEVEL_TYPES];             // how many of each of the streams we've decoded
-  int32_t row_index_lower_bound;                  // lower bound of row indices we should process
+  int32_t input_value_count{};                  // how many values of the input we've processed
+  int32_t input_row_count{};                    // how many rows of the input we've processed
+  int32_t input_leaf_count{};                   // how many leaf values of the input we've processed
+  uint8_t const* lvl_start[NUM_LEVEL_TYPES]{};  // [def,rep]
+  uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]{};  // [def,rep]
+  uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]{};    // [def,rep]
+  int32_t lvl_count[NUM_LEVEL_TYPES]{};             // how many of each of the streams we've decoded
+  int32_t row_index_lower_bound{};                  // lower bound of row indices we should process
 
   // a shared-memory cache of frequently used data when decoding. The source of this data is
   // normally stored in global memory which can yield poor performance. So, when possible
   // we copy that info here prior to decoding
-  PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info];
+  PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]{};
   // points to either nesting_decode_cache above when possible, or to the global source otherwise
-  PageNestingDecodeInfo* nesting_info;
+  PageNestingDecodeInfo* nesting_info{};
 };
 
 // buffers only used in the decode kernel.  separated from page_state_s to keep
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 0d611643b46..6f8b2f50443 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -45,13 +45,13 @@ static const __device__ __constant__ uint8_t g_list2struct[16] = {0,
                                                                   ST_FLD_LIST};
 
 struct byte_stream_s {
-  uint8_t const* cur;
-  uint8_t const* end;
-  uint8_t const* base;
+  uint8_t const* cur{};
+  uint8_t const* end{};
+  uint8_t const* base{};
   // Parsed symbols
-  PageType page_type;
-  PageInfo page;
-  ColumnChunkDesc ck;
+  PageType page_type{};
+  PageInfo page{};
+  ColumnChunkDesc ck{};
 };
 
 /**
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index a3cc37dee4f..a760c2448dc 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -228,7 +228,7 @@ struct PageInfo {
  * @brief Struct describing a particular chunk of column data
  */
 struct ColumnChunkDesc {
-  ColumnChunkDesc() = default;
+  constexpr ColumnChunkDesc() noexcept {};
   explicit ColumnChunkDesc(size_t compressed_size_,
                            uint8_t* compressed_data_,
                            size_t num_values_,
@@ -275,34 +275,34 @@ struct ColumnChunkDesc {
   {
   }
 
-  uint8_t const* compressed_data;                  // pointer to compressed column chunk data
-  size_t compressed_size;                          // total compressed data size for this chunk
-  size_t num_values;                               // total number of values in this column
-  size_t start_row;                                // starting row of this chunk
-  uint32_t num_rows;                               // number of rows in this chunk
-  int16_t max_level[level_type::NUM_LEVEL_TYPES];  // max definition/repetition level
-  int16_t max_nesting_depth;                       // max nesting depth of the output
-  uint16_t data_type;                              // basic column data type, ((type_length << 3) |
-                                                   // parquet::Type)
+  uint8_t const* compressed_data{};                  // pointer to compressed column chunk data
+  size_t compressed_size{};                          // total compressed data size for this chunk
+  size_t num_values{};                               // total number of values in this column
+  size_t start_row{};                                // starting row of this chunk
+  uint32_t num_rows{};                               // number of rows in this chunk
+  int16_t max_level[level_type::NUM_LEVEL_TYPES]{};  // max definition/repetition level
+  int16_t max_nesting_depth{};                       // max nesting depth of the output
+  uint16_t data_type{};  // basic column data type, ((type_length << 3) |
+                         // parquet::Type)
   uint8_t
-    level_bits[level_type::NUM_LEVEL_TYPES];  // bits to encode max definition/repetition levels
-  int32_t num_data_pages;                     // number of data pages
-  int32_t num_dict_pages;                     // number of dictionary pages
-  int32_t max_num_pages;                      // size of page_info array
-  PageInfo* page_info;                        // output page info for up to num_dict_pages +
-                                              // num_data_pages (dictionary pages first)
-  string_index_pair* str_dict_index;          // index for string dictionary
-  bitmask_type** valid_map_base;              // base pointers of valid bit map for this column
-  void** column_data_base;                    // base pointers of column data
-  void** column_string_base;                  // base pointers of column string data
-  int8_t codec;                               // compressed codec enum
-  int8_t converted_type;                      // converted type enum
-  LogicalType logical_type;                   // logical type
-  int8_t decimal_precision;                   // Decimal precision
-  int32_t ts_clock_rate;  // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
-
-  int32_t src_col_index;   // my input column index
-  int32_t src_col_schema;  // my schema index in the file
+    level_bits[level_type::NUM_LEVEL_TYPES]{};  // bits to encode max definition/repetition levels
+  int32_t num_data_pages{};                     // number of data pages
+  int32_t num_dict_pages{};                     // number of dictionary pages
+  int32_t max_num_pages{};                      // size of page_info array
+  PageInfo* page_info{};                        // output page info for up to num_dict_pages +
+                                                // num_data_pages (dictionary pages first)
+  string_index_pair* str_dict_index{};          // index for string dictionary
+  bitmask_type** valid_map_base{};              // base pointers of valid bit map for this column
+  void** column_data_base{};                    // base pointers of column data
+  void** column_string_base{};                  // base pointers of column string data
+  int8_t codec{};                               // compressed codec enum
+  int8_t converted_type{};                      // converted type enum
+  LogicalType logical_type{};                   // logical type
+  int8_t decimal_precision{};                   // Decimal precision
+  int32_t ts_clock_rate{};  // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
+
+  int32_t src_col_index{};   // my input column index
+  int32_t src_col_schema{};  // my schema index in the file
 };
 
 /**
diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh
index 28e77f62a43..f71fb95949f 100644
--- a/cpp/src/io/statistics/column_statistics.cuh
+++ b/cpp/src/io/statistics/column_statistics.cuh
@@ -34,18 +34,18 @@ namespace io {
  * @brief shared state for statistics calculation kernel
  */
 struct stats_state_s {
-  stats_column_desc col;   ///< Column information
-  statistics_group group;  ///< Group description
-  statistics_chunk ck;     ///< Output statistics chunk
+  stats_column_desc col{};   ///< Column information
+  statistics_group group{};  ///< Group description
+  statistics_chunk ck{};     ///< Output statistics chunk
 };
 
 /**
  * @brief shared state for statistics merge kernel
  */
 struct merge_state_s {
-  stats_column_desc col;         ///< Column information
-  statistics_merge_group group;  ///< Group description
-  statistics_chunk ck;           ///< Resulting statistics chunk
+  stats_column_desc col{};         ///< Column information
+  statistics_merge_group group{};  ///< Group description
+  statistics_chunk ck{};           ///< Resulting statistics chunk
 };
 
 template <int dimension>
diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh
index 805ca43553e..b6e698fee11 100644
--- a/cpp/src/io/statistics/statistics.cuh
+++ b/cpp/src/io/statistics/statistics.cuh
@@ -98,27 +98,27 @@ union statistics_val {
 };
 
 struct statistics_chunk {
-  uint32_t non_nulls;        //!< number of non-null values in chunk
-  uint32_t null_count;       //!< number of null values in chunk
-  statistics_val min_value;  //!< minimum value in chunk
-  statistics_val max_value;  //!< maximum value in chunk
-  statistics_val sum;        //!< sum of chunk
-  uint8_t has_minmax;        //!< Nonzero if min_value and max_values are valid
-  uint8_t has_sum;           //!< Nonzero if sum is valid
+  uint32_t non_nulls{};        //!< number of non-null values in chunk
+  uint32_t null_count{};       //!< number of null values in chunk
+  statistics_val min_value{};  //!< minimum value in chunk
+  statistics_val max_value{};  //!< maximum value in chunk
+  statistics_val sum{};        //!< sum of chunk
+  uint8_t has_minmax{};        //!< Nonzero if min_value and max_values are valid
+  uint8_t has_sum{};           //!< Nonzero if sum is valid
 };
 
 struct statistics_group {
-  stats_column_desc const* col;  //!< Column information
-  uint32_t start_row;            //!< Start row of this group
-  uint32_t num_rows;             //!< Number of rows in group
-  uint32_t non_leaf_nulls;       //!< Number of null non-leaf values in the group
+  stats_column_desc const* col{};  //!< Column information
+  uint32_t start_row{};            //!< Start row of this group
+  uint32_t num_rows{};             //!< Number of rows in group
+  uint32_t non_leaf_nulls{};       //!< Number of null non-leaf values in the group
 };
 
 struct statistics_merge_group {
-  data_type col_dtype;           //!< Column data type
-  statistics_dtype stats_dtype;  //!< Statistics data type for this column
-  uint32_t start_chunk;          //!< Start chunk of this group
-  uint32_t num_chunks;           //!< Number of chunks in group
+  data_type col_dtype;                       //!< Column data type
+  statistics_dtype stats_dtype{dtype_none};  //!< Statistics data type for this column
+  uint32_t start_chunk{};                    //!< Start chunk of this group
+  uint32_t num_chunks{};                     //!< Number of chunks in group
 };
 
 template <typename T, std::enable_if_t<!std::is_same_v<T, statistics::byte_array_view>>* = nullptr>