This repository has been archived by the owner on Dec 21, 2018. It is now read-only.

[Review] Parquet reader multithread #146

Open — wants to merge 109 commits into base: master

Commits (109)
6cb51df
[parquet-reader] Add parquet reader wrapper
gcca Jul 17, 2018
bbe9467
[parquet-reader] Add column reader
gcca Jul 18, 2018
6ced85b
[parquet-reader] Enable read new page call
gcca Jul 20, 2018
16b40cb
WIP: add custom decoder
aocsa Jul 20, 2018
fc57ccb
[parquet-reader] Update parquet API to v1.3.1
gcca Jul 23, 2018
3000f89
[parquet-reader] Read batch as gdf column
gcca Jul 25, 2018
a6e7d0e
arrow decoder
aocsa Jul 26, 2018
7c24364
merge with parquet-reader
aocsa Jul 26, 2018
3b9af0e
Merge branch 'parquet-reader' into parquet-decoder
aocsa Jul 26, 2018
4593968
[parquet-reader] Add gdf column read test
gcca Jul 26, 2018
abe73d3
[parquet-reader] Add file reader by columns benchmark
gcca Jul 27, 2018
a384b15
decoder using host
aocsa Jul 27, 2018
79470ea
decoder using gpu
aocsa Jul 27, 2018
3ef6ecd
[parquet-reader] Read spaced batches to gdf column
gcca Jul 30, 2018
4282650
Merge branch 'parquet-reader' into parquet-decoder
aocsa Aug 1, 2018
819af4e
use specific gpu-decoder for int32
aocsa Aug 1, 2018
5713017
[parquet-reader] Add API to read a parquet file
gcca Aug 2, 2018
7ad9972
[parquet-reader] Merge from parquet-decoder
gcca Aug 2, 2018
882a296
[parquet-reader] Fix template definitions for readers
gcca Aug 2, 2018
e8068eb
[parquet-reader] Merger from LibGDF/master
gcca Aug 2, 2018
e407912
[parquet-reader] Fix testing files
gcca Aug 2, 2018
9ba5d7e
[parquet-reader] Move tests to src
gcca Aug 2, 2018
6aaaa51
[parquet-reader] Fix access to parquetcpp repository
gcca Aug 2, 2018
13e27c7
[parquet-reader] Fix benchmark test building
gcca Aug 2, 2018
15ff796
[parquet-reader] Fix build moving tests into src
gcca Aug 2, 2018
d7bed6a
[parquet-reader] Update tests building process
gcca Aug 2, 2018
92d89e9
[parquet-reader] Add conda dependencies for Thrift
gcca Aug 3, 2018
f56a978
[parquet-reader] Check gdf dtype from parquet type
gcca Aug 6, 2018
9043c7a
[parquet-reader] Apply batch spaced reading on tests
gcca Aug 6, 2018
9d2275e
[parquet-reader] Add column filter from file
gcca Aug 7, 2018
d0b265c
[parquet-reader] Add read to gdf column method
gcca Aug 7, 2018
3b464bd
[parquet-reader] Remove ReadGdfColumn method
gcca Aug 7, 2018
f92a931
decode bitpacking data using pinned memory
aocsa Aug 7, 2018
d25db66
Merge branch 'parquet-reader' of https://github.com/BlazingDB/libgdf …
aocsa Aug 7, 2018
1716e81
[parquet-reader] Add parquet target for linking
gcca Aug 8, 2018
9e39227
decode bitpacking data using pinned memory: merge
aocsa Aug 8, 2018
ab07b56
bitpacking decoding for all types
aocsa Aug 9, 2018
5ebc08c
start gpu benchmark for parquet reader
aocsa Aug 13, 2018
54a63a1
improve copy scheme from pinned memory to device memory
aocsa Aug 15, 2018
7ee8760
init benchmark for parquet reader
aocsa Aug 16, 2018
2ad9c25
wip: decode using only gpu
aocsa Aug 21, 2018
02c1132
gdf_column in device and benchmark for parquet reader
aocsa Aug 21, 2018
8be8e9e
implemented new expand function. Commented out problematic tests. sta…
Aug 21, 2018
273e17d
benckmark with huge parquet file
aocsa Aug 22, 2018
30c581a
added compact_to_sparse_for_nulls
Aug 23, 2018
c129c94
starting with kernel
Aug 23, 2018
298dc3d
starting with kernel
Aug 23, 2018
7f0f570
[parquet-reader]: ToGdfColumn using gpu using ReadBatch
aocsa Aug 23, 2018
7da1549
reimplemented compact_to_sparse_for_nulls
Aug 23, 2018
6979c33
added includes
Aug 23, 2018
fbae2c8
Merge branch 'willParquetExp' into willParquetKernelExp
Aug 24, 2018
bceb98b
fixed build errors but commented out usage of compact_to_sparse_for_n…
Aug 24, 2018
26a5ce5
Merge branch 'willParquetExp' into willParquetKernelExp
Aug 24, 2018
869d9eb
[parquet-reader] toGdfColumn valid support and expand using ReadBatch
aocsa Aug 24, 2018
55c53ae
kernel compiles
Aug 24, 2018
3c97bb2
improved kernel call
Aug 24, 2018
8f06c8f
improved kernel call
Aug 24, 2018
12f6404
[parquet-reader]: custom gpu kernel for definition levels to valid_bits
aocsa Aug 24, 2018
149f8d3
[parquet-reader] Add test for valid and nulls
gcca Aug 25, 2018
93a0235
[parquet-reader] Merged from branch
gcca Aug 25, 2018
d4f0be9
[parquet-reader] Test nulls with two row groups
gcca Aug 25, 2018
616b303
[parquet-reader] Update conversion to gdf column
gcca Aug 27, 2018
ce430a4
Merge branch 'parquet-reader' into willParquetKernelExp
Aug 27, 2018
67068eb
changed unpack_using_gpu to use new kernel. Changed metadata gatherin…
Aug 27, 2018
98940b8
[parquet-reader]: ReadBatchSpace support on gpu
aocsa Aug 27, 2018
f639c2b
[parquet-reader] Remove unexistent directory
gcca Aug 27, 2018
51f7479
[parquet-reader] check unit test and benchmark
aocsa Aug 28, 2018
4f88e80
changed bitpack remainders implementation
Aug 28, 2018
9f6adb7
[parquet-reader] Read filtering by row_groups and columns indices
gcca Aug 28, 2018
19628d5
Merge branch 'parquet-reader' of github.com:BlazingDB/libgdf into par…
gcca Aug 28, 2018
42bf16d
[parquet-reader] Merged from master
gcca Aug 29, 2018
e6810b5
[parquet-reader] Update to work with arrow 0.9
gcca Aug 29, 2018
81d8cb9
merged in bitpacking kernels
Aug 31, 2018
dbcf578
[parquet-reader] Fix broken ByIdsInOrder unit test
gcca Aug 31, 2018
6d2e4b3
[parquet-reader] update benchmark
aocsa Aug 31, 2018
6646f09
Merge branch 'parquet-reader' of https://github.com/BlazingDB/libgdf …
aocsa Aug 31, 2018
94ea6a4
[parquet-reader] Add read column method
gcca Aug 31, 2018
2950374
fixed an issue with parquet-benchmark test
Sep 5, 2018
fc0a72e
[parquet-reader]: fix parquet reader (tested with mortgage data)
aocsa Sep 7, 2018
73703b0
implemented solution, need to change it to read valids separatelly an…
Sep 7, 2018
a905116
wip
Sep 7, 2018
d7740ca
Merge branch 'parquet-reader' into parquet-reader-multithread
Sep 7, 2018
74f741a
created seams for bitmaks, need to apply them back into device valid
Sep 7, 2018
fc85c2e
[parquet-reader] fix parquet benchmark
aocsa Sep 11, 2018
b6784de
[parquet-reader] rebase and fix types conversion
aocsa Sep 18, 2018
4eae308
modified unit test. Troubleshooting bugs
Sep 18, 2018
0f9cbf6
created single threaded version for debugging
Sep 18, 2018
849c866
Merge branch 'parquet-reader' into parquet-reader-multithread
Sep 18, 2018
e3d270e
fixed build errors, and issues with tests. Still getting errors with …
Sep 18, 2018
ea06079
[parquet-reader]: fix warnings
aocsa Sep 18, 2018
31326fa
[parquet-reader] Downgrade bison and flex
gcca Sep 18, 2018
55ab718
[parquet-reader] Add global ParquetCpp include directories
gcca Sep 18, 2018
c3f2552
[parquet-reader] Fix compiling warnings
gcca Sep 18, 2018
07e6e85
fixed bug in guard in bitpacking kernel
Sep 19, 2018
dc76e3d
[parquet-reader] fix bitpacking decoder and transform_valid
aocsa Sep 19, 2018
8bf8311
[parquet-reader]: merge with last fixes
aocsa Sep 19, 2018
951cbf9
[parquet-reader]: fix warnings
aocsa Sep 19, 2018
ab57c53
cleaned up code. Using _ReadFileMultiThread where it needs to. All te…
Sep 19, 2018
5002683
made small change to unit test and found more issues
Sep 19, 2018
a7ce67a
fixed bug in allocator function
Sep 19, 2018
9cd6e16
[parquet-reader-multithread] fix warnings
aocsa Sep 20, 2018
52b03f7
[parquet-reader-multithread] remove dead code and add comments
aocsa Sep 21, 2018
efcffd4
added new parquet-multithread-benchmark test. Fixed parquet-reader ap…
Sep 24, 2018
dd9a65f
fixed benchmark unit test
Sep 25, 2018
b342fe4
moved parquet benchmarks to bench folder
Sep 26, 2018
95b16e3
Merge branch 'master' into parquet-reader-multithread
Sep 27, 2018
d1e8ff7
added a new public API which takes in an file reading interface. Adde…
Oct 1, 2018
ec54c9a
Merge branch 'master' into parquet-reader-multithread
Oct 2, 2018
b7c2686
fixed interface implementation to be RandomAccessFile which is an int…
Oct 11, 2018
2 changes: 2 additions & 0 deletions .gitignore
@@ -18,3 +18,5 @@ python/libgdf_cffi/libgdf_cffi.py

## eclipse
.project

build2/
Review comment (Member):
Is "build2" a common directory we need to gitignore? Perhaps this file was committed accidentally?

25 changes: 24 additions & 1 deletion CMakeLists.txt
@@ -1,6 +1,7 @@
#=============================================================================
# Copyright 2018 BlazingDB, Inc.
# Copyright 2018 Percy Camilo Triveño Aucahuasi <[email protected]>
# Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -25,7 +26,7 @@

PROJECT(libgdf)

cmake_minimum_required(VERSION 2.8) # not sure about version required
cmake_minimum_required(VERSION 3.3) # not sure about version required
Review comment (Member):
libgdf's CMakeLists.txt requires CMake version 3.11... Maybe match that and remove the unsure comment.


set(CMAKE_CXX_STANDARD 11)
message(STATUS "Using C++ standard: c++${CMAKE_CXX_STANDARD}")
@@ -46,6 +47,7 @@ include(CTest)
# Include custom modules (see cmake directory)
include(ConfigureGoogleTest)
include(ConfigureArrow)
include(ConfigureParquetCpp)

find_package(CUDA)
set_package_properties(
@@ -83,12 +85,15 @@ else()
message(FATAL_ERROR "Apache Arrow not found, please check your settings.")
endif()

get_property(PARQUETCPP_INCLUDE_DIRS TARGET Apache::ParquetCpp PROPERTY INTERFACE_INCLUDE_DIRECTORIES)

include_directories(
"${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cub"
"${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/moderngpu/src"
"${CUDA_INCLUDE_DIRS}"
"${ARROW_INCLUDEDIR}"
"${PARQUETCPP_INCLUDE_DIRS}"
)

IF(CUDA_VERSION_MAJOR GREATER 7)
@@ -119,6 +124,19 @@ if(HT_LEGACY_ALLOCATOR)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHT_LEGACY_ALLOCATOR)
endif()

cuda_add_library(gdf-parquet
src/parquet/api.cpp
src/parquet/column_reader.cu
src/parquet/file_reader.cpp
src/parquet/file_reader_contents.cpp
src/parquet/page_reader.cpp
src/parquet/row_group_reader_contents.cpp
src/parquet/decoder/cu_level_decoder.cu
src/arrow/cu_decoder.cu
src/arrow/util/pinned_allocator.cu
)

target_link_libraries(gdf-parquet Apache::ParquetCpp)

cuda_add_library(gdf SHARED
src/binaryops.cu
@@ -198,5 +216,10 @@ if(GTEST_FOUND)
else()
message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.")
endif()

if(GDF_BENCHMARK)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/bench)
endif()

# Print the project summary
feature_summary(WHAT ALL INCLUDE_QUIET_PACKAGES FATAL_ON_MISSING_REQUIRED_PACKAGES)
3 changes: 2 additions & 1 deletion cmake/Modules/ConfigureArrow.cmake
@@ -1,6 +1,7 @@
#=============================================================================
# Copyright 2018 BlazingDB, Inc.
# Copyright 2018 Percy Camilo Triveño Aucahuasi <[email protected]>
# Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,7 +16,7 @@
# limitations under the License.
#=============================================================================

set(ARROW_DOWNLOAD_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/arrow-download/)
set(ARROW_DOWNLOAD_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/arrow-download)

# Download and unpack arrow at configure time
configure_file(${CMAKE_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake ${ARROW_DOWNLOAD_BINARY_DIR}/CMakeLists.txt COPYONLY)
89 changes: 89 additions & 0 deletions cmake/Modules/ConfigureParquetCpp.cmake
@@ -0,0 +1,89 @@
#=============================================================================
# Copyright 2018 BlazingDB, Inc.
# Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================

# Download and unpack ParquetCpp at configure time
configure_file(${CMAKE_SOURCE_DIR}/cmake/Templates/ParquetCpp.CMakeLists.txt.cmake ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download/CMakeLists.txt)

execute_process(
COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download/
)

if(result)
message(FATAL_ERROR "CMake step for ParquetCpp failed: ${result}")
endif()

# Transitive dependencies
set(ARROW_TRANSITIVE_DEPENDENCIES_PREFIX ${ARROW_DOWNLOAD_BINARY_DIR}/arrow-prefix/src/arrow-build)
set(BROTLI_TRANSITIVE_DEPENDENCY_PREFIX ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu)
set(BROTLI_STATIC_LIB_ENC ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlienc.a)
set(BROTLI_STATIC_LIB_DEC ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlidec.a)
set(BROTLI_STATIC_LIB_COMMON ${BROTLI_TRANSITIVE_DEPENDENCY_PREFIX}/libbrotlicommon.a)
set(SNAPPY_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/snappy_ep/src/snappy_ep-install/lib/libsnappy.a)
set(ZLIB_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/zlib_ep/src/zlib_ep-install/lib/libz.a)
set(LZ4_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/lz4_ep-prefix/src/lz4_ep/lib/liblz4.a)
set(ZSTD_STATIC_LIB ${ARROW_TRANSITIVE_DEPENDENCIES_PREFIX}/zstd_ep-prefix/src/zstd_ep/lib/libzstd.a)
set(ARROW_HOME ${ARROW_ROOT})

set(ENV{BROTLI_STATIC_LIB_ENC} ${BROTLI_STATIC_LIB_ENC})
set(ENV{BROTLI_STATIC_LIB_DEC} ${BROTLI_STATIC_LIB_DEC})
set(ENV{BROTLI_STATIC_LIB_COMMON} ${BROTLI_STATIC_LIB_COMMON})
set(ENV{SNAPPY_STATIC_LIB} ${SNAPPY_STATIC_LIB})
set(ENV{ZLIB_STATIC_LIB} ${ZLIB_STATIC_LIB})
set(ENV{LZ4_STATIC_LIB} ${LZ4_STATIC_LIB})
set(ENV{ZSTD_STATIC_LIB} ${ZSTD_STATIC_LIB})
set(ENV{ARROW_HOME} ${ARROW_HOME})

execute_process(
COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-download)

if(result)
message(FATAL_ERROR "Build step for ParquetCpp failed: ${result}")
endif()

# Add transitive dependency: Thrift
set(THRIFT_ROOT ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build/thrift_ep/src/thrift_ep-install)

# Locate ParquetCpp package
set(PARQUETCPP_ROOT ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install)
set(PARQUETCPP_BINARY_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build)
set(PARQUETCPP_SOURCE_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-src)

# Dependency interfaces
find_package(Boost REQUIRED COMPONENTS regex)

add_library(Apache::Thrift INTERFACE IMPORTED)
set_target_properties(Apache::Thrift
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${THRIFT_ROOT}/include)
set_target_properties(Apache::Thrift
PROPERTIES INTERFACE_LINK_LIBRARIES ${THRIFT_ROOT}/lib/libthrift.a)

add_library(Apache::Arrow INTERFACE IMPORTED)
set_target_properties(Apache::Arrow
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ARROW_ROOT}/include)
set_target_properties(Apache::Arrow
PROPERTIES INTERFACE_LINK_LIBRARIES "${ARROW_ROOT}/lib/libarrow.a;${BROTLI_STATIC_LIB_ENC};${BROTLI_STATIC_LIB_DEC};${BROTLI_STATIC_LIB_COMMON};${SNAPPY_STATIC_LIB};${ZLIB_STATIC_LIB};${LZ4_STATIC_LIB};${ZSTD_STATIC_LIB}")

add_library(Apache::ParquetCpp INTERFACE IMPORTED)
set_target_properties(Apache::ParquetCpp
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
"${PARQUETCPP_ROOT}/include;${PARQUETCPP_BINARY_DIR}/src;${PARQUETCPP_SOURCE_DIR}/src")
set_target_properties(Apache::ParquetCpp
PROPERTIES INTERFACE_LINK_LIBRARIES "${PARQUETCPP_ROOT}/lib/libparquet.a;Apache::Arrow;Apache::Thrift;Boost::regex")
14 changes: 5 additions & 9 deletions cmake/Templates/Arrow.CMakeLists.txt.cmake
@@ -1,6 +1,7 @@
#=============================================================================
# Copyright 2018 BlazingDB, Inc.
# Copyright 2018 Percy Camilo Triveño Aucahuasi <[email protected]>
# Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -23,7 +24,7 @@ project(arrow-download NONE)

include(ExternalProject)

set(ARROW_VERSION "apache-arrow-0.10.0")
set(ARROW_VERSION "apache-arrow-0.9.0")

if (NOT "$ENV{PARQUET_ARROW_VERSION}" STREQUAL "")
set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}")
@@ -34,24 +35,19 @@ message(STATUS "Using Apache Arrow version: ${ARROW_VERSION}")
set(ARROW_URL "https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz")

set(ARROW_CMAKE_ARGS
#Arrow dependencies
-DARROW_WITH_LZ4=OFF
-DARROW_WITH_ZSTD=OFF
-DARROW_WITH_BROTLI=OFF
-DARROW_WITH_SNAPPY=OFF
-DARROW_WITH_ZLIB=OFF

#Build settings
-DARROW_BUILD_STATIC=ON
-DARROW_BUILD_SHARED=OFF
-DARROW_BOOST_USE_SHARED=ON
-DARROW_BUILD_TESTS=OFF
-DARROW_TEST_MEMCHECK=OFF
-DARROW_BUILD_BENCHMARKS=OFF
-DARROW_BUILD_UTILITIES=OFF
-DARROW_JEMALLOC=OFF

#Arrow modules
-DARROW_IPC=ON
-DARROW_COMPUTE=OFF
-DARROW_COMPUTE=ON
-DARROW_GPU=OFF
-DARROW_JEMALLOC=OFF
-DARROW_BOOST_VENDORED=OFF
44 changes: 44 additions & 0 deletions cmake/Templates/ParquetCpp.CMakeLists.txt.cmake
@@ -0,0 +1,44 @@
#=============================================================================
# Copyright 2018 BlazingDB, Inc.
# Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================

cmake_minimum_required(VERSION 2.8.12)

project(parquetcpp-download NONE)

include(ExternalProject)

set(PARQUET_VERSION apache-parquet-cpp-1.4.0)

if (NOT "$ENV{PARQUET_VERSION}" STREQUAL "")
set(PARQUET_VERSION $ENV{PARQUET_VERSION})
endif()

message(STATUS "Using Apache ParquetCpp version: ${PARQUET_VERSION}")

ExternalProject_Add(parquetcpp
BINARY_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-build"
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=RELEASE
-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install
-DPARQUET_ARROW_LINKAGE=static
-DPARQUET_BUILD_SHARED=OFF
-DPARQUET_BUILD_TESTS=OFF
GIT_REPOSITORY https://github.com/apache/parquet-cpp.git
GIT_TAG apache-parquet-cpp-1.4.0
INSTALL_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-install"
SOURCE_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/thirdparty/parquetcpp-src"
)
2 changes: 2 additions & 0 deletions conda_environments/dev_py35.yml
@@ -24,4 +24,6 @@ dependencies:
- llvmlite=0.18.0=py35_0
- numba=0.34.0.dev=np112py35_316
- cmake=3.6.3=0
- flex=2.6.0
- bison=3.0.4
- pyarrow=0.10.0
2 changes: 2 additions & 0 deletions include/gdf/cffi/types.h
@@ -48,6 +48,8 @@ typedef enum {
GDF_INVALID_API_CALL, /**< The arguments passed into the function were invalid */
GDF_JOIN_DTYPE_MISMATCH, /**< Datatype mismatch between corresponding columns in left/right tables in the Join function */
GDF_JOIN_TOO_MANY_COLUMNS, /**< Too many columns were passed in for the requested join operation*/

GDF_IO_ERROR, /**< Error occurred in the parquet-reader API that loads a parquet file into gdf_columns */
Review comment (Member):
Hmm, IO_ERROR seems generic enough of a name to apply to more than the parquet reader. Suggest either narrowing the name or broadening the comment.

GDF_DTYPE_MISMATCH, /**< Type mismatch between columns that should be the same type */
GDF_UNSUPPORTED_METHOD, /**< The method requested to perform an operation was invalid or unsupported (e.g., hash vs. sort)*/
GDF_INVALID_AGGREGATOR, /**< Invalid aggregator was specified for a groupby*/
80 changes: 80 additions & 0 deletions include/gdf/parquet/api.h
@@ -0,0 +1,80 @@
/*
* Copyright 2018 BlazingDB, Inc.
* Copyright 2018 Cristhian Alberto Gonzales Castillo <[email protected]>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <gdf/gdf.h>

#ifdef __cplusplus
#define BEGIN_NAMESPACE_GDF_PARQUET \
namespace gdf { \
namespace parquet {
#define END_NAMESPACE_GDF_PARQUET \
} \
}
#else
#define BEGIN_NAMESPACE_GDF_PARQUET
#define END_NAMESPACE_GDF_PARQUET
#endif

BEGIN_NAMESPACE_GDF_PARQUET

/// \brief Read parquet file from file path into array of gdf columns
/// \param[in] filename path to parquet file
/// \param[in] columns will be read from the file
/// \param[out] out_gdf_columns array
/// \param[out] out_gdf_columns_length number of columns
extern "C" gdf_error read_parquet(const char *const filename,
const char *const *const columns,
gdf_column **const out_gdf_columns,
size_t *const out_gdf_columns_length);

END_NAMESPACE_GDF_PARQUET

#ifdef __cplusplus

#include <string>
#include <vector>
#include <arrow/io/file.h>

namespace gdf {
Review comment (Member):
Why do you use the BEGIN_NAMESPACE_GDF_PARQUET macro above, but not here?

namespace parquet {

/// \brief Read parquet file from file path into array of gdf columns
/// \param[in] filename path to parquet file
/// \param[in] row_group_indices indices of the row groups that will be read from the file
/// \param[in] column_indices indices of the columns that will be read from the file
/// \param[out] out_gdf_columns vector of gdf_column pointers. The data read.
gdf_error
read_parquet_by_ids(const std::string & filename,
Review comment (Member):
What does "by_ids" signify? It's not explained in the comment.

const std::vector<std::size_t> &row_group_indices,
const std::vector<std::size_t> &column_indices,
std::vector<gdf_column *> & out_gdf_columns);

/// \brief Read parquet file from file interface into array of gdf columns
/// \param[in] file file interface to the parquet data
/// \param[in] row_group_indices indices of the row groups that will be read from the file
/// \param[in] column_indices indices of the columns that will be read from the file
/// \param[out] out_gdf_columns vector of gdf_column pointers. The data read.
gdf_error
read_parquet_by_ids(std::shared_ptr<::arrow::io::RandomAccessFile> file,
const std::vector<std::size_t> &row_group_indices,
const std::vector<std::size_t> &column_indices,
std::vector<gdf_column *> & out_gdf_columns);

} // namespace parquet
} // namespace gdf

#endif