cudnn frontend v1.2.1 release notes. (#69)

[Bug Fix] cudnn-frontend pip wheels will now dlopen the fully versioned library name (`libcudnn.so.8` or `libcudnn.so.9`) first, before trying to load `libcudnn.so`. This means the cuDNN pip wheels found via the RUN_PATH will be prioritized over system paths (the default behavior of dlopen). This can be overridden by setting `LD_LIBRARY_PATH`. Source installation will now automatically look for cuDNN in site-packages before the system path. [Documentation] Fixed the google-colab links in the jupyter notebooks. [Documentation] Added a jupyter notebook sample, `00_introduction.ipynb`, that goes over the basics of the cudnn FE graph API.
NVIDIA · Mar 20, 2024 · e5fb0ed · e5fb0ed
1 parent b780db8
commit e5fb0ed
Show file tree

Hide file tree

Showing 18 changed files with 347 additions and 55 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.17)
 
-project(cudnn_frontend VERSION 1.2.0)
+project(cudnn_frontend VERSION 1.2.1)
 
 option(CUDNN_FRONTEND_SKIP_NLOHMANN_JSON "Defines whether FE should not include nlohmann/json.hpp." OFF)
 option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)
@@ -30,19 +30,11 @@ target_include_directories(
 # Find the cuda compiler
 find_package(CUDAToolkit)
 
-# Find cudnn
-include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cuDNN.cmake)
-
 target_link_libraries(
     cudnn_frontend INTERFACE
 
     CUDA::cudart
     CUDA::nvrtc
-
-    # cuDNN dlopen's its libraries
-    # Add all libraries in link line as NEEDED
-    # This forces the executable itself to find all cudnn sublibraries initially
-    CUDNN::cudnn_all
 )
 
 target_compile_features(cudnn_frontend INTERFACE cxx_std_17)
@@ -68,4 +60,4 @@ include(GNUInstallDirs)
 install(
     DIRECTORY ${PROJECT_SOURCE_DIR}/include/
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
+)
diff --git a/README.FE.1.0.md b/README.FE.1.0.md
@@ -32,18 +32,18 @@ FE v1.0 API follows a functional style of building a graph. Operations take in i
 | Purpose                 | C++ API                                                   | Python API   |
 | ---                     | ---                                                       | ---          |
 | Create tensor           | tensor                                                    | tensor       |
-| Convolution Fprop       | conv_fprop <br>Conv_fprop_attributes                      | conv_fprop   |
-| Convolution Dgrad       | conv_dgrad <br>Conv_dgrad_attributes                      | conv_dgrad   |
-| Convolution Wgrad       | conv_wgrad <br>Conv_wgrad_attributes                      | conv_wgrad   |
-| Matrix Multiplication   | matmul <br> Matmul_attributes                             | matmul       |
-| Pointwise Operations    | pointwise <br> Pointwise_attributes                       | - add<br>- bias<br>- rqsrt<br>- sub<br>- mul<br>- scale<br>- relu<br>- elu<br>- gelu<br>- cmp_gt       |
-| Batch Normalization     | batchnorm <br>Batchnorm_attributes                        | batchnorm    |
-| Batch Norm bprop        | batchnorm_backward <br>Batchnorm_backward_attributes      | batchnorm_backward    |
+| [Convolution Fprop](docs/operations/Convolutions.md)       | conv_fprop <br>Conv_fprop_attributes                      | conv_fprop   |
+| [Convolution Dgrad](docs/operations/Convolutions.md)       | conv_dgrad <br>Conv_dgrad_attributes                      | conv_dgrad   |
+| [Convolution Wgrad](docs/operations/Convolutions.md)       | conv_wgrad <br>Conv_wgrad_attributes                      | conv_wgrad   |
+| [Matrix Multiplication](docs/operations/Matmul.md)   | matmul <br> Matmul_attributes                             | matmul       |
+| [Pointwise Operations](docs/operations/Pointwise.md)    | pointwise <br> Pointwise_attributes                       | - add<br>- bias<br>- rqsrt<br>- sub<br>- mul<br>- scale<br>- relu<br>- elu<br>- gelu<br>- cmp_gt       |
+| [Batch Normalization](docs/operations/Normalizations.md)     | batchnorm <br>Batchnorm_attributes                        | batchnorm    |
+| [Batch Norm bprop](docs/operations/Normalizations.md)        | batchnorm_backward <br>Batchnorm_backward_attributes      | batchnorm_backward    |
 | Generate stats of output| genstats <br>Genstats_attributes                          | genstats     |
 | BN Finalize of stats    | bn_finalize <br>BN_finalize_attributes                    | bn_finalize  |
 | Dbn weight              | dbn_weight <br>DBN_weight_attributes                      | dbn_weight   |
-| Scale dot product attention | sdpa<br> SDPA_attributes | sdpa |
-| Scale dot product attention backward | sdpa_backward<br> SDPA_backward_attributes | sdpa_backward |
+| [Scale dot product attention](docs/operations/Attention.md) | sdpa<br> SDPA_attributes | sdpa |
+| [Scale dot product attention backward](docs/operations/Attention.md) | sdpa_backward<br> SDPA_backward_attributes | sdpa_backward |
 
 ### Create Graph
 Instantiate an object of class `cudnn_frontend::graph::Graph` which will house tensors and operations.  

diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ While there are two entry points to the graph API (i.e. backend and frontend), i
 
 Also, for those using backend API, FE API source and samples can serve as reference implementation.
 
-FE v1.0 API extends the groundwork of earlier versions and introduces a new set of APIs to further simplify the workflow. In FE v1.0 API, users can describe multiple operations that form subgraph through a persistent cudnn_frontend::graph::Graph object. Unlike the FE v0.x API, users dont need to worry about specifying shapes and sizes of the intermediate virtual tensors. For detailed information of FE v1.0 API, see README.FE.v1.0.md. 
+In FE v1.0 API, users can describe multiple operations that form subgraph through a persistent `cudnn_frontend::graph::Graph` object. Unlike the FE v0.x API, users dont need to worry about specifying shapes and sizes of the intermediate virtual tensors. FE v1.0 API extends the groundwork of earlier versions and introduces a new set of APIs to further simplify the workflow.  For detailed information of FE v1.0 API, see [README.FE.v1.0.md](README.FE.1.0.md). 
 
 Additionally, FE v1.0 API provides python bindings to all API through pybind11. It is recommended that new users of cuDNN start with the frontend v1.0 API. See `samples/cpp` and `samples/python` for more details on its usage.
 
@@ -40,7 +40,7 @@ To run the python samples, additionally, you will need the following python pack
 ### Python API
 
 #### Source installation:
-Install FE python API by running in an virtual env:
+Install FE python API by running:
 ```
 pip install git+https://github.com/NVIDIA/cudnn-frontend.git
 ```

diff --git a/cmake/cuDNN.cmake b/cmake/cuDNN.cmake
@@ -2,7 +2,7 @@ add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
 
 find_path(
     CUDNN_INCLUDE_DIR cudnn.h
-    HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${CUDAToolkit_INCLUDE_DIRS}
+    HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS}
     PATH_SUFFIXES include
     REQUIRED
 )
@@ -14,7 +14,7 @@ string(REGEX MATCH "[1-9]+" CUDNN_MAJOR_VERSION "${macrodef}")
 function(find_cudnn_library NAME)
     find_library(
         ${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}"
-        HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${CUDAToolkit_LIBRARY_DIR}
+        HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR}
         PATH_SUFFIXES lib64 lib/x64 lib
         REQUIRED
     )

diff --git a/include/cudnn_frontend.h b/include/cudnn_frontend.h
@@ -125,7 +125,7 @@
 
 #define CUDNN_FRONTEND_MAJOR_VERSION 1
 #define CUDNN_FRONTEND_MINOR_VERSION 2
-#define CUDNN_FRONTEND_PATCH_VERSION 0
+#define CUDNN_FRONTEND_PATCH_VERSION 1
 #define CUDNN_FRONTEND_VERSION \
     ((CUDNN_FRONTEND_MAJOR_VERSION * 10000) + (CUDNN_FRONTEND_MINOR_VERSION * 100) + CUDNN_FRONTEND_PATCH_VERSION)
 

diff --git a/include/cudnn_frontend_shim.h b/include/cudnn_frontend_shim.h
@@ -32,28 +32,13 @@
 namespace cudnn_frontend {
 
 #if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING
+
+// cudnn package initialization set this global handle
+extern void *cudnn_dlhandle;
+
 inline void *
 get_symbol(const char *function_name) {
-    static std::mutex cudnn_fe_lib_mutex;
-    std::lock_guard<std::mutex> lock(cudnn_fe_lib_mutex);
-    char *c                = NULL;
-    c                      = dlerror();
-    static void *dl_handle = dlopen("libcudnn.so", RTLD_NOW);
-    c                      = dlerror();
-    (void)c;
-    if (dl_handle == nullptr) {
-        // Fall back major version name
-        dl_handle = dlopen("libcudnn.so.9", RTLD_NOW);
-        if (dl_handle == nullptr) {
-            dl_handle = dlopen("libcudnn.so.8", RTLD_NOW);
-            if (dl_handle == nullptr) {
-                std::string error_msg = std::string("Unable to dlopen libcudnn.so.[8/9]") + std::string(c);
-                throw std::runtime_error(error_msg.c_str());
-            }
-        }
-    }
-
-    void *ret = dlsym(dl_handle, function_name);
+    void *ret = dlsym(cudnn_dlhandle, function_name);
     return ret;
 }
 
@@ -304,4 +289,4 @@ inline cudnnStatus_t
 destroy_filter(cudnnFilterDescriptor_t filter) {
     NV_FE_CALL_TO_BACKEND(destroy_filter, cudnnDestroyFilterDescriptor, filter);
 }
-}  // namespace cudnn_frontend
+}  // namespace cudnn_frontend
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
@@ -12,8 +12,12 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(dlpack)
 
+# Find python
 find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
 
+# Find cudnn
+include(${CMAKE_SOURCE_DIR}/cmake/cuDNN.cmake)
+
 option(CUDNN_FRONTEND_FETCH_PYBINDS_IN_CMAKE "Whether cmake build system should fetch pybinds." ON)
 if(CUDNN_FRONTEND_FETCH_PYBINDS_IN_CMAKE)
     FetchContent_Declare(
@@ -48,6 +52,7 @@ target_compile_features(_compiled_module PRIVATE cxx_std_17)
 target_include_directories(
     _compiled_module
     PRIVATE $<TARGET_PROPERTY:cudnn_frontend,INTERFACE_INCLUDE_DIRECTORIES>
+    PRIVATE $<TARGET_PROPERTY:CUDNN::cudnn_all,INTERFACE_INCLUDE_DIRECTORIES>
 )
 
 target_compile_definitions(_compiled_module PRIVATE NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)

diff --git a/python/cudnn/__init__.py b/python/cudnn/__init__.py
@@ -17,7 +17,7 @@
 
 from .datatypes import (_library_type, _is_torch_tensor)
 
-__version__ = '1.2.0'
+__version__ = '1.2.1'
 
 def _tensor(
     self,
@@ -125,4 +125,35 @@ def _execute_plan_at_index(
     self._execute_plan_at_index(uid_to_tensor_pointer, workspace_pointer, index, handle)
 
 pygraph.execute = _execute
-pygraph.execute_plan_at_index = _execute_plan_at_index
+pygraph.execute_plan_at_index = _execute_plan_at_index
+
+def _dlopen_cudnn():
+
+    # The default library name that should be dlopened
+    # In case a FW uses a particular cudnn major version, this variable is overridden later.
+    lib_name = 'libcudnn.so'
+
+    # try to get major version from torch
+    # more FWs can be added as and when needed
+    try:
+        import torch
+        if torch.backends.cudnn.is_available():
+            cudnn_version = torch.backends.cudnn.version()
+            cudnn_major_version = str(cudnn_version)[0]
+            lib_name = 'libcudnn.so.' + cudnn_major_version
+    except ImportError:
+            pass
+
+    # dlopen the library and set the dlhandle inside compiled module
+    try:
+        import ctypes
+        lib = ctypes.CDLL(lib_name)
+        handle = ctypes.cast(lib._handle, ctypes.c_void_p).value
+    except OSError as e:
+        raise Exception(f"Error loading the shared library: {e}")
+    except Exception as e:
+        raise Exception(f"An unexpected error occurred: {e}")
+
+    _compiled_module._set_dlhandle_cudnn(handle)
+
+_dlopen_cudnn()
diff --git a/python/pycudnn.cpp b/python/pycudnn.cpp
@@ -10,6 +10,9 @@ namespace py = pybind11;
 using namespace pybind11::literals;
 
 namespace cudnn_frontend {
+
+void *cudnn_dlhandle = nullptr;
+
 namespace python_bindings {
 
 // Raise C++ exceptions corresponding to C++ FE error codes.
@@ -58,12 +61,19 @@ init_pygraph_submodule(py::module_ &);
 void
 init_properties(py::module_ &);
 
+void
+set_dlhandle_cudnn(std::intptr_t dlhandle) {
+    cudnn_dlhandle = reinterpret_cast<void *>(dlhandle);
+}
+
 PYBIND11_MODULE(_compiled_module, m) {
     m.def("backend_version", &cudnn_frontend::get_backend_version);
 
     init_properties(m);
     init_pygraph_submodule(m);
+
+    m.def("_set_dlhandle_cudnn", &set_dlhandle_cudnn);
 }
 
 }  // namespace python_bindings
-}  // namespace cudnn_frontend
+}  // namespace cudnn_frontend
diff --git a/requirements.txt b/requirements.txt
@@ -2,5 +2,4 @@ jupyter
 numpy
 pybind11[global]
 pytest
-pytest-xdist
-torch
+pytest-xdist
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
@@ -14,6 +14,9 @@ if(NOT Catch2_FOUND)
     FetchContent_MakeAvailable(Catch2)
 endif()
 
+# Find cudnn
+include(${CMAKE_SOURCE_DIR}/cmake/cuDNN.cmake)
+
 add_executable(
     samples
 
@@ -72,6 +75,8 @@ target_link_libraries(
 
     cudnn_frontend
     Catch2::Catch2WithMain
+
+    CUDNN::cudnn_all
 )
 
 # cuDNN dlopen's its libraries